-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrix_tools.py
143 lines (116 loc) · 4.63 KB
/
metrix_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
def get_clean_url(link: str):
    """Resolve a Facebook link to the URL advertised by its own page.

    Links containing neither ``facebook.com`` nor ``fb.com`` are returned
    unchanged and no network request is made. For Facebook links the page is
    downloaded and its ``<link href=...>`` tags are scanned for hrefs that
    contain ``www.facebook.com``.

    Args:
        link: The URL to clean.

    Returns:
        The first ``<link>`` href containing ``www.facebook.com``. The
        original ``link`` is returned instead when it is not a Facebook link,
        when the page exposes no such href, or when the first such href
        contains ``login/web``.

    Raises:
        Any exception raised by ``urlopen`` while fetching the page.
    """
    fb_link_formats = ('facebook.com', 'fb.com')
    # Check every known format before giving up: returning on the first
    # non-matching format would leave ``fb.com`` links unresolved.
    if not any(fb_format in link for fb_format in fb_link_formats):
        return link

    # Close the HTTP response as soon as the body has been read.
    with urlopen(link, context=certificate) as response:
        page = BeautifulSoup(response.read(), 'lxml')

    facebook_hrefs = [
        tag['href']
        for tag in page.find_all('link', href=True)
        if 'www.facebook.com' in tag['href']
    ]
    # NOTE(review): a 'login/web' href presumably means the page redirected to
    # a login screen, so it is not a usable clean URL - confirm.
    if not facebook_hrefs or 'login/web' in facebook_hrefs[0]:
        return link
    return facebook_hrefs[0]
def map_ig_shortcode(link: str) -> str:
    """Extract the post shortcode from an Instagram link.

    The query string and fragment (``?igshid=...``, ``#advertiser``, ...) are
    discarded, then the path segment following a media-type marker such as
    ``p`` or ``reel`` is returned. When the path has no such marker, the last
    non-empty path segment is returned.

    Args:
        link: The URL to inspect. Non-string values are tolerated.

    Returns:
        The shortcode as a string, or ``np.nan`` when ``link`` is not a
        string, does not contain ``instagram.com``, or has no path segment
        after the host.
    """
    # Non-string cells (for example NaN) carry no link to parse.
    if not isinstance(link, str) or 'instagram.com' not in link:
        return np.nan

    # Tracking parameters and fragments are never part of the shortcode.
    path = link.split('#', 1)[0].split('?', 1)[0]
    segments = [segment for segment in path.split('/') if segment]

    # Keep only what follows the host, so the scheme and domain can never be
    # mistaken for a shortcode.
    for position, segment in enumerate(segments):
        if 'instagram.com' in segment:
            segments = segments[position + 1:]
            break
    else:
        # 'instagram.com' only appeared inside the query string or fragment.
        return np.nan

    media_markers = ('p', 'reel', 'reels', 'tv')
    for position, segment in enumerate(segments[:-1]):
        if segment in media_markers:
            return segments[position + 1]

    # No marker in the path: fall back to the trailing segment.
    return segments[-1] if segments else np.nan
def _facebook_post_id(link: str) -> str:
    """Return the numeric post id found in a facebook.com link, or ``''``."""
    try:
        return str(int(link.split('/')[-1]))
    except ValueError:
        pass

    if '?dco_ad_id' in link:
        # The id is the path segment right before the ad parameter. A
        # non-numeric segment raises ValueError, which map_post_id turns
        # into NaN.
        return str(int(link.split('?dco_ad_id')[0].split('/')[-1]))

    try:
        # Link with a trailing slash: the id is the second-to-last segment.
        return str(int(link.split('/')[-2]))
    except (ValueError, IndexError):
        pass

    try:
        # Last resort: the segment in front of a '?s=' parameter.
        return str(int(link.split('?s=')[-2].split('/')[-1]))
    except (ValueError, IndexError):
        # No id could be located; an empty string is the established result
        # for this case.
        return ''


def map_post_id(link: str):
    """Extract the platform-specific post id from a social-media link.

    The platform is chosen by substring, tested in this order:
    ``twitter.com``, ``tiktok.com``, ``facebook.com``, ``fb.watch``,
    ``instagram.com`` (only when ``GET_IG_POST_IDS`` is truthy),
    ``youtube.com``, ``youtu.be``. The ``fb.watch`` and Instagram branches
    download the page and search its markup for the id.

    Ids are returned exactly as they appear in the link or page. Trailing
    zeros are preserved.

    Args:
        link: The URL to inspect. Non-string values are tolerated.

    Returns:
        The post id as a string for most platforms, or as an ``int`` for
        Instagram. An empty string is returned for a ``facebook.com`` link
        with no numeric id. ``np.nan`` is returned when the platform is not
        recognised or when anything fails, including network errors, a
        missing pattern in the page, or a non-string ``link``.
    """
    try:
        if 'twitter.com' in link:
            return str(link.split('/')[-1].split('?')[0])

        if 'tiktok.com' in link:
            last_segment = link.split('/')[-1]
            try:
                return str(int(last_segment))
            except ValueError:
                # Not purely numeric: drop the query string instead.
                return str(last_segment.split('?')[0])

        if 'facebook.com' in link:
            return _facebook_post_id(link)

        if 'fb.watch' in link:
            with urlopen(link, context=certificate) as response:
                html = str(BeautifulSoup(response.read(), 'lxml'))
            match = (re.search(r'\"video_id\":\"(.*?)\"', html)
                     or re.search(r'\"v\":\"(.*?)\"', html))
            # A missing match raises AttributeError here, reported as NaN by
            # the handler below.
            video_id = str(match.group(1))
            # Remove only a literal '.0' suffix. str.rstrip('.0') would strip
            # every trailing '0' and '.', truncating ids that end in zero.
            if video_id.endswith('.0'):
                video_id = video_id[:-2]
            return video_id

        if 'instagram.com' in link and GET_IG_POST_IDS:
            shortcode = map_ig_shortcode(link)
            with urlopen(f'https://www.instagram.com/p/{shortcode}/',
                         context=certificate) as response:
                html = str(BeautifulSoup(response.read(), 'lxml'))
            return int(re.search(r"\"media_id\":\"(.*?)\"", html).group(1))

        if 'youtube.com' in link:
            if 'shorts' in link:
                return link.split('/')[-1]
            return link.split('?v=')[-1].split('&v=')[-1]

        if 'youtu.be' in link:
            return link.split('/')[-1].split('?t=')[0]

        return np.nan
    except Exception:
        # Best-effort mapping: any failure yields NaN rather than aborting
        # the caller. KeyboardInterrupt and SystemExit still propagate.
        return np.nan
def get_fb_page_id(link):
    """Download a Facebook page and extract a numeric page id from its markup.

    The markup is searched for ``PageID("<id>-`` first, and for
    ``"pageLoadEventId":"<id>"`` when the first pattern is absent.

    Args:
        link: URL of the page to download.

    Returns:
        The captured id converted to ``int``.

    Raises:
        AttributeError: If neither pattern occurs in the page.
        ValueError: If the captured text is not an integer.
        Exception: Whatever ``urlopen`` raises when the page cannot be fetched.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(link, context=certificate) as response:
        # Serialise the parsed page once and reuse it for both searches.
        html = str(BeautifulSoup(response.read(), 'lxml'))

    match = re.search(r"PageID\(\"(.*?)-", html)
    if match is None:
        # NOTE(review): falls back to the 'pageLoadEventId' value - confirm
        # that this really identifies the page.
        match = re.search(r"\"pageLoadEventId\":\"(.*?)\"", html)
    return int(match.group(1))
def get_ad_id(link):
    """Extract the ``dco_ad_id`` query value from a Facebook link.

    Args:
        link: The URL to inspect. Non-string values are tolerated.

    Returns:
        The ad id as an ``int``. ``np.nan`` is returned when ``link`` is not
        a string, does not contain ``facebook.com`` or ``?dco_ad_id=``, or
        when the value is not an integer.
    """
    marker = '?dco_ad_id='
    if not isinstance(link, str) or 'facebook.com' not in link or marker not in link:
        return np.nan

    # The value ends at the next query parameter, whichever one follows
    # (for example '&dco_ad_token=...').
    raw_value = link.split(marker)[-1].split('&', 1)[0]
    try:
        return int(raw_value)
    except ValueError:
        # Malformed value: report "no ad id" rather than raising.
        return np.nan
def deduplicate_links(dataframe):
    """Drop rows that repeat an id, then rows that repeat a URL.

    For each of the ``ad_id``, ``post_id`` and ``shortcode`` columns, the rows
    kept are those whose value is null or is the first occurrence of that
    value. The three selections are stacked in that column order, and only the
    first row for each ``url`` is retained.

    Args:
        dataframe: A pandas DataFrame with ``ad_id``, ``post_id``,
            ``shortcode`` and ``url`` columns.

    Returns:
        A new DataFrame holding the retained rows with their original index
        labels. The input is not modified.
    """
    # NOTE(review): the selections are unioned, so a row survives if it passes
    # for ANY id column, and a null id always passes. Confirm this is intended
    # rather than requiring every id column to pass.
    selections = []
    for id_type in ('ad_id', 'post_id', 'shortcode'):
        ids = dataframe[id_type]
        selections.append(dataframe[~ids.duplicated() | ids.isnull()])

    # Concatenate once instead of growing a frame inside the loop from an
    # empty seed.
    return pd.concat(selections).drop_duplicates(subset='url')