-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtwitter_scraper.py
121 lines (111 loc) · 5.62 KB
/
twitter_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
import datetime
import sys
import time

import requests
class TwitterScraper:
    """Print tweets matching a search term using a guest token.

    The scraper activates a guest token, then pages through the
    ``adaptive.json`` search endpoint, printing one line per tweet until a
    page comes back empty, no further cursor is found, or too many
    consecutive requests fail.
    """

    # Base headers for search requests. Each instance works on its own copy
    # (made in ``__init__``) so adding the guest token does not leak into
    # other instances through this shared class-level dict.
    API_HEADERS = {
        'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
        'x-csrf-token': '0'
    }
    GUEST_TOKEN_URL = "https://api.twitter.com/1.1/guest/activate.json"
    SEARCH_URL = 'https://twitter.com/i/api/2/search/adaptive.json'
    # Seconds to wait for any single HTTP call; without a timeout a stalled
    # connection would block the scraper indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, hashtag):
        """Store the search term and take a private copy of the headers.

        Args:
            hashtag: Text sent as the ``q`` search parameter.
        """
        self.hashtag = hashtag
        self.API_HEADERS = dict(type(self).API_HEADERS)

    def set_params(self, scroll_value):
        """Build ``self.params`` for the next search request.

        Args:
            scroll_value: Pagination cursor sent as the ``cursor`` parameter;
                the first page is requested with an empty string.
        """
        self.params = (
            ('include_profile_interstitial_type', '1'),
            ('include_blocking', '1'),
            ('include_blocked_by', '1'),
            ('include_followed_by', '1'),
            ('include_want_retweets', '1'),
            ('include_mute_edge', '1'),
            ('include_can_dm', '1'),
            ('include_can_media_tag', '1'),
            ('skip_status', '1'),
            ('cards_platform', 'Web-12'),
            ('include_cards', '1'),
            ('include_ext_alt_text', 'true'),
            ('include_quote_count', 'true'),
            ('include_reply_count', '1'),
            ('tweet_mode', 'extended'),
            ('include_entities', 'true'),
            ('include_user_entities', 'true'),
            ('include_ext_media_color', 'true'),
            ('include_ext_media_availability', 'true'),
            ('send_error_codes', 'true'),
            ('simple_quoted_tweet', 'true'),
            ('q', self.hashtag),
            ('tweet_search_mode', 'live'),
            ('count', '20'),
            ('query_source', 'recent_search_click'),
            ('cursor', scroll_value),
            ('pc', '1'),
            ('spelling_corrections', '1'),
            ('ext', 'mediaStats,highlightedLabel'),
        )

    # Thanks Todd Birchard for his awesome json extraction method
    # https://hackersandslackers.com/extract-data-from-complex-json-python/
    def json_extract(self, obj, key):
        """Collect every non-container value stored under ``key``.

        Args:
            obj: Decoded JSON (nested dicts and lists); anything else yields
                an empty result.
            key: Dictionary key to look for at any depth.

        Returns:
            A list of matching values in depth-first order. Values that are
            themselves dicts or lists are descended into, not returned.
        """
        found = []

        def walk(node):
            if isinstance(node, dict):
                for name, value in node.items():
                    if isinstance(value, (dict, list)):
                        walk(value)
                    elif name == key:
                        found.append(value)
            elif isinstance(node, list):
                for item in node:
                    walk(item)

        walk(obj)
        return found

    def get_xguest_token(self):
        """Activate and return a fresh guest token.

        Returns:
            The ``guest_token`` field of the activation response.

        Raises:
            requests.RequestException: If the HTTP call fails or times out.
            ValueError: If the response body is not valid JSON.
            KeyError: If the response carries no ``guest_token``.
        """
        guest_token_header = {
            # Same bearer token as the search headers; reuse it instead of
            # keeping a second copy of the literal.
            'authorization': self.API_HEADERS['authorization'],
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        }
        r = requests.post(self.GUEST_TOKEN_URL, headers=guest_token_header,
                          timeout=self.REQUEST_TIMEOUT)
        return r.json()["guest_token"]

    @staticmethod
    def _format_date(created_at):
        """Convert a tweet ``created_at`` string to ``YYYY-MM-DD``."""
        parsed = datetime.datetime.strptime(created_at, '%a %b %d %H:%M:%S +0000 %Y')
        return parsed.strftime('%Y-%m-%d')

    def _next_cursor(self, payload):
        """Return the cursor for the next page, or ``None`` if there is none.

        The cursor is taken as the second ``value`` entry found anywhere in
        the payload, as the original implementation did.
        NOTE(review): presumably the first entry is the refresh cursor and
        the second the scroll cursor; confirm against a live response.
        """
        values = self.json_extract(payload, 'value')
        return values[1] if len(values) > 1 else None

    def _print_tweets(self, tweets, users):
        """Print one line per tweet, skipping entries with missing fields."""
        for tweet in tweets.values():
            try:
                user_id = tweet["user_id"]
                screen_name = users[str(user_id)]["screen_name"]
                tweet_date = self._format_date(tweet["created_at"])
                full_text = tweet["full_text"]
            except (KeyError, TypeError, ValueError):
                # A single malformed tweet should not discard the whole page.
                continue
            # NOTE(review): the value after "Tweet:" is the user id, kept
            # as in the original output; confirm whether the tweet id was
            # intended.
            print("Account Name:", screen_name, "Tweet:", user_id,
                  full_text, "Tweet Date:", tweet_date)

    def list_reviews(self, max_retries=5, retry_delay=10):
        """Fetch and print all matching tweets, page by page.

        Args:
            max_retries: Number of consecutive failed page requests after
                which the scraper gives up instead of retrying forever.
            retry_delay: Seconds to sleep before retrying a failed page.

        Raises:
            requests.RequestException, ValueError, KeyError: If the initial
                guest token cannot be obtained.
        """
        print('---------------------------------Tweets Start----------------------------------------------')
        self.set_params("")
        # A guest token lets us search without a Twitter account; it is
        # sent along with every search request.
        self.API_HEADERS["x-guest-token"] = self.get_xguest_token()
        recoverable = (requests.RequestException, ValueError, KeyError, TypeError)
        failures = 0
        while True:
            try:
                response = requests.get(self.SEARCH_URL, headers=self.API_HEADERS,
                                        params=self.params, timeout=self.REQUEST_TIMEOUT)
                # Decode once; tweets and their authors both live under
                # the "globalObjects" key.
                payload = response.json()
                tweets = payload["globalObjects"]["tweets"]
                users = payload["globalObjects"]["users"]
            except recoverable as err:
                failures += 1
                print("Search request failed:", repr(err),
                      "(attempt", failures, "of", str(max_retries) + ")", file=sys.stderr)
                if failures >= max_retries:
                    break
                # Wait, then retry the same page with a new guest token.
                time.sleep(retry_delay)
                try:
                    self.API_HEADERS["x-guest-token"] = self.get_xguest_token()
                except recoverable as token_err:
                    print("Could not refresh guest token:", repr(token_err), file=sys.stderr)
                continue
            failures = 0
            if not tweets:
                # An empty page means the results are exhausted.
                break
            self._print_tweets(tweets, users)
            cursor = self._next_cursor(payload)
            if cursor is None:
                # Without a cursor the same page would be requested forever.
                break
            self.set_params(cursor)
        print('---------------------------------Tweets End-------------------------------------------------')
if __name__ == '__main__':
    # Command-line entry point: read the mandatory --hashtag option and
    # print every tweet the scraper finds for it.
    cli = argparse.ArgumentParser(description='Search for a hashtag')
    cli.add_argument(
        '--hashtag',
        required=True,
        help='Please enter a hashtag that starts with # symbol',
    )
    TwitterScraper(cli.parse_args().hashtag).list_reviews()