##### How to use the Script #######
#
#   nohup python crawl.py {keyword} {days} &
#
# ### OUTDATED
# Bug : A headless browser, namely PhantomJS (which has no GUI and runs in the
# background), cannot scroll back beyond 10 days or so, whereas with the Firefox
# webdriver (which opens a Firefox GUI) we can go back as far as we like;
# it is currently tested for 25 days.
#
# ### UPDATE : 15-12-2019
# Due to Twitter's limit, only about 10 days' worth of scrolling can be done.
# Verify the output by reading the data back from the saved file; there may be
# errors due to encodings.
#####################################
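# A concrete invocation (the keyword "bitcoin" and the 7-day window here are
# only examples):
#
#   nohup python crawl.py bitcoin 7 &
#
# This appends one JSON object per tweet to bitcoin-Data.log and writes its
# runtime log to Runtime-TwitterCrawler-bitcoin.log.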
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import sys
import time
import dateparser
from datetime import datetime
from bs4 import BeautifulSoup as BS
import json
import logging


class Sel():

    def __init__(self):
        self.driver = webdriver.Firefox()
        # self.driver = webdriver.PhantomJS()
        # self.driver.set_window_size(1120, 550)
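        # A possible replacement for PhantomJS (a sketch, assuming Selenium 3.8+
        # with geckodriver): run Firefox itself without a GUI via FirefoxOptions.
        # options = webdriver.FirefoxOptions()
        # options.headless = True
        # self.driver = webdriver.Firefox(options=options)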
        # self.driver.implicitly_wait(30)  # commented out: explicit waits are used instead
        self.base_url = "https://twitter.com"
        self.verificationErrors = []
        self.accept_next_alert = True
        logging.info("Initialization Completed")

    def __del__(self):
        self.driver.close()
        self.driver.quit()
        logging.info("Cleaned and Deleted the Webdriver Instance")

    def crawlScroll(self, keyword, days, outputFileName):
        retrievalDays = int(days)
        url = "/search?q=" + keyword + "&src=typd"
        self.driver.get(self.base_url + url)
        wait = WebDriverWait(self.driver, 30)  # 30-second timeout for the Ajax results to load
        currentTime = datetime.now()
        loopingContinue = True
        tweetMap = {}  # container for the tweet ids; used to check for unique ids after each scroll
        logging.info("Starting to Scroll ... ")
        pageEndCheck = 0
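        # How the end of the results is detected: every pass re-parses the whole
        # page, so a tweet seen for the first time gets count 1 in tweetMap and
        # is written out, while tweets already seen on earlier passes are only
        # incremented. A pass that produces no count-1 entries means the scroll
        # returned nothing new; after 20 consecutive such passes the crawl stops.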
        while loopingContinue:
            html = self.driver.page_source
            soup = BS(html, "lxml")
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            orderedList = soup.find_all("ol", {"class": "stream-items js-navigable-stream"})
            for tweetOL in orderedList:
                tweetList = tweetOL.find_all('li', {"class": "js-stream-item stream-item stream-item"})
                if len(tweetList) == 0:
                    print("No Data for this keyword !!!")
                    return
                for tweetElement in tweetList:
                    tweetData = {}  # container for the data of one tweet, renewed for every tweet
                    innerDivElementList = tweetElement.find_all('div')
                    for innerDiv in innerDivElementList:
                        # The tweet's own div carries the data-* attributes;
                        # BeautifulSoup returns them as str, so no re-encoding is needed.
                        if innerDiv.has_attr('data-name'):
                            tweetId = innerDiv.get('data-tweet-id')
                            tweetData['Tweet Id'] = tweetId
                            if tweetId in tweetMap:
                                tweetMap[tweetId] += 1
                            else:
                                tweetMap[tweetId] = 1
                            tweetData['User Screen Name'] = innerDiv.get('data-name')
                            tweetData['User Login Name'] = innerDiv.get('data-screen-name')
                            tweetData['User Id'] = innerDiv.get('data-user-id')
                            if innerDiv.has_attr('data-mentions'):
                                tweetData['Users Referenced'] = innerDiv.get('data-mentions')
                            else:
                                tweetData['Users Referenced'] = ' '
                        divClass = ' '.join(i for i in innerDiv.get('class', []) if i.strip() != '')
                        if divClass == 'js-tweet-text-container':
                            tweetData['Tweet'] = innerDiv.get_text().strip()
                        if divClass == 'ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt':
                            if len(innerDiv.get_text().strip().split()) > 2:
                                tweetData['Retweets'] = innerDiv.get_text().strip().split()[1]
                            else:
                                tweetData['Retweets'] = '0'
                        if divClass == 'ProfileTweet-action ProfileTweet-action--favorite js-toggleState':
                            if len(innerDiv.get_text().strip().split()) > 2:
                                tweetData['Likes'] = innerDiv.get_text().strip().split()[1]
                            else:
                                tweetData['Likes'] = '0'
                    innerAElementList = tweetElement.find_all('a')
                    for innerA in innerAElementList:
                        if innerA.has_attr('title') and innerA.has_attr('class'):
                            if ' '.join(i for i in innerA['class'] if i.strip() != '') == 'tweet-timestamp js-permalink js-nav js-tooltip':
                                # The title looks like "2:39 PM - 15 Dec 2019"; reorder it
                                # to "date, time" so dateparser reads it unambiguously.
                                timeStamp = ','.join([innerA.get('title').split('-')[1].strip(), innerA.get('title').split('-')[0].strip()])
                                tweetData['Time Stamp'] = dateparser.parse(timeStamp).strftime("%Y-%m-%d %H:%M:%S")
                                if (currentTime - dateparser.parse(timeStamp)).days > retrievalDays:
                                    loopingContinue = False
                    if tweetData and tweetData.get('Tweet Id') in tweetMap:
                        if tweetMap[tweetData['Tweet Id']] == 1:
                            # First sighting of this tweet id: append it to the
                            # output file as one JSON object per line.
                            data = {str(k): str(v) for k, v in tweetData.items()}
                            with open(outputFileName, 'a') as f:
                                json.dump(data, f)
                                logging.info("Data : %s", json.dumps(data))
                                f.write('\n')
            if any(v == 1 for v in tweetMap.values()):
                pageEndCheck = 0
            elif pageEndCheck == 20:
                loopingContinue = False
                logging.info("No more results on the Search Page....")
                logging.info("Terminating the Scrolling.")
                return
            else:
                logging.info("The scroll returned the same page")
                pageEndCheck += 1
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                # Wait until the page's own jQuery reports no Ajax request in
                # flight, then give the new results a moment to render.
                wait.until(lambda driver: self.driver.execute_script("return jQuery.active == 0"))
                time.sleep(5)
            except Exception as e:
                logging.exception("Error: Timed out waiting for the AJAX response: %s", e)
                logging.info("Reloading the complete search page, then restarting the crawling")
                self.driver.get(self.base_url + url)
                logging.info("Page reloaded, beginning to scroll")
                time.sleep(10)
        return
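
# A small helper sketch for the check suggested in the header comment: read the
# saved file back and confirm that every line parses as JSON. The name
# verifyOutput is illustrative and not part of the original script.
def verifyOutput(outputFileName):
    tweets = []
    with open(outputFileName) as f:
        for lineNumber, line in enumerate(f, 1):
            try:
                tweets.append(json.loads(line))
            except ValueError:  # json.JSONDecodeError is a subclass of ValueError
                logging.warning("Line %d of %s is not valid JSON", lineNumber, outputFileName)
    return tweets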

def main(argv):
    keyword = argv[0]
    days = int(argv[1])
    outputFileName = keyword + '-Data.log'
    selenium = Sel()
    selenium.crawlScroll(keyword, days, outputFileName)

if __name__ == "__main__":
    loggingFileName = 'Runtime-TwitterCrawler-' + str(sys.argv[1]) + '.log'
    logging.basicConfig(level=logging.INFO, filename=loggingFileName,
                        format='[%(levelname)s] (%(threadName)-10s) %(asctime)s : %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    main(sys.argv[1:])
    logging.info("Exiting the code")