worksheet_collector.py
from parser import make_soup
from scrapers.scraper import get_url


def get_last_pagination_num(url):
    """Return the number of the last page in the pagination links."""
    soup = make_soup(url)
    # grab all of the pagination <a> elements and take the last one in the list
    return int(soup.select('a[title*="Page "]')[-1].string)


def get_item_urls(content, selector, limit=0):
    """Collect the unique hrefs matching `selector` across all paginated pages.

    A positive `limit` caps how many pages are visited.
    """
    item_urls = set()
    last_page_num = get_last_pagination_num(get_url(content=content))
    if limit > 0:
        last_page_num = limit
    # loop through all the available pages, starting at the first pagination URL
    for i in range(1, last_page_num + 1):
        # TODO: debugging code
        print(f'page {i} of {last_page_num}')
        soup = make_soup(get_url(content=content, query_string=f'?page={i}'))
        # grab all links matching the caller's selector (e.g. worksheet links)
        for link in soup.select(selector):
            item_urls.add(link.attrs['href'])
        # TODO: debugging code
        print(item_urls)
    return item_urls
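

# --- Sketch of the imported helpers and a usage example (assumptions) -------
# make_soup and get_url come from modules not shown here. The versions below
# are a minimal sketch inferred purely from the call sites above, assuming
# requests + BeautifulSoup and a single hard-coded base URL; the project's
# real parser.make_soup and scrapers.scraper.get_url may differ.
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://example.com'  # assumption: placeholder for the scraped site


def make_soup(url):
    """Fetch a page and parse its HTML into a BeautifulSoup tree (sketch)."""
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def get_url(content, query_string=''):
    """Build a listing URL, e.g. https://example.com/worksheets?page=2 (sketch)."""
    return f'{BASE_URL}/{content}{query_string}'


# Hypothetical usage: the 'worksheets' path and the CSS selector are
# illustrative examples, not taken from the original project.
if __name__ == '__main__':
    urls = get_item_urls('worksheets', 'a[data-type="worksheet"]', limit=2)
    print(f'collected {len(urls)} unique item URLs')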