"""Module to scrape Wikipedia's 'vital articles' and store them in JSON files."""
import json
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

STORAGE_LINK = "cache/vital_articles"
CATEGORIES = [
"People",
"History",
"Geography",
"Arts",
"Philosophy and religion",
"Everyday life",
"Society and social sciences",
"Biological and health sciences",
"Physical sciences",
"Technology",
"Mathematics"
]
LEVEL_1_VITAL_ARTICLE_URLS = [f"{STORAGE_LINK}/lvl1_vital_articles.json"]
LEVEL_2_VITAL_ARTICLE_URLS = [f"{STORAGE_LINK}/lvl2_vital_articles.json"]
LEVEL_3_VITAL_ARTICLE_URLS = [f"{STORAGE_LINK}/lvl3_vital_articles.json"]
LEVEL_4_VITAL_ARTICLE_URLS = [
f"{STORAGE_LINK}/lvl4_vital_articles/people.json",
f"{STORAGE_LINK}/lvl4_vital_articles/history.json",
f"{STORAGE_LINK}/lvl4_vital_articles/geography.json",
f"{STORAGE_LINK}/lvl4_vital_articles/arts.json",
f"{STORAGE_LINK}/lvl4_vital_articles/philosophy_and_religion.json",
f"{STORAGE_LINK}/lvl4_vital_articles/everyday_life.json",
f"{STORAGE_LINK}/lvl4_vital_articles/society_and_social_sciences.json",
f"{STORAGE_LINK}/lvl4_vital_articles/biological_and_health_sciences.json",
f"{STORAGE_LINK}/lvl4_vital_articles/physical_sciences.json",
f"{STORAGE_LINK}/lvl4_vital_articles/technology.json",
f"{STORAGE_LINK}/lvl4_vital_articles/mathematics.json"
]
LEVEL_5_VITAL_ARTICLE_URLS = [
f"{STORAGE_LINK}/lvl5_vital_articles/people/writers_and_journalists.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/artists_musicians_and_composers.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/entertainers_directors_producers_and_screenwriters.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/philosophers_historians_and_social_scientists.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/religious_figures.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/politicians_and_leaders.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/military_personnel_revolutionaries_and_activists.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/scientists_inventors_and_mathematicians.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/sports_figures.json",
f"{STORAGE_LINK}/lvl5_vital_articles/people/miscellaneous.json",
f"{STORAGE_LINK}/lvl5_vital_articles/history/history.json",
f"{STORAGE_LINK}/lvl5_vital_articles/geography/physical_geography.json",
f"{STORAGE_LINK}/lvl5_vital_articles/geography/countries_and_subdivisions.json",
f"{STORAGE_LINK}/lvl5_vital_articles/geography/cities.json",
f"{STORAGE_LINK}/lvl5_vital_articles/arts/arts.json",
f"{STORAGE_LINK}/lvl5_vital_articles/philosophy_and_religion/philosophy_and_religion.json",
f"{STORAGE_LINK}/lvl5_vital_articles/everyday_life/everyday_life.json",
f"{STORAGE_LINK}/lvl5_vital_articles/everyday_life/sports_games_and_recreation.json",
f"{STORAGE_LINK}/lvl5_vital_articles/society_and_social_sciences/social_studies.json",
f"{STORAGE_LINK}/lvl5_vital_articles/society_and_social_sciences/politics_and_economics.json",
f"{STORAGE_LINK}/lvl5_vital_articles/society_and_social_sciences/culture.json",
f"{STORAGE_LINK}/lvl5_vital_articles/biological_and_health_sciences/biology_biochemistry_anatomy_and_physiology.json",
f"{STORAGE_LINK}/lvl5_vital_articles/biological_and_health_sciences/animals.json",
f"{STORAGE_LINK}/lvl5_vital_articles/biological_and_health_sciences/plants_fungi_and_other_organisms.json",
f"{STORAGE_LINK}/lvl5_vital_articles/biological_and_health_sciences/health_medicine_and_disease.json",
f"{STORAGE_LINK}/lvl5_vital_articles/physical_sciences/basics_and_measurement.json",
f"{STORAGE_LINK}/lvl5_vital_articles/physical_sciences/astronomy.json",
f"{STORAGE_LINK}/lvl5_vital_articles/physical_sciences/chemistry.json",
f"{STORAGE_LINK}/lvl5_vital_articles/physical_sciences/earth_science.json",
f"{STORAGE_LINK}/lvl5_vital_articles/physical_sciences/physics.json",
f"{STORAGE_LINK}/lvl5_vital_articles/technology/technology.json",
f"{STORAGE_LINK}/lvl5_vital_articles/mathematics/mathematics.json",
]
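# These cache paths mirror the category and subcategory names above (lowercased,
# commas dropped, spaces replaced with underscores). The Level 4 and Level 5 files
# are written by the __main__ block at the bottom of this module; the Level 1-3
# files are expected to already exist in the cache.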
def parse_upper_level_list(url: str, default_level: int) -> Dict[str, int]:
"""Parse a Level 4 subsection of articles.
Parameters
----------
url: str
The URL pointing to the Wikipedia page listing a given
subcategory of Level 4 articles.
Returns
-------
Dict[str, int]
A dictionary containing the title of the article, which points
to the integer value of its level. Level 4 articles and beyond
will also store articles of lower level as they appear.
"""
articles: Dict[str, int] = {}
    # Fetch the page and narrow the soup to the main parser output.
    soup = BeautifulSoup(
        requests.get(url).text, 'html.parser'
    ).find(
        'div', attrs={"id": "mw-content-text"}
    ).find(
        'div', attrs={"class": "mw-parser-output"}
    )
    # Article entries appear in tables, multi-column divs, and top-level ordered lists.
    for table in (soup.find_all('table')
                  + soup.find_all('div', attrs={'class': 'div-col'})
                  + soup.find_all('ol', recursive=False)):
        # Skip collapsible navigation and footer tables.
        if table.has_attr('class') and ("mw-collapsible" in table['class']
                                        or "navbox-subgroup" in table['class']):
            continue
for li in table.find_all('li'):
title: Optional[str] = None
level: int = default_level
for a_tag in li.find_all('a'):
if not a_tag.has_attr('title'): continue
if a_tag.has_attr('class') and "image" in a_tag['class']: continue # skip article-rank icons
if a_tag.has_attr('href') and "#" in a_tag['href']: continue # skip section tags
if "Wikipedia:Vital articles" in a_tag['title']: # article has a different level
level = int(a_tag.text[-1])
continue
if title: break
if "Wikipedia:" in a_tag['title']: break # bad link
title = a_tag['title']
if not title: continue
articles[title] = level
return articles
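# Illustrative usage (hypothetical output; actual titles and levels depend on the
# live page contents):
#   parse_upper_level_list(
#       "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Mathematics", 4)
#   -> {"Some level 3 article": 3, "Some level 4 article": 4, ...}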
def parse_level_four_articles() -> Dict[str, Dict[str, int]]:
"""Parse all level four vital articles, returning each subcategory and its corresponding items."""
lvl4_articles = {
"People": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/People",
"History": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/History",
"Geography": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Geography",
"Arts": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Arts",
"Philosophy and religion": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Philosophy_and_religion",
"Everyday life": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Everyday_life",
"Society and social sciences": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Society_and_social_sciences",
"Biological and health sciences": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Biology_and_health_sciences",
"Physical sciences": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Physical_sciences",
"Technology": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Technology",
"Mathematics": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/4/Mathematics"
}
all_articles: Dict[str, Dict[str, int]] = {}
for group, url in lvl4_articles.items():
print(f"Parsing {group}...")
articles = parse_upper_level_list(url, 4)
all_articles[group] = articles
return all_articles
def parse_level_five_articles() -> Dict[str, Dict[str, Dict[str, int]]]:
"""Parse all level five vital articles, returning each subcategory's subcategory, and its corresponding items."""
lvl5_articles = {
"People": {
"Writers and journalists": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Writers_and_journalists",
"Artists, musicians, and composers": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Artists,_musicians,_and_composers",
"Entertainers, directors, producers, and screenwriters": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Entertainers,_directors,_producers,_and_screenwriters",
"Philosophers, historians, and social scientists": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Philosophers,_historians,_political_and_social_scientists",
"Religious figures": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Religious_figures",
"Politicians and leaders": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Politicians_and_leaders",
"Military personnel, revolutionaries, and activists": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Military_personnel,_revolutionaries,_and_activists",
"Scientists, inventors, and mathematicians": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Scientists,_inventors,_and_mathematicians",
"Sports figures": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Sports_figures",
"Miscellaneous": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/People/Miscellaneous"
},
"History": {
"History": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/History"
},
"Geography": {
"Physical geography": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Geography/Physical",
"Countries and subdivisions": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Geography/Countries",
"Cities": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Geography/Cities"
},
"Arts": {
"Arts": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Arts"
},
"Philosophy and religion": {
"Philosophy and religion": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Philosophy_and_religion"
},
"Everyday life": {
"Everyday life": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Everyday_life",
"Sports, games and recreation": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Everyday_life/Sports,_games_and_recreation"
},
"Society and social sciences": {
"Social studies": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Society_and_social_sciences/Social_studies",
"Politics and economics": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Society_and_social_sciences/Politics_and_economics",
"Culture": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Society_and_social_sciences/Culture"
},
"Biological and health sciences": {
"Biology, biochemistry, anatomy, and physiology": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Biological_and_health_sciences/Biology",
"Animals": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Biological_and_health_sciences/Animals",
"Plants, fungi, and other organisms": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Biological_and_health_sciences/Plants",
"Health, medicine, and disease": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Biological_and_health_sciences/Health"
},
"Physical sciences": {
"Basics and measurement": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Physical_sciences/Basics_and_measurement",
"Astronomy": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Physical_sciences/Astronomy",
"Chemistry": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Physical_sciences/Chemistry",
"Earth science": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Physical_sciences/Earth_science",
"Physics": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Physical_sciences/Physics"
},
"Technology": {
"Technology": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Technology"
},
"Mathematics": {
"Mathematics": "https://en.wikipedia.org/wiki/Wikipedia:Vital_articles/Level/5/Mathematics"
}
}
all_articles: Dict[str, Dict[str, Dict[str, int]]] = {}
for group, subgroups in lvl5_articles.items():
group_article: Dict[str, Dict[str, int]] = {}
for subgroup, url in subgroups.items():
print(f"Parsing {group}/{subgroup}...")
articles: Dict[str, int] = parse_upper_level_list(url, 5)
group_article[subgroup] = articles
all_articles[group] = group_article
return all_articles
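# The result is nested as {category: {subcategory: {article title: level}}}, e.g.
# (hypothetical) {"Mathematics": {"Mathematics": {"Some article": 5, ...}}}.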
def combine_all_articles():
"""Combine all *_vital_articles.json files into one file."""
FILENAME = f"{STORAGE_LINK}/vital_articles.json"
filenames = LEVEL_1_VITAL_ARTICLE_URLS \
+ LEVEL_2_VITAL_ARTICLE_URLS \
+ LEVEL_3_VITAL_ARTICLE_URLS \
+ LEVEL_4_VITAL_ARTICLE_URLS \
+ LEVEL_5_VITAL_ARTICLE_URLS
    articles = {}
    # Read every per-level file; the first occurrence of a title wins, so the
    # lowest (most vital) level is kept because files are read in level order.
    for filename in filenames:
        print(f"Reading {filename}...")
        with open(filename, "r") as infile:
            raw_articles = json.load(infile)
        for title, level in raw_articles.items():
            if title in articles:
                continue
            articles[title] = level
# Write
with open(FILENAME, "w+") as outfile:
json.dump(articles, outfile, indent=4)
def get_vital_articles_at_level(level: int) -> Optional[List[str]]:
"""Retrieve a list of vital articles at a given level."""
if level > 5 or level < 1:
return None
urls_to_parse = []
    if level == 1:  # yes, this is "naive fizz-buzz"-level code, but vital article levels likely won't grow past 5
urls_to_parse = LEVEL_1_VITAL_ARTICLE_URLS
elif level == 2:
urls_to_parse = LEVEL_2_VITAL_ARTICLE_URLS
elif level == 3:
urls_to_parse = LEVEL_3_VITAL_ARTICLE_URLS
elif level == 4:
urls_to_parse = LEVEL_4_VITAL_ARTICLE_URLS
else:
urls_to_parse = LEVEL_5_VITAL_ARTICLE_URLS
article_names = []
for url in urls_to_parse:
with open(url, "r") as infile:
article_names.extend(json.load(infile).keys())
return article_names
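# Illustrative usage (requires the cached JSON files; titles shown are hypothetical):
#   get_vital_articles_at_level(3)  ->  ["Some article", "Another article", ...]
#   get_vital_articles_at_level(6)  ->  None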
def get_vital_articles_by_category(category: str) -> Optional[List[str]]:
"""Given the name of a vital article category, return all level <=5 article titles."""
if category not in CATEGORIES: return None
urls = [url for url in LEVEL_5_VITAL_ARTICLE_URLS if category.lower().replace(' ', '_') in url]
article_titles = []
for url in urls:
with open(url, "r") as infile:
article_titles.extend(json.load(infile).keys())
return article_titles
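# Illustrative usage (hypothetical output):
#   get_vital_articles_by_category("Mathematics")  ->  ["Some article", ...]
#   get_vital_articles_by_category("Not a category")  ->  None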
if __name__ == '__main__':
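    # Note: this assumes the cache/vital_articles/lvl4_vital_articles and
    # lvl5_vital_articles/<group> directories already exist; open(..., "w+")
    # below does not create them.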
lvl4_articles = parse_level_four_articles()
for group, articles in lvl4_articles.items():
filename = f"{STORAGE_LINK}/lvl4_vital_articles/{group.lower().replace(' ', '_')}.json"
with open(filename, "w+") as outfile:
json.dump(articles, outfile, indent=4)
lvl5_articles = parse_level_five_articles()
for group, subgroups in lvl5_articles.items():
for subgroup, articles in subgroups.items():
filename = f"{STORAGE_LINK}/lvl5_vital_articles/{group.lower().replace(' ', '_')}/{subgroup.lower().replace(',', '').replace(' ', '_')}.json"
with open(filename, "w+") as outfile:
json.dump(articles, outfile, indent=4)
combine_all_articles()