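# update.py -- archive the current Zhihu hot list (https://www.zhihu.com/hot)
# as a dated Markdown page under docs/, then rebuild the per-month index
# pages and the site-wide table of contents.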
import datetime
import os
import sys
from shutil import copyfile

import requests
from bs4 import BeautifulSoup

base_path = "docs"


def fetcher():
    """Fetch the Zhihu hot list and parse it into a list of entry dicts."""
    # A mobile user agent keeps the response as server-rendered HTML that
    # can be parsed without executing JavaScript.
    res = requests.get("https://www.zhihu.com/hot", headers={
        "authority": "www.zhihu.com",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Mobile Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    }, timeout=30)
    if res.status_code != 200:
        print("Failed to fetch html.")
        sys.exit(1)
    soup = BeautifulSoup(res.text, 'html.parser')
    main_tag = soup.find('main')
    if main_tag is None:
        print("Failed to find main tag.")
        sys.exit(1)
    # The hot list itself is the first <div> child of <main>.
    list_tag = None
    for child in main_tag.children:
        if child.name == 'div':
            list_tag = child
            break
    if list_tag is None:
        print("Failed to find list tag.")
        sys.exit(1)
    results = []
    for item in list_tag.children:
        try:
            # Each entry is an <a> tag; skip text nodes and separators.
            if item.name != "a":
                continue
            link = item.attrs['href']
            texts = []
            is_second_div = False
            # The second <div> inside an entry holds the title, an optional
            # description, and the hotness text, in that order.
            for tag in item.children:
                if tag.name == 'div':
                    if is_second_div:
                        for sub_tag in tag.children:
                            if sub_tag.name in ['div', 'h1']:
                                texts.append(sub_tag.text)
                    else:
                        is_second_div = True
            if len(texts) == 2:
                # Only title and hotness were found: no description.
                result = {
                    "link": link,
                    "title": texts[0],
                    "description": "",
                    "hot": texts[1]
                }
            else:
                result = {
                    "link": link,
                    "title": texts[0],
                    "description": texts[1],
                    "hot": texts[2]
                }
        except Exception as e:
            # Record a placeholder entry so one malformed item does not
            # abort the whole run.
            result = {
                "link": "",
                "title": "Error",
                "description": str(e),
                "hot": ""
            }
            print(e)
        results.append(result)
    return results
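
# Each dict returned by fetcher() looks like this (illustrative placeholders,
# not real data):
#
#   {"link": "https://www.zhihu.com/question/...", "title": "<question title>",
#    "description": "<excerpt, may be empty>", "hot": "<hotness text>"}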


def write_content(data, title, time, path):
    """Write one day's hot list to a Markdown page at `path`."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(f"# {title}\n\n")
        f.write(f"抓取于:`{time}`\n\n")  # "Fetched at: `<time>`"
        for i, item in enumerate(data):
            link = item['link']
            item_title = item['title']  # local name avoids shadowing the page title
            description = item['description']
            hot = item['hot']
            f.write(f"### {i + 1}. {item_title}\n")
            f.write(f"{hot} 链接:[{link}]({link})\n\n")  # "<hotness> Link: [...](...)"
            f.write(f"{description}\n\n")


def build_toc():
    """Rebuild the site-wide table of contents (docs/toc.md)."""
    with open(os.path.join(base_path, 'toc.md'), 'w', encoding='utf-8') as f:
        f.write("* [介绍](/)\n")  # "Introduction"
        # Only year-month directories contribute chapters. Sort them
        # numerically: months are not zero-padded, so a plain string sort
        # would put e.g. "2022-10" before "2022-2". This assumes every
        # directory under docs/ is named "<year>-<month>", as created
        # by main().
        chapters = [p for p in os.listdir(base_path)
                    if os.path.isdir(os.path.join(base_path, p))]
        chapters.sort(key=lambda p: tuple(int(x) for x in p.split('-')))
        for path in chapters:
            full_path = os.path.join(base_path, path)
            year, month = path.split('-')
            f.write(f"* [{year} 年 {month} 月]({path}/)\n")  # "<year> <month>"
            sub_files = os.listdir(full_path)
            sub_files.sort()  # day files are zero-padded, so this is chronological
            for file in sub_files:
                if file == 'README.md':
                    continue
                day = file.split('.')[0]
                f.write(f"  * [{month} 月 {day} 日]({path}/{file})\n")  # "<month>/<day>"


def update_chapter(chapter_str):
    """Rewrite the README.md index for one year-month chapter directory."""
    with open(os.path.join(base_path, chapter_str, "README.md"), 'w', encoding='utf-8') as f:
        year, month = chapter_str.split('-')
        f.write(f"# {year} 年 {month} 月\n\n")  # "<year> <month>"
        paths = os.listdir(os.path.join(base_path, chapter_str))
        paths.sort()
        for path in paths:
            if path == 'README.md':
                continue
            day = path.split('.')[0]
            # "Zhihu hot list archive for <month>/<day>"
            f.write(f"+ [{month} 月 {day} 日的知乎热榜存档](/{chapter_str}/{day})\n")


def main():
    data = fetcher()
    now = datetime.datetime.now()
    year, month, day = now.year, now.month, now.day
    time_str = now.strftime('%Y-%m-%d %H:%M:%S')
    chapter_str = f"{year}-{month}"
    page_str = f"{day:02}.md"
    # "Zhihu hot list archive for <year>/<month>/<day>"
    title = f"{year} 年 {month} 月 {day} 日的知乎热榜存档"
    os.makedirs(os.path.join(base_path, chapter_str), exist_ok=True)
    file_path = os.path.join(base_path, chapter_str, page_str)
    write_content(data, title, time_str, file_path)
    update_chapter(chapter_str)
    # Keep the site landing page in sync with the repository README.
    copyfile("./README.md", "./docs/README.md")
    build_toc()


if __name__ == '__main__':
    main()
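
# Usage: run `python update.py` from the repository root. It assumes docs/
# and README.md already exist; in the upstream project a scheduled job
# (e.g. a CI cron) presumably invokes it once a day to grow the archive.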