Commit 3386ac4

fix base_spider && enhance
- base_spider.py: fix an OOM issue caused by empty strings produced by `img_re.findall()`; check for an empty string before replacing.
- base_spider.py: add image URL validation to prevent potential bugs.
- base_spider.py: change the synchronous `open()` in `save_item()` to `anyio.open_file()`.
- spiders: set a timeout on the aiohttp client.
- optimize image downloading.
- add `__main__.py` to the Python package.
1 parent 5b169a9 commit 3386ac4

File tree

6 files changed: +48 −33


pyproject.toml

+1 −2

@@ -9,7 +9,7 @@ dependencies = [
     "loguru>=0.7.2",
     "parsel>=1.9.1",
     "markdownify>=0.13.1",
-    "aiofiles>=24.1.0",
+    "anyio>=4.4.0",
 ]
 readme = "README.md"
 requires-python = ">= 3.8"
@@ -33,4 +33,3 @@ allow-direct-references = true
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/black_fish"]
-

src/black_fish/__main__.py

+4
@@ -0,0 +1,4 @@
+from . import main
+
+if __name__ == "__main__":
+    main()
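
With `__main__.py` added, the package can also be invoked as a module. Assuming it is importable under the name `black_fish` (as `pyproject.toml` suggests), this is equivalent to calling `main()` directly:

    python -m black_fish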

src/black_fish/base_spider.py

+29 −4

@@ -7,6 +7,7 @@
 import asyncio
 import re
 import os
+import anyio
 import json
 from loguru import logger
 
@@ -89,8 +90,9 @@ async def save_item(self, content: Union[str, bytes], file_name: str, is_img: bo
             save_to_item_path = f"{save_to_item_path}/img"
         os.makedirs(save_to_item_path, exist_ok=True)
 
-        with open(f"{save_to_item_path}/{file_name}", "w" if isinstance(content, str) else "wb") as f:
-            f.write(content)
+        async with await anyio.open_file(f"{save_to_item_path}/{file_name}",
+                                         mode="w" if isinstance(content, str) else "wb") as f:
+            await f.write(content)
 
     async def save_index(self):
         index_path = f"{self.save_diretory}/{self.source}/index"
@@ -163,13 +165,32 @@ async def parse_and_store_article(self, html_content, preview_article: ArticlePr
         remote_imgs = img_re.findall(article_content_markdown)
 
         for url in remote_imgs:
+            if url == '': continue
             local_img_url = "img/" + hashlib.sha256(f"{url}".encode()).hexdigest() + "." + url.split(".")[-1]
-            article_content_markdown = article_content_markdown.replace(url, local_img_url)
+            article_content_markdown = article_content_markdown.replace(url, local_img_url, 1)
 
         title = preview_article.title
         await self.save_item(article_content_markdown, f"{title}.md")
 
-        return remote_imgs
+        img_sem = asyncio.Semaphore(5)
+        fetch_img_tasks = []
+        for img_url in remote_imgs:
+            img_url = img_url.split('#')[0]
+            if not self.img_url_verify(img_url): continue
+            img_name = hashlib.sha256(f"{img_url}".encode()).hexdigest() + "." + img_url.split(".")[-1]
+            task = asyncio.create_task(
+                self.parallel_fetch_source(
+                    img_url,
+                    callback=self.fetch_and_store_img,
+                    is_bytes=True,
+                    fresh_sem=img_sem,
+                    # pass img_name to callback
+                    img_name=img_name,
+                    ignore_exception=True
+                )
+            )
+            fetch_img_tasks.append(task)
+        await asyncio.gather(*fetch_img_tasks)
 
     async def parallel_fetch_source(self, uri: str, callback = None, ignore_exception = False, fresh_sem: Union[None, Semaphore] = None, is_bytes=False, **kwargs):
         sem = fresh_sem if fresh_sem is not None else self.fetch_limit_sem
@@ -196,3 +217,7 @@ async def fetch_and_store_img(self, img: bytes, img_name: str):
 
     async def close(self):
         await self.client.close()
+
+    @abstractmethod
+    def img_url_verify(self, url: str) -> bool:
+        ...
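
For context on the new `if url == '': continue` guard: Python's `str.replace` with an empty search string inserts the replacement between every character, so a single empty match from `img_re.findall()` could balloon `article_content_markdown` until memory ran out. A minimal sketch of that failure mode, with an illustrative replacement value:

    text = "abc"
    # an empty needle matches at every position, so the string explodes in size
    print(text.replace("", "img/hash.png"))
    # -> img/hash.pngaimg/hash.pngbimg/hash.pngcimg/hash.png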

src/black_fish/executor.py

+1 −23

@@ -1,7 +1,6 @@
 from typing import List
 from loguru import logger
 import asyncio
-import hashlib
 
 from .base_spider import BaseArticleSpider
 from .config import Config
@@ -35,28 +34,7 @@ async def run_spider(self, spider: BaseArticleSpider):
         logger.info("{} spider find {} fresh articles", spider.source, len(fresh_articles))
 
         spider.article_preview = fresh_articles
-        full_article_imgs = await spider.fetch_remote_full_articles(fresh_articles)
-
-        remote_imgs = [img for img_list in full_article_imgs for img in img_list]
-        remote_imgs: List[str] = list(set(remote_imgs))
-        logger.info("{} spider fetch {} images", spider.source, len(remote_imgs))
-
-        img_sem = asyncio.Semaphore(10)
-        fetch_img_tasks = []
-        for img_url in remote_imgs:
-            img_name = hashlib.sha256(f"{img_url}".encode()).hexdigest() + "." + img_url.split(".")[-1]
-            task = asyncio.create_task(
-                spider.parallel_fetch_source(
-                    img_url,
-                    callback=spider.fetch_and_store_img,
-                    is_bytes=True,
-                    fresh_sem=img_sem,
-                    # pass img_name to callback
-                    img_name=img_name
-                )
-            )
-            fetch_img_tasks.append(task)
-        await asyncio.gather(*fetch_img_tasks)
+        await spider.fetch_remote_full_articles(fresh_articles)
 
         logger.info("{} spider save index", spider.source)
         await spider.save_index()

src/black_fish/spiders/tttang.py

+1 −1

@@ -6,7 +6,6 @@
 
 from ..base_spider import BaseArticleSpider, ArticlePreview
 
-
 BASE_URL = "https://tttang.com"
 
 class TTTangSpider(BaseArticleSpider):
@@ -18,6 +17,7 @@ def __init__(self):
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
                 "Referer": "https://tttang.com/"
             },
+            timeout=aiohttp.ClientTimeout(8)
         )
 
     async def prepare_for_run(self):
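
A note on the timeout: `aiohttp.ClientTimeout` takes `total` as its first field, so `aiohttp.ClientTimeout(8)` should be equivalent to the explicit form below, capping each whole request at eight seconds:

    timeout = aiohttp.ClientTimeout(total=8)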

src/black_fish/spiders/xz.py

+12 −3

@@ -3,10 +3,11 @@
 from parsel import Selector
 from loguru import logger
 from markdownify import markdownify as md
+from urllib.parse import urlparse
+from os import path
 
 from ..base_spider import BaseArticleSpider, ArticlePreview
 
-
 BASE_URL = "https://xz.aliyun.com"
 
 class XZSpider(BaseArticleSpider):
@@ -21,7 +22,8 @@ def __init__(self):
             cookies={
                 "acw_tc": "1a0c380917221828212171387e0037facf793235036fdff4cea42cedeca93f",
                 "acw_sc__v3": "66a66cab88822a5511a46d6925e14924fbf9a664"
-            }
+            },
+            timeout=aiohttp.ClientTimeout(8)
         )
 
     async def prepare_for_run(self):
@@ -70,4 +72,11 @@ async def parse_preview_articles_on_one_page(self, text):
             # current page and next page have cached
             if article_hash in self.article_preview_local.keys():
                 self.find_local_cache = True
-            self.add_article_preview(title=title, url=url, publish_at=time)
+            self.add_article_preview(title=title, url=url, publish_at=time)
+
+    def img_url_verify(self, url: str) -> bool:
+        url_ = urlparse(url)
+        if url_.scheme not in ("http", "https"): return False
+        if path.splitext(url_.path)[1] not in ['.jpg', '.png', '.gif', '.webp', '.jpeg']:
+            return False
+        return True
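
Illustrative behaviour of the new validator (example URLs, not from the commit; `spider` stands for any `XZSpider` instance):

    spider.img_url_verify("https://example.com/upload/pic.png")  # True
    spider.img_url_verify("data:image/png;base64,AAAA")          # False: scheme is not http(s)
    spider.img_url_verify("https://example.com/pic.svg")         # False: extension not allowed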
