Commit 3386ac4

fix base_spider && enhance
- base_spider.py: fix an OOM issue caused by empty strings produced by `img_re.findall()`; check for an empty string before replacing.
- base_spider.py: add image URL validation to prevent potential bugs.
- base_spider.py: change the synchronous `open()` in `save_item()` to `anyio.open_file()`.
- spiders: set a timeout on the aiohttp client.
- optimize image downloading.
- add `__main__.py` to the Python package.
1 parent 5b169a9 commit 3386ac4

File tree

6 files changed: +48 −33


pyproject.toml

+1 −2

@@ -9,7 +9,7 @@ dependencies = [
     "loguru>=0.7.2",
     "parsel>=1.9.1",
     "markdownify>=0.13.1",
-    "aiofiles>=24.1.0",
+    "anyio>=4.4.0",
 ]
 readme = "README.md"
 requires-python = ">= 3.8"
@@ -33,4 +33,3 @@ allow-direct-references = true
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/black_fish"]
-

src/black_fish/__main__.py

+4
@@ -0,0 +1,4 @@
+from . import main
+
+if __name__ == "__main__":
+    main()
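
With `__main__.py` added, the package can also be invoked as a module. Assuming it is importable under the name `black_fish` (as `pyproject.toml` suggests), this is equivalent to calling `main()` directly:

    python -m black_fish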

src/black_fish/base_spider.py

+29 −4

@@ -7,6 +7,7 @@
 import asyncio
 import re
 import os
+import anyio
 import json
 from loguru import logger
 
@@ -89,8 +90,9 @@ async def save_item(self, content: Union[str, bytes], file_name: str, is_img: bo
             save_to_item_path = f"{save_to_item_path}/img"
         os.makedirs(save_to_item_path, exist_ok=True)
 
-        with open(f"{save_to_item_path}/{file_name}", "w" if isinstance(content, str) else "wb") as f:
-            f.write(content)
+        async with await anyio.open_file(f"{save_to_item_path}/{file_name}",
+                                         mode="w" if isinstance(content, str) else "wb") as f:
+            await f.write(content)
 
     async def save_index(self):
         index_path = f"{self.save_diretory}/{self.source}/index"
@@ -163,13 +165,32 @@ async def parse_and_store_article(self, html_content, preview_article: ArticlePr
         remote_imgs = img_re.findall(article_content_markdown)
 
         for url in remote_imgs:
+            if url == '': continue
             local_img_url = "img/" + hashlib.sha256(f"{url}".encode()).hexdigest() + "." + url.split(".")[-1]
-            article_content_markdown = article_content_markdown.replace(url, local_img_url)
+            article_content_markdown = article_content_markdown.replace(url, local_img_url, 1)
 
         title = preview_article.title
         await self.save_item(article_content_markdown, f"{title}.md")
 
-        return remote_imgs
+        img_sem = asyncio.Semaphore(5)
+        fetch_img_tasks = []
+        for img_url in remote_imgs:
+            img_url = img_url.split('#')[0]
+            if not self.img_url_verify(img_url): continue
+            img_name = hashlib.sha256(f"{img_url}".encode()).hexdigest() + "." + img_url.split(".")[-1]
+            task = asyncio.create_task(
+                self.parallel_fetch_source(
+                    img_url,
+                    callback=self.fetch_and_store_img,
+                    is_bytes=True,
+                    fresh_sem=img_sem,
+                    # pass img_name to callback
+                    img_name=img_name,
+                    ignore_exception=True
+                )
+            )
+            fetch_img_tasks.append(task)
+        await asyncio.gather(*fetch_img_tasks)
 
     async def parallel_fetch_source(self, uri: str, callback = None, ignore_exception = False, fresh_sem: Union[None, Semaphore] = None, is_bytes=False, **kwargs):
         sem = fresh_sem if fresh_sem is not None else self.fetch_limit_sem
@@ -196,3 +217,7 @@ async def fetch_and_store_img(self, img: bytes, img_name: str):
 
     async def close(self):
         await self.client.close()
+
+    @abstractmethod
+    def img_url_verify(self, url: str) -> bool:
+        ...
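
For context on the new `if url == '': continue` guard: Python's `str.replace` with an empty search string inserts the replacement between every character, so a single empty match from `img_re.findall()` could balloon `article_content_markdown` until memory ran out. A minimal sketch of that failure mode, with an illustrative replacement value:

    text = "abc"
    # an empty needle matches at every position, so the string explodes in size
    print(text.replace("", "img/hash.png"))
    # -> img/hash.pngaimg/hash.pngbimg/hash.pngcimg/hash.png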

src/black_fish/executor.py

+1 −23

@@ -1,7 +1,6 @@
 from typing import List
 from loguru import logger
 import asyncio
-import hashlib
 
 from .base_spider import BaseArticleSpider
 from .config import Config
@@ -35,28 +34,7 @@ async def run_spider(self, spider: BaseArticleSpider):
         logger.info("{} spider find {} fresh articles", spider.source, len(fresh_articles))
 
         spider.article_preview = fresh_articles
-        full_article_imgs = await spider.fetch_remote_full_articles(fresh_articles)
-
-        remote_imgs = [img for img_list in full_article_imgs for img in img_list]
-        remote_imgs: List[str] = list(set(remote_imgs))
-        logger.info("{} spider fetch {} images", spider.source, len(remote_imgs))
-
-        img_sem = asyncio.Semaphore(10)
-        fetch_img_tasks = []
-        for img_url in remote_imgs:
-            img_name = hashlib.sha256(f"{img_url}".encode()).hexdigest() + "." + img_url.split(".")[-1]
-            task = asyncio.create_task(
-                spider.parallel_fetch_source(
-                    img_url,
-                    callback=spider.fetch_and_store_img,
-                    is_bytes=True,
-                    fresh_sem=img_sem,
-                    # pass img_name to callback
-                    img_name=img_name
-                )
-            )
-            fetch_img_tasks.append(task)
-        await asyncio.gather(*fetch_img_tasks)
+        await spider.fetch_remote_full_articles(fresh_articles)
 
         logger.info("{} spider save index", spider.source)
         await spider.save_index()

src/black_fish/spiders/tttang.py

+1 −1

@@ -6,7 +6,6 @@
 
 from ..base_spider import BaseArticleSpider, ArticlePreview
 
-
 BASE_URL = "https://tttang.com"
 
 class TTTangSpider(BaseArticleSpider):
@@ -18,6 +17,7 @@ def __init__(self):
                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
                 "Referer": "https://tttang.com/"
             },
+            timeout=aiohttp.ClientTimeout(8)
         )
 
     async def prepare_for_run(self):
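
A note on the timeout: `aiohttp.ClientTimeout` takes `total` as its first field, so `aiohttp.ClientTimeout(8)` should be equivalent to the explicit form below, capping each whole request at eight seconds:

    timeout = aiohttp.ClientTimeout(total=8)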

src/black_fish/spiders/xz.py

+12 −3

@@ -3,10 +3,11 @@
 from parsel import Selector
 from loguru import logger
 from markdownify import markdownify as md
+from urllib.parse import urlparse
+from os import path
 
 from ..base_spider import BaseArticleSpider, ArticlePreview
 
-
 BASE_URL = "https://xz.aliyun.com"
 
 class XZSpider(BaseArticleSpider):
@@ -21,7 +22,8 @@ def __init__(self):
             cookies={
                 "acw_tc": "1a0c380917221828212171387e0037facf793235036fdff4cea42cedeca93f",
                 "acw_sc__v3": "66a66cab88822a5511a46d6925e14924fbf9a664"
-            }
+            },
+            timeout=aiohttp.ClientTimeout(8)
         )
 
     async def prepare_for_run(self):
@@ -70,4 +72,11 @@ async def parse_preview_articles_on_one_page(self, text):
             # current page and next page have cached
             if article_hash in self.article_preview_local.keys():
                 self.find_local_cache = True
-            self.add_article_preview(title=title, url=url, publish_at=time)
+            self.add_article_preview(title=title, url=url, publish_at=time)
+
+    def img_url_verify(self, url: str) -> bool:
+        url_ = urlparse(url)
+        if url_.scheme not in ("http", "https"): return False
+        if path.splitext(url_.path)[1] not in ['.jpg', '.png', '.gif', '.webp', '.jpeg']:
+            return False
+        return True
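
Illustrative behaviour of the new validator (example URLs, not from the commit; `spider` stands for any `XZSpider` instance):

    spider.img_url_verify("https://example.com/upload/pic.png")  # True
    spider.img_url_verify("data:image/png;base64,AAAA")          # False: scheme is not http(s)
    spider.img_url_verify("https://example.com/pic.svg")         # False: extension not allowed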
