
Commit 5b169a9: cleanup
Parent: c25ded5

10 files changed: +14 -1291 lines

.python-version (-1): file deleted
requirements-dev.lock (-103): file deleted
requirements.lock (-93): file deleted

requirements.txt (+5): new file

@@ -0,0 +1,5 @@
+aiohttp
+anyio
+loguru
+markdownify
+parsel
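With the lock files above deleted, exact version pinning is gone and installation presumably falls back to plain pip against this file, e.g. pip install -r requirements.txt. The requirements*.lock and .python-version names match the files Rye generates, though the commit itself doesn't say which tool produced them.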

src/black_fish/base_spider.py (-4)

@@ -7,7 +7,6 @@
 import asyncio
 import re
 import os
-import aiofiles
 import json
 from loguru import logger

@@ -90,9 +89,6 @@ async def save_item(self, content: Union[str, bytes], file_name: str, is_img: bool
             save_to_item_path = f"{save_to_item_path}/img"
         os.makedirs(save_to_item_path, exist_ok=True)

-        # async with aiofiles.open(f"{save_to_item_path}/{file_name}", "w" if isinstance(content, str) else "wb") as f:
-        #     await f.write(content)
-
         with open(f"{save_to_item_path}/{file_name}", "w" if isinstance(content, str) else "wb") as f:
             f.write(content)
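The aiofiles import and the commented-out async write it supported are removed together; save_item now writes with ordinary blocking open, which is reasonable for small article files. If non-blocking writes are ever wanted back without re-adding the dependency, one option is to offload the write to a worker thread. A sketch only, assuming Python 3.9+ for asyncio.to_thread; this is not part of the commit:

import asyncio

async def save_item(self, content, file_name, is_img=False):
    ...  # path setup as in the real method, producing save_to_item_path

    def _write() -> None:
        # Same mode selection as the synchronous version in the diff.
        mode = "w" if isinstance(content, str) else "wb"
        with open(f"{save_to_item_path}/{file_name}", mode) as f:
            f.write(content)

    # Run the blocking I/O on a worker thread so the event loop stays free.
    await asyncio.to_thread(_write)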

src/black_fish/executor.py (+3 -51)

@@ -1,10 +1,11 @@
 from typing import List
-from .base_spider import BaseArticleSpider
-from .config import Config
 from loguru import logger
 import asyncio
 import hashlib

+from .base_spider import BaseArticleSpider
+from .config import Config
+
 class Executor(object):
     def __init__(self, config: Config):
         self.config = config

@@ -22,55 +23,6 @@ async def run(self):
             spider_tasks.append(task)
         await asyncio.gather(*spider_tasks)

-        # # fetch remote article preview via spiders
-        # total_remote_articles = await asyncio.gather(*[spider.fetch_remote_preview_articles() for spider in self.spiders])
-
-        # fetch_tasks: List[asyncio.Task] = []
-        # for spider, remote_articles in zip(self.spiders, total_remote_articles):
-
-        #     logger.info("{} spider fetch {} remote articles", spider.source, len(remote_articles))
-
-        #     # load local article preview
-        #     await spider.load_index()
-        #     logger.info("{} spider load {} local articles", spider.source, len(spider.article_preview_local))
-
-        #     # compare remote and local
-        #     fresh_articles = [article for article in remote_articles if article.hash not in spider.article_preview_local]
-        #     logger.info("{} spider find {} fresh articles", spider.source, len(fresh_articles))
-
-        #     # fetch full article content
-        #     spider.article_preview = fresh_articles
-        #     fetch_task = asyncio.create_task(spider.fetch_remote_full_articles(fresh_articles))
-        #     fetch_tasks.append(fetch_task)
-
-        # full_articles_result: List[asyncio.Task] = await asyncio.gather(*fetch_tasks)
-
-        # fetch_img_tasks = []
-        # for spider, remote_imgs in zip(self.spiders, full_articles_result):
-        #     remote_imgs = [img for img_list in remote_imgs for img in img_list]
-        #     # unique img
-        #     remote_imgs: List[str] = list(set(remote_imgs))
-        #     logger.info("{} spider fetch {} images", spider.source, len(remote_imgs))
-        #     img_sem = asyncio.Semaphore(10)
-        #     for img_url in remote_imgs:
-        #         img_name = hashlib.sha256(f"{img_url}".encode()).hexdigest() + "." + img_url.split(".")[-1]
-        #         # download img
-        #         task = asyncio.create_task(
-        #             spider.parallel_fetch_source(
-        #                 img_url,
-        #                 callback=spider.fetch_and_store_img,
-        #                 is_bytes=True,
-        #                 ignore_exception=True,
-        #                 img_name=img_name,
-        #                 fresh_sem=img_sem
-        #             )
-        #         )
-        #         fetch_img_tasks.append(task)
-
-        #     await asyncio.gather(*fetch_img_tasks)
-
-
     async def run_spider(self, spider: BaseArticleSpider):

         await spider.load_index()
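The large commented-out block removed here was the old single-pass pipeline: gather remote previews across all spiders, load each local index, keep articles whose hash is new, then fetch full content and images. Per the surviving context, that responsibility now sits in per-spider run_spider tasks, of which only the opening await spider.load_index() is visible. A rough sketch of one such per-spider pass, reconstructed from the deleted comments (all method and attribute names are taken from the old code, not confirmed by this diff):

async def run_spider(self, spider: BaseArticleSpider):
    # Local index first, so we can diff against it.
    await spider.load_index()

    remote_articles = await spider.fetch_remote_preview_articles()
    logger.info("{} spider fetch {} remote articles", spider.source, len(remote_articles))

    # Fresh = remote previews whose hash is absent from the local index.
    fresh = [a for a in remote_articles if a.hash not in spider.article_preview_local]
    logger.info("{} spider find {} fresh articles", spider.source, len(fresh))

    # Download full content only for the fresh articles.
    spider.article_preview = fresh
    await spider.fetch_remote_full_articles(fresh)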

src/black_fish/spiders/tttang.py (+3 -2)

@@ -1,11 +1,12 @@
 from typing import List
-from black_fish.base_spider import BaseArticleSpider, ArticlePreview
 import aiohttp
 from parsel import Selector
 from loguru import logger
-import asyncio
 from markdownify import markdownify as md

+from ..base_spider import BaseArticleSpider, ArticlePreview
+
+
 BASE_URL = "https://tttang.com"

 class TTTangSpider(BaseArticleSpider):

src/black_fish/spiders/xz.py (+3 -2)

@@ -1,11 +1,12 @@
 from typing import List
-from black_fish.base_spider import BaseArticleSpider, ArticlePreview
 import aiohttp
 from parsel import Selector
 from loguru import logger
-import asyncio
 from markdownify import markdownify as md

+from ..base_spider import BaseArticleSpider, ArticlePreview
+
+
 BASE_URL = "https://xz.aliyun.com"

 class XZSpider(BaseArticleSpider):
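Both spider modules get the same two fixes: the unused asyncio import is dropped, and the absolute import of the package's own module becomes a relative one, so these files keep working even if the project isn't installed under the name black_fish. A minimal illustration of the equivalence, assuming the package layout implied by the diff paths (with the usual __init__.py files):

# src/black_fish/spiders/xz.py sits one package below black_fish:
#
#   src/black_fish/base_spider.py
#   src/black_fish/spiders/xz.py
#
# ".." climbs from black_fish.spiders to black_fish, so when the module is
# imported as part of the package these two forms resolve identically:
from ..base_spider import BaseArticleSpider, ArticlePreview
# old absolute form, requiring "black_fish" to be importable by name:
# from black_fish.base_spider import BaseArticleSpider, ArticlePreview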
