from typing import List
- from .base_spider import BaseArticleSpider
- from .config import Config
from loguru import logger
import asyncio
import hashlib

+ from .base_spider import BaseArticleSpider
+ from .config import Config
+

class Executor(object):
    def __init__(self, config: Config):
        self.config = config
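For readers skimming the diff, here is a minimal sketch of how this class fits together once the hunks are applied: one spider per configured source, each wrapped in its own asyncio task and awaited together. The `spider_classes` helper and the way spiders are built from `Config` are assumptions for illustration; only `run`, `run_spider`, and the `gather` fan-out are actually visible in this diff.

```python
import asyncio
from typing import List

from .base_spider import BaseArticleSpider
from .config import Config


class Executor(object):

    def __init__(self, config: Config):
        self.config = config
        # Assumption: one spider instance per configured source;
        # the real construction is outside this diff.
        self.spiders: List[BaseArticleSpider] = [
            cls(config) for cls in spider_classes(config)  # hypothetical helper
        ]

    async def run(self):
        # Fan out: one task per spider, awaited together so one slow
        # source does not serialize the others.
        spider_tasks: List[asyncio.Task] = []
        for spider in self.spiders:
            spider_tasks.append(asyncio.create_task(self.run_spider(spider)))
        await asyncio.gather(*spider_tasks)

    async def run_spider(self, spider: BaseArticleSpider):
        # Each spider loads its local index before any remote work.
        await spider.load_index()
```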
@@ -22,55 +23,6 @@ async def run(self):
            spider_tasks.append(task)
        await asyncio.gather(*spider_tasks)

- # # fetch remote article preview via spiders
- # total_remote_articles = await asyncio.gather(*[spider.fetch_remote_preview_articles() for spider in self.spiders])
-
- # fetch_tasks: List[asyncio.Task] = []
- # for spider, remote_articles in zip(self.spiders, total_remote_articles):
-
- #     logger.info("{} spider fetch {} remote articles", spider.source, len(remote_articles))
-
- #     # load local article preview
- #     await spider.load_index()
- #     logger.info("{} spider load {} local articles", spider.source, len(spider.article_preview_local))
-
- #     # compare remote and local
- #     fresh_articles = [article for article in remote_articles if article.hash not in spider.article_preview_local]
- #     logger.info("{} spider find {} fresh articles", spider.source, len(fresh_articles))
-
- #     # fetch full article content
- #     spider.article_preview = fresh_articles
- #     fetch_task = asyncio.create_task(spider.fetch_remote_full_articles(fresh_articles))
- #     fetch_tasks.append(fetch_task)
-
- # full_articles_result: List[asyncio.Task] = await asyncio.gather(*fetch_tasks)
-
- # fetch_img_tasks = []
- # for spider, remote_imgs in zip(self.spiders, full_articles_result):
- #     remote_imgs = [img for img_list in remote_imgs for img in img_list]
- #     # unique img
- #     remote_imgs: List[str] = list(set(remote_imgs))
- #     logger.info("{} spider fetch {} images", spider.source, len(remote_imgs))
- #     img_sem = asyncio.Semaphore(10)
- #     for img_url in remote_imgs:
- #         img_name = hashlib.sha256(f"{img_url}".encode()).hexdigest() + "." + img_url.split(".")[-1]
- #         # download img
- #         task = asyncio.create_task(
- #             spider.parallel_fetch_source(
- #                 img_url,
- #                 callback=spider.fetch_and_store_img,
- #                 is_bytes=True,
- #                 ignore_exception=True,
- #                 img_name=img_name,
- #                 fresh_sem=img_sem
- #             )
- #         )
- #         fetch_img_tasks.append(task)
-
- # await asyncio.gather(*fetch_img_tasks)
-
-
-

    async def run_spider(self, spider: BaseArticleSpider):
        await spider.load_index()
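The block deleted above was a commented-out incremental pipeline: diff remote article previews against the local index by hash, fetch full content only for fresh articles, then download de-duplicated images behind an `asyncio.Semaphore(10)`. A self-contained sketch of that bounded-download step follows; `fetch_bytes` and `store_img` are hypothetical stand-ins for the spider's `parallel_fetch_source` / `fetch_and_store_img` callbacks:

```python
import asyncio
import hashlib
from typing import Awaitable, Callable, Iterable


async def download_images(
    img_urls: Iterable[str],
    fetch_bytes: Callable[[str], Awaitable[bytes]],
    store_img: Callable[[str, bytes], Awaitable[None]],
    max_concurrency: int = 10,
) -> None:
    # Mirrors the deleted asyncio.Semaphore(10): at most ten downloads in flight.
    sem = asyncio.Semaphore(max_concurrency)

    async def fetch_one(url: str) -> None:
        # Content-addressed file name, same scheme as the deleted code:
        # sha256 of the URL plus the original file extension.
        name = hashlib.sha256(url.encode()).hexdigest() + "." + url.rsplit(".", 1)[-1]
        async with sem:
            try:
                data = await fetch_bytes(url)
            except Exception:
                # The deleted code passed ignore_exception=True: a failed
                # image download should not abort the whole crawl.
                return
        await store_img(name, data)

    # set() drops duplicate URLs, like list(set(remote_imgs)) above.
    await asyncio.gather(*(fetch_one(u) for u in set(img_urls)))
```

Holding the semaphore only around the network fetch, not the local store, keeps the concurrency cap on the downloads themselves, which matches how the deleted code threaded `fresh_sem=img_sem` into its fetch call.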