
Commit 793a939 (parent: c3eb44a)

Fix OG metadata scraping and improve workers

5 files changed: +37, -7 lines

app/incoming_activities.py (+7, -4)

@@ -3,7 +3,6 @@
 from datetime import datetime
 from datetime import timedelta
 
-import httpx
 from loguru import logger
 from sqlalchemy import func
 from sqlalchemy import select
@@ -108,6 +107,7 @@ async def process_next_incoming_activity(
 
     next_activity.tries = next_activity.tries + 1
     next_activity.last_try = now()
+    await db_session.commit()
 
     if next_activity.ap_object and next_activity.sent_by_ap_actor_id:
         try:
@@ -120,13 +120,16 @@
                 ),
                 timeout=60,
             )
-        except httpx.TimeoutException as exc:
-            url = exc._request.url if exc._request else None
-            logger.error(f"Failed, HTTP timeout when fetching {url}")
+        except asyncio.exceptions.TimeoutError:
+            logger.error("Activity took too long to process")
+            await db_session.rollback()
+            await db_session.refresh(next_activity)
             next_activity.error = traceback.format_exc()
             _set_next_try(next_activity)
         except Exception:
             logger.exception("Failed")
+            await db_session.rollback()
+            await db_session.refresh(next_activity)
             next_activity.error = traceback.format_exc()
             _set_next_try(next_activity)
         else:
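
The new error handling follows a commit-early, rollback-on-failure pattern: the attempt counter and timestamp are committed before processing so a later rollback cannot undo them, and after a failure the session is rolled back and the row refreshed before the traceback is attached to it. The except clause also now catches asyncio's TimeoutError instead of httpx's, matching the 60-second timeout visible in the context lines. Below is a minimal sketch of that pattern (not the project's actual code), assuming a SQLAlchemy AsyncSession and a hypothetical handle() processing step:

import traceback
from datetime import datetime, timezone

from sqlalchemy.ext.asyncio import AsyncSession


async def handle(activity) -> None:
    ...  # stand-in for the real processing logic


async def process_one(db_session: AsyncSession, next_activity) -> None:
    # Record the attempt up front and commit, so the bookkeeping survives
    # whatever happens during processing.
    next_activity.tries += 1
    next_activity.last_try = datetime.now(timezone.utc)
    await db_session.commit()

    try:
        await handle(next_activity)
    except Exception:
        # A failed attempt can leave the session in a dirty or failed state:
        # discard it, re-load the committed row, then store the traceback.
        await db_session.rollback()
        await db_session.refresh(next_activity)
        next_activity.error = traceback.format_exc()
        await db_session.commit()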

app/utils/opengraph.py (+15, -1)

@@ -1,12 +1,15 @@
 import asyncio
 import mimetypes
 import re
+import signal
+from concurrent.futures import TimeoutError
 from typing import Any
 from urllib.parse import urlparse
 
 import httpx
 from bs4 import BeautifulSoup  # type: ignore
 from loguru import logger
+from pebble import concurrent  # type: ignore
 from pydantic import BaseModel
 
 from app import activitypub as ap
@@ -29,7 +32,11 @@ class OpenGraphMeta(BaseModel):
     site_name: str
 
 
+@concurrent.process(timeout=5)
 def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
+    # Prevent SIGTERM to bubble up to the worker
+    signal.signal(signal.SIGTERM, signal.SIG_IGN)
+
     soup = BeautifulSoup(html, "html5lib")
     ogs = {
         og.attrs["property"]: og.attrs.get("content")
@@ -58,6 +65,10 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
     return OpenGraphMeta.parse_obj(raw)
 
 
+def scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
+    return _scrap_og_meta(url, html).result()
+
+
 async def external_urls(
     db_session: AsyncSession,
     ro: ap_object.RemoteObject | OutboxObject | InboxObject,
@@ -126,7 +137,10 @@ async def _og_meta_from_url(url: str) -> OpenGraphMeta | None:
         return None
 
     try:
-        return _scrap_og_meta(url, resp.text)
+        return scrap_og_meta(url, resp.text)
+    except TimeoutError:
+        logger.info(f"Timed out when scraping OG meta for {url}")
+        return None
     except Exception:
         logger.info(f"Failed to scrap OG meta for {url}")
         return None
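
The timeout machinery comes from Pebble: @concurrent.process(timeout=5) runs the decorated function in a separate worker process and returns a future, and calling .result() raises concurrent.futures.TimeoutError if the function is still running after five seconds, which is what the new except TimeoutError branch in _og_meta_from_url catches. A standalone sketch of that behaviour (slow_parse is a made-up stand-in, not project code):

import time
from concurrent.futures import TimeoutError

from pebble import concurrent


@concurrent.process(timeout=5)
def slow_parse(html: str) -> int:
    # Stand-in for a pathological HTML parse that never finishes in time.
    time.sleep(30)
    return len(html)


if __name__ == "__main__":
    future = slow_parse("<html></html>")
    try:
        print(future.result())
    except TimeoutError:
        print("parsing exceeded 5s, giving up")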

app/utils/workers.py (+1, -1)

@@ -69,5 +69,5 @@ async def run_forever(self) -> None:
         logger.info("stopping loop")
 
     async def _shutdown(self, sig: signal.Signals) -> None:
-        logger.info(f"Caught {signal=}")
+        logger.info(f"Caught {sig=}")
         self._stop_event.set()
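
This one-character change fixes a logging bug: with the f-string "=" specifier, {signal=} formatted the imported signal module rather than the signal that was caught, while {sig=} logs the handler's argument. A quick demonstration (not project code):

import signal

sig = signal.SIGTERM
print(f"Caught {signal=}")  # Caught signal=<module 'signal' from '.../signal.py'>
print(f"Caught {sig=}")     # Caught sig=<Signals.SIGTERM: 15>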

poetry.lock (+13, -1)
Generated lockfile; diff not rendered.

pyproject.toml (+1)

@@ -44,6 +44,7 @@ uvicorn = {extras = ["standard"], version = "^0.18.3"}
 Brotli = "^1.0.9"
 greenlet = "^1.1.3"
 mistletoe = "^0.9.0"
+Pebble = "^5.0.2"
 
 [tool.poetry.dev-dependencies]
 black = "^22.3.0"
