From 1a7e9e4565865fd01481703582f6f4b7ed3ded02 Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Sun, 28 Aug 2022 19:05:06 +0200 Subject: [PATCH] Fix OG metadata processing --- app/incoming_activities.py | 11 +++++++---- app/utils/opengraph.py | 17 +++++++++++++++-- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/app/incoming_activities.py b/app/incoming_activities.py index e803ad9..b1135cc 100644 --- a/app/incoming_activities.py +++ b/app/incoming_activities.py @@ -112,10 +112,13 @@ async def process_next_incoming_activity( if next_activity.ap_object and next_activity.sent_by_ap_actor_id: try: async with db_session.begin_nested(): - await save_to_inbox( - db_session, - next_activity.ap_object, - next_activity.sent_by_ap_actor_id, + await asyncio.wait_for( + save_to_inbox( + db_session, + next_activity.ap_object, + next_activity.sent_by_ap_actor_id, + ), + timeout=60, ) except httpx.TimeoutException as exc: url = exc._request.url if exc._request else None diff --git a/app/utils/opengraph.py b/app/utils/opengraph.py index 426c6a0..00571e7 100644 --- a/app/utils/opengraph.py +++ b/app/utils/opengraph.py @@ -1,3 +1,4 @@ +import asyncio import mimetypes import re from typing import Any @@ -36,7 +37,7 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None: # FIXME some page have no raw = { "url": url, - "title": soup.find("title").text, + "title": soup.find("title").text.strip(), "image": None, "description": None, "site_name": urlparse(url).hostname, @@ -124,9 +125,21 @@ async def og_meta_from_note( ) -> list[dict[str, Any]]: og_meta = [] urls = await external_urls(db_session, ro) + logger.debug(f"Lookig OG metadata in {urls=}") for url in urls: + logger.debug(f"Processing {url}") try: - maybe_og_meta = await _og_meta_from_url(url) + maybe_og_meta = None + try: + maybe_og_meta = await asyncio.wait_for( + _og_meta_from_url(url), + timeout=5, + ) + except asyncio.TimeoutError: + logger.info(f"Timing out fetching {url}") + except Exception: + logger.exception(f"Failed scrap OG meta for {url}") + if maybe_og_meta: og_meta.append(maybe_og_meta.dict()) except httpx.HTTPError: