From 4f98ff6bbf5a00ab47d9212d4fb7cb50f948aed9 Mon Sep 17 00:00:00 2001 From: Thomas Sileo Date: Mon, 15 Aug 2022 10:27:58 +0200 Subject: [PATCH] Tweak URL parsing --- app/actor.py | 2 +- app/boxes.py | 6 +++--- app/utils/indieauth.py | 2 +- app/utils/opengraph.py | 6 +++--- app/utils/privacy_replace.py | 4 +++- tests/factories.py | 2 +- 6 files changed, 12 insertions(+), 10 deletions(-) diff --git a/app/actor.py b/app/actor.py index 6ebdf3f..9958bc9 100644 --- a/app/actor.py +++ b/app/actor.py @@ -116,7 +116,7 @@ class Actor: @cached_property def server(self) -> str: - return urlparse(self.ap_id).netloc + return urlparse(self.ap_id).hostname # type: ignore class RemoteActor(Actor): diff --git a/app/boxes.py b/app/boxes.py index 078759c..666c275 100644 --- a/app/boxes.py +++ b/app/boxes.py @@ -1255,7 +1255,7 @@ async def _process_note_object( is_mention = True inbox_object = models.InboxObject( - server=urlparse(ro.ap_id).netloc, + server=urlparse(ro.ap_id).hostname, actor_id=from_actor.id, ap_actor_id=from_actor.ap_id, ap_type=ro.ap_type, @@ -1521,7 +1521,7 @@ async def save_to_inbox( ) inbox_object = models.InboxObject( - server=urlparse(activity_ro.ap_id).netloc, + server=urlparse(activity_ro.ap_id).hostname, actor_id=actor.id, ap_actor_id=actor.ap_id, ap_type=activity_ro.ap_type, @@ -1668,7 +1668,7 @@ async def save_to_inbox( ) announced_object = RemoteObject(announced_raw_object, announced_actor) announced_inbox_object = models.InboxObject( - server=urlparse(announced_object.ap_id).netloc, + server=urlparse(announced_object.ap_id).hostname, actor_id=announced_actor.id, ap_actor_id=announced_actor.ap_id, ap_type=announced_object.ap_type, diff --git a/app/utils/indieauth.py b/app/utils/indieauth.py index 00d2714..44a6b50 100644 --- a/app/utils/indieauth.py +++ b/app/utils/indieauth.py @@ -24,7 +24,7 @@ def _get_prop(props: dict[str, Any], name: str, default=None) -> Any: async def get_client_id_data(url: str) -> IndieAuthClient | None: # Don't fetch localhost URL - if urlparse(url).netloc == "localhost": + if urlparse(url).hostname == "localhost": return IndieAuthClient( logo=None, name=url, diff --git a/app/utils/opengraph.py b/app/utils/opengraph.py index 1735936..426c6a0 100644 --- a/app/utils/opengraph.py +++ b/app/utils/opengraph.py @@ -39,7 +39,7 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None: "title": soup.find("title").text, "image": None, "description": None, - "site_name": urlparse(url).netloc, + "site_name": urlparse(url).hostname, } for field in OpenGraphMeta.__fields__.keys(): og_field = f"og:{field}" @@ -60,7 +60,7 @@ async def external_urls( db_session: AsyncSession, ro: ap_object.RemoteObject | OutboxObject | InboxObject, ) -> set[str]: - note_host = urlparse(ro.ap_id).netloc + note_host = urlparse(ro.ap_id).hostname tags_hrefs = set() for tag in ro.tags: @@ -84,7 +84,7 @@ async def external_urls( mimetype, _ = mimetypes.guess_type(h) if ( ph.scheme in {"http", "https"} - and ph.netloc != note_host + and ph.hostname != note_host and is_url_valid(h) and ( not mimetype diff --git a/app/utils/privacy_replace.py b/app/utils/privacy_replace.py index c3795b6..e815e85 100644 --- a/app/utils/privacy_replace.py +++ b/app/utils/privacy_replace.py @@ -27,11 +27,13 @@ def replace_url(u: str) -> str: try: parsed_href = urlparse(u) + if not parsed_href.hostname: + raise ValueError("Missing hostname") except Exception: logger.warning(f"Failed to parse url={u}") return u - if new_netloc := PRIVACY_REPLACE.get(parsed_href.netloc.removeprefix("www.")): + if new_netloc := PRIVACY_REPLACE.get(parsed_href.hostname.removeprefix("www.")): return parsed_href._replace(netloc=new_netloc).geturl() return u diff --git a/tests/factories.py b/tests/factories.py index 5f57a80..a3693ef 100644 --- a/tests/factories.py +++ b/tests/factories.py @@ -220,7 +220,7 @@ class InboxObjectFactory(factory.alchemy.SQLAlchemyModelFactory): if "published" in ro.ap_object: ap_published_at = isoparse(ro.ap_object["published"]) return cls( - server=urlparse(ro.ap_id).netloc, + server=urlparse(ro.ap_id).hostname, actor_id=actor.id, ap_actor_id=actor.ap_id, ap_type=ro.ap_type,