Tweak URL parsing

main
Thomas Sileo 2022-08-15 10:27:58 +02:00
parent 1e6a290fb3
commit 4f98ff6bbf
6 changed files with 12 additions and 10 deletions

View File

@@ -116,7 +116,7 @@ class Actor:
@cached_property @cached_property
def server(self) -> str: def server(self) -> str:
return urlparse(self.ap_id).netloc return urlparse(self.ap_id).hostname # type: ignore
class RemoteActor(Actor): class RemoteActor(Actor):

View File

@@ -1255,7 +1255,7 @@ async def _process_note_object(
is_mention = True is_mention = True
inbox_object = models.InboxObject( inbox_object = models.InboxObject(
server=urlparse(ro.ap_id).netloc, server=urlparse(ro.ap_id).hostname,
actor_id=from_actor.id, actor_id=from_actor.id,
ap_actor_id=from_actor.ap_id, ap_actor_id=from_actor.ap_id,
ap_type=ro.ap_type, ap_type=ro.ap_type,
@@ -1521,7 +1521,7 @@ async def save_to_inbox(
) )
inbox_object = models.InboxObject( inbox_object = models.InboxObject(
server=urlparse(activity_ro.ap_id).netloc, server=urlparse(activity_ro.ap_id).hostname,
actor_id=actor.id, actor_id=actor.id,
ap_actor_id=actor.ap_id, ap_actor_id=actor.ap_id,
ap_type=activity_ro.ap_type, ap_type=activity_ro.ap_type,
@@ -1668,7 +1668,7 @@ async def save_to_inbox(
) )
announced_object = RemoteObject(announced_raw_object, announced_actor) announced_object = RemoteObject(announced_raw_object, announced_actor)
announced_inbox_object = models.InboxObject( announced_inbox_object = models.InboxObject(
server=urlparse(announced_object.ap_id).netloc, server=urlparse(announced_object.ap_id).hostname,
actor_id=announced_actor.id, actor_id=announced_actor.id,
ap_actor_id=announced_actor.ap_id, ap_actor_id=announced_actor.ap_id,
ap_type=announced_object.ap_type, ap_type=announced_object.ap_type,

View File

@@ -24,7 +24,7 @@ def _get_prop(props: dict[str, Any], name: str, default=None) -> Any:
async def get_client_id_data(url: str) -> IndieAuthClient | None: async def get_client_id_data(url: str) -> IndieAuthClient | None:
# Don't fetch localhost URL # Don't fetch localhost URL
if urlparse(url).netloc == "localhost": if urlparse(url).hostname == "localhost":
return IndieAuthClient( return IndieAuthClient(
logo=None, logo=None,
name=url, name=url,

View File

@@ -39,7 +39,7 @@ def _scrap_og_meta(url: str, html: str) -> OpenGraphMeta | None:
"title": soup.find("title").text, "title": soup.find("title").text,
"image": None, "image": None,
"description": None, "description": None,
"site_name": urlparse(url).netloc, "site_name": urlparse(url).hostname,
} }
for field in OpenGraphMeta.__fields__.keys(): for field in OpenGraphMeta.__fields__.keys():
og_field = f"og:{field}" og_field = f"og:{field}"
@@ -60,7 +60,7 @@ async def external_urls(
db_session: AsyncSession, db_session: AsyncSession,
ro: ap_object.RemoteObject | OutboxObject | InboxObject, ro: ap_object.RemoteObject | OutboxObject | InboxObject,
) -> set[str]: ) -> set[str]:
note_host = urlparse(ro.ap_id).netloc note_host = urlparse(ro.ap_id).hostname
tags_hrefs = set() tags_hrefs = set()
for tag in ro.tags: for tag in ro.tags:
@@ -84,7 +84,7 @@ async def external_urls(
mimetype, _ = mimetypes.guess_type(h) mimetype, _ = mimetypes.guess_type(h)
if ( if (
ph.scheme in {"http", "https"} ph.scheme in {"http", "https"}
and ph.netloc != note_host and ph.hostname != note_host
and is_url_valid(h) and is_url_valid(h)
and ( and (
not mimetype not mimetype

View File

@@ -27,11 +27,13 @@ def replace_url(u: str) -> str:
try: try:
parsed_href = urlparse(u) parsed_href = urlparse(u)
if not parsed_href.hostname:
raise ValueError("Missing hostname")
except Exception: except Exception:
logger.warning(f"Failed to parse url={u}") logger.warning(f"Failed to parse url={u}")
return u return u
if new_netloc := PRIVACY_REPLACE.get(parsed_href.netloc.removeprefix("www.")): if new_netloc := PRIVACY_REPLACE.get(parsed_href.hostname.removeprefix("www.")):
return parsed_href._replace(netloc=new_netloc).geturl() return parsed_href._replace(netloc=new_netloc).geturl()
return u return u

View File

@@ -220,7 +220,7 @@ class InboxObjectFactory(factory.alchemy.SQLAlchemyModelFactory):
if "published" in ro.ap_object: if "published" in ro.ap_object:
ap_published_at = isoparse(ro.ap_object["published"]) ap_published_at = isoparse(ro.ap_object["published"])
return cls( return cls(
server=urlparse(ro.ap_id).netloc, server=urlparse(ro.ap_id).hostname,
actor_id=actor.id, actor_id=actor.id,
ap_actor_id=actor.ap_id, ap_actor_id=actor.ap_id,
ap_type=ro.ap_type, ap_type=ro.ap_type,