Switch Markdown parser
parent
5a20b9d23a
commit
881d0ad899
152
app/source.py
152
app/source.py
|
@ -1,52 +1,118 @@
|
||||||
import re
|
import re
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
from markdown import markdown
|
from mistletoe import Document # type: ignore
|
||||||
|
from mistletoe.html_renderer import HTMLRenderer # type: ignore
|
||||||
|
from mistletoe.span_token import SpanToken # type: ignore
|
||||||
|
from pygments import highlight # type: ignore
|
||||||
|
from pygments.formatters import HtmlFormatter # type: ignore
|
||||||
|
from pygments.lexers import get_lexer_by_name as get_lexer # type: ignore
|
||||||
|
from pygments.lexers import guess_lexer # type: ignore
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
|
|
||||||
from app import webfinger
|
from app import webfinger
|
||||||
from app.config import BASE_URL
|
from app.config import BASE_URL
|
||||||
|
from app.config import CODE_HIGHLIGHTING_THEME
|
||||||
from app.database import AsyncSession
|
from app.database import AsyncSession
|
||||||
from app.utils import emoji
|
from app.utils import emoji
|
||||||
|
|
||||||
if typing.TYPE_CHECKING:
|
if typing.TYPE_CHECKING:
|
||||||
from app.actor import Actor
|
from app.actor import Actor
|
||||||
|
|
||||||
|
# Shared Pygments HTML formatter, styled with the configured highlighting theme;
# passed to `highlight()` when code blocks are rendered.
_FORMATTER = HtmlFormatter(style=CODE_HIGHLIGHTING_THEME)
|
||||||
def _set_a_attrs(attrs, new=False):
|
|
||||||
attrs[(None, "target")] = "_blank"
|
|
||||||
attrs[(None, "class")] = "external"
|
|
||||||
attrs[(None, "rel")] = "noopener"
|
|
||||||
attrs[(None, "title")] = attrs[(None, "href")]
|
|
||||||
return attrs
|
|
||||||
|
|
||||||
|
|
||||||
_HASHTAG_REGEX = re.compile(r"(#[\d\w]+)")
|
# Matches a hashtag: "#" followed by one or more word characters. The single
# capture group is the whole tag, "#" included.
_HASHTAG_REGEX = re.compile(r"(#[\d\w]+)")
|
||||||
_MENTION_REGEX = re.compile(r"@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+")
|
# Matches a fully qualified handle of the form "@username@domain.tld".
_MENTION_REGEX = re.compile(r"@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+")
|
||||||
|
|
||||||
|
|
||||||
def hashtagify(content: str) -> tuple[str, list[dict[str, str]]]:
|
class AutoLink(SpanToken):
    """Span token for a bare ``http://`` / ``https://`` URL found in the text.

    The matched URL is kept verbatim in ``target``.
    """

    # Keep the matched text as-is instead of parsing it for nested tokens.
    parse_inner = False
    precedence = 10
    pattern = re.compile(
        "(https?:\\/\\/(?:www\\.)?[-a-zA-Z0-9@:%._\\+~#=]{1,256}\\.[a-zA-Z0-9()]{1,6}\\b(?:[-a-zA-Z0-9()@:%_\\+.~#?&\\/=]*))"  # noqa: E501
    )

    def __init__(self, match_obj: re.Match) -> None:
        """Store the full regex match (the URL) as ``target``."""
        self.target = match_obj.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
class Mention(SpanToken):
    """Span token for a fully qualified handle such as ``@user@example.com``.

    The matched handle is kept verbatim in ``target``.
    """

    # Keep the matched text as-is instead of parsing it for nested tokens.
    parse_inner = False
    precedence = 10
    pattern = re.compile(r"(@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+)")

    def __init__(self, match_obj: re.Match) -> None:
        """Store the full regex match (the handle) as ``target``."""
        self.target = match_obj.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
class Hashtag(SpanToken):
    """Span token for a hashtag such as ``#python``.

    The matched text, leading ``#`` included, is kept verbatim in ``target``.
    """

    # Keep the matched text as-is instead of parsing it for nested tokens.
    parse_inner = False
    precedence = 10
    pattern = re.compile(r"(#[\d\w]+)")

    def __init__(self, match_obj: re.Match) -> None:
        """Store the full regex match (``#`` plus the tag) as ``target``."""
        self.target = match_obj.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
class CustomRenderer(HTMLRenderer):
    """HTML renderer that links URLs, mentions and hashtags and highlights code.

    While rendering, a tag dict (``type`` / ``href`` / ``name``) is appended to
    ``self.tags`` for every resolved mention and every hashtag.
    """

    def __init__(
        self,
        mentioned_actors: typing.Optional[dict[str, "Actor"]] = None,
        enable_mentionify: bool = True,
        enable_hashtagify: bool = True,
    ) -> None:
        """Register the custom span tokens and set up per-render state.

        Args:
            mentioned_actors: Mapping from a mention as written in the text
                (e.g. ``@user@example.com``) to the actor it resolves to.
                ``None`` (the default) means no mention can be resolved.
            enable_mentionify: Register the ``Mention`` token when true.
            enable_hashtagify: Register the ``Hashtag`` token when true.
        """
        extra_tokens = []
        if enable_mentionify:
            extra_tokens.append(Mention)
        if enable_hashtagify:
            extra_tokens.append(Hashtag)
        # AutoLink is always registered; the other tokens are opt-in.
        super().__init__(AutoLink, *extra_tokens)

        self.tags: list[dict[str, str]] = []
        # A fresh dict per instance: a `{}` default in the signature would be
        # one object shared by every renderer.
        self.mentioned_actors = {} if mentioned_actors is None else mentioned_actors

    def render_auto_link(self, token: AutoLink) -> str:
        """Render a bare URL as a link whose text is the (escaped) URL."""
        template = '<a href="{target}" rel="noopener">{inner}</a>'
        target = self.escape_url(token.target)
        return template.format(target=target, inner=target)

    def render_mention(self, token: Mention) -> str:
        """Render a mention as an h-card link and record a ``Mention`` tag.

        A mention with no entry in ``mentioned_actors`` is returned as plain
        text and records no tag.
        """
        from html import escape

        mention = token.target
        actor = self.mentioned_actors.get(mention)
        if not actor:
            return mention

        self.tags.append(dict(type="Mention", href=actor.ap_id, name=mention))

        # Actor fields are interpolated into markup, so escape them for use in
        # an attribute value / text node.
        url = escape(str(actor.url), quote=True)
        handle = escape(str(actor.handle), quote=True)
        link = f'<span class="h-card"><a href="{url}" class="u-url mention">{handle}</a></span>'  # noqa: E501
        return link

    def render_hashtag(self, token: Hashtag) -> str:
        """Render a hashtag as a link to its tag page and record a ``Hashtag`` tag."""
        # Drop the leading "#"; the regex only admits word characters after it.
        tag = token.target[1:]
        link = f'<a href="{BASE_URL}/t/{tag}" class="mention hashtag" rel="tag">#<span>{tag}</span></a>'  # noqa: E501
        self.tags.append(
            dict(href=f"{BASE_URL}/t/{tag}", name=token.target, type="Hashtag")
        )
        return link

    def render_block_code(self, token: typing.Any) -> str:
        """Render a code block as Pygments-highlighted HTML.

        The lexer is looked up from the block's declared language; when no
        language is declared, or the declared one is unknown to Pygments, the
        lexer is guessed from the code itself.
        """
        from pygments.util import ClassNotFound  # type: ignore

        code = token.children[0].content
        lexer = None
        if token.language:
            try:
                lexer = get_lexer(token.language)
            except ClassNotFound:
                # Unknown or misspelled language name: don't abort the whole
                # render, fall back to guessing below.
                lexer = None
        if lexer is None:
            lexer = guess_lexer(code)
        return highlight(code, lexer, _FORMATTER)
|
||||||
|
|
||||||
|
|
||||||
async def _mentionify(
|
async def _prefetch_mentioned_actors(
|
||||||
db_session: AsyncSession,
|
db_session: AsyncSession,
|
||||||
content: str,
|
content: str,
|
||||||
) -> tuple[str, list[dict[str, str]], list["Actor"]]:
|
) -> dict[str, "Actor"]:
|
||||||
from app import models
|
from app import models
|
||||||
from app.actor import fetch_actor
|
from app.actor import fetch_actor
|
||||||
|
|
||||||
tags = []
|
actors = {}
|
||||||
mentioned_actors = []
|
|
||||||
for mention in re.findall(_MENTION_REGEX, content):
|
for mention in re.findall(_MENTION_REGEX, content):
|
||||||
|
if mention in actors:
|
||||||
|
continue
|
||||||
|
|
||||||
_, username, domain = mention.split("@")
|
_, username, domain = mention.split("@")
|
||||||
actor = (
|
actor = (
|
||||||
await db_session.execute(
|
await db_session.execute(
|
||||||
|
@ -63,12 +129,22 @@ async def _mentionify(
|
||||||
continue
|
continue
|
||||||
actor = await fetch_actor(db_session, actor_url)
|
actor = await fetch_actor(db_session, actor_url)
|
||||||
|
|
||||||
mentioned_actors.append(actor)
|
actors[mention] = actor
|
||||||
tags.append(dict(type="Mention", href=actor.ap_id, name=mention))
|
|
||||||
|
|
||||||
link = f'<span class="h-card"><a href="{actor.url}" class="u-url mention">{actor.handle}</a></span>' # noqa: E501
|
return actors
|
||||||
content = content.replace(mention, link)
|
|
||||||
return content, tags, mentioned_actors
|
|
||||||
|
def hashtagify(content: str) -> tuple[str, list[dict[str, str]]]:
    """Replace every hashtag in ``content`` with an HTML link to its tag page.

    Returns:
        The rewritten content, and one tag dict (``href``, ``name``, ``type``)
        per distinct hashtag found.
    """
    # TODO: fix this, switch to mistletoe?
    tags: list[dict[str, str]] = []
    # Reverse lexicographic order places a tag before any tag that is a prefix
    # of it, so "#abc" is substituted before "#ab".
    unique_hashtags = sorted(set(_HASHTAG_REGEX.findall(content)), reverse=True)
    for hashtag in unique_hashtags:
        tag = hashtag[1:]  # strip the leading "#"
        link = f'<a href="{BASE_URL}/t/{tag}" class="mention hashtag" rel="tag">#<span>{tag}</span></a>'  # noqa: E501
        tags.append(
            {"href": f"{BASE_URL}/t/{tag}", "name": hashtag, "type": "Hashtag"}
        )
        content = content.replace(hashtag, link)
    return content, tags
|
||||||
|
|
||||||
|
|
||||||
async def markdownify(
|
async def markdownify(
|
||||||
|
@ -82,17 +158,19 @@ async def markdownify(
|
||||||
|
|
||||||
"""
|
"""
|
||||||
tags = []
|
tags = []
|
||||||
mentioned_actors: list["Actor"] = []
|
mentioned_actors: dict[str, "Actor"] = {}
|
||||||
if enable_hashtagify:
|
|
||||||
content, hashtag_tags = hashtagify(content)
|
|
||||||
tags.extend(hashtag_tags)
|
|
||||||
if enable_mentionify:
|
if enable_mentionify:
|
||||||
content, mention_tags, mentioned_actors = await _mentionify(db_session, content)
|
mentioned_actors = await _prefetch_mentioned_actors(db_session, content)
|
||||||
tags.extend(mention_tags)
|
|
||||||
|
with CustomRenderer(
|
||||||
|
mentioned_actors=mentioned_actors,
|
||||||
|
enable_mentionify=enable_mentionify,
|
||||||
|
enable_hashtagify=enable_hashtagify,
|
||||||
|
) as renderer:
|
||||||
|
rendered_content = renderer.render(Document(content))
|
||||||
|
tags.extend(renderer.tags)
|
||||||
|
|
||||||
# Handle custom emoji
|
# Handle custom emoji
|
||||||
tags.extend(emoji.tags(content))
|
tags.extend(emoji.tags(content))
|
||||||
|
|
||||||
content = markdown(content, extensions=["mdx_linkify", "fenced_code"])
|
return rendered_content, tags, list(mentioned_actors.values())
|
||||||
|
|
||||||
return content, tags, mentioned_actors
|
|
||||||
|
|
|
@ -648,6 +648,14 @@ BeautifulSoup4 = ">=4.6.0"
|
||||||
html5lib = ">=1.0.1"
|
html5lib = ">=1.0.1"
|
||||||
requests = ">=2.18.4"
|
requests = ">=2.18.4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "mistletoe"
|
||||||
|
version = "0.9.0"
|
||||||
|
description = "A fast, extensible Markdown parser in pure Python."
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = "~=3.5"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mypy"
|
name = "mypy"
|
||||||
version = "0.960"
|
version = "0.960"
|
||||||
|
@ -1275,7 +1283,7 @@ dev = ["pytest (>=4.6.2)", "black (>=19.3b0)"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "1.1"
|
lock-version = "1.1"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "84b3a6dcfc055fb0712c6abbf1bf94d9526eda940c4ddb0bd275664e68a4c3e3"
|
content-hash = "bc8585a0da6f4d4e54afafde1da287ed75ed6544981d11bba561a7678bc31b8f"
|
||||||
|
|
||||||
[metadata.files]
|
[metadata.files]
|
||||||
aiosqlite = [
|
aiosqlite = [
|
||||||
|
@ -1832,6 +1840,10 @@ mdx-linkify = [
|
||||||
mf2py = [
|
mf2py = [
|
||||||
{file = "mf2py-1.1.2.tar.gz", hash = "sha256:84f1f8f2ff3f1deb1c30be497e7ccd805452996a662fd4a77f09e0105bede2c9"},
|
{file = "mf2py-1.1.2.tar.gz", hash = "sha256:84f1f8f2ff3f1deb1c30be497e7ccd805452996a662fd4a77f09e0105bede2c9"},
|
||||||
]
|
]
|
||||||
|
mistletoe = [
|
||||||
|
{file = "mistletoe-0.9.0-py3-none-any.whl", hash = "sha256:11316e2fe0be422a8248293ad0efbee9ad0c6f3683b2f45bc6b989ea17a68c74"},
|
||||||
|
{file = "mistletoe-0.9.0.tar.gz", hash = "sha256:3cb96d78226d08f0d3bf09efcaf330d23902492006e18b2c06558e8b86bf7faf"},
|
||||||
|
]
|
||||||
mypy = [
|
mypy = [
|
||||||
{file = "mypy-0.960-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3a3e525cd76c2c4f90f1449fd034ba21fcca68050ff7c8397bb7dd25dd8b8248"},
|
{file = "mypy-0.960-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3a3e525cd76c2c4f90f1449fd034ba21fcca68050ff7c8397bb7dd25dd8b8248"},
|
||||||
{file = "mypy-0.960-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7a76dc4f91e92db119b1be293892df8379b08fd31795bb44e0ff84256d34c251"},
|
{file = "mypy-0.960-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7a76dc4f91e92db119b1be293892df8379b08fd31795bb44e0ff84256d34c251"},
|
||||||
|
|
|
@ -45,6 +45,7 @@ boussole = "^2.0.0"
|
||||||
uvicorn = {extras = ["standard"], version = "^0.18.3"}
|
uvicorn = {extras = ["standard"], version = "^0.18.3"}
|
||||||
Brotli = "^1.0.9"
|
Brotli = "^1.0.9"
|
||||||
greenlet = "^1.1.3"
|
greenlet = "^1.1.3"
|
||||||
|
mistletoe = "^0.9.0"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
black = "^22.3.0"
|
black = "^22.3.0"
|
||||||
|
|
|
@ -179,7 +179,7 @@ def test_send_create_activity__with_attachment(
|
||||||
outbox_object = db.execute(select(models.OutboxObject)).scalar_one()
|
outbox_object = db.execute(select(models.OutboxObject)).scalar_one()
|
||||||
assert outbox_object.ap_type == "Note"
|
assert outbox_object.ap_type == "Note"
|
||||||
assert outbox_object.summary is None
|
assert outbox_object.summary is None
|
||||||
assert outbox_object.content == "<p>hello</p>"
|
assert outbox_object.content == "<p>hello</p>\n"
|
||||||
assert len(outbox_object.attachments) == 1
|
assert len(outbox_object.attachments) == 1
|
||||||
attachment = outbox_object.attachments[0]
|
attachment = outbox_object.attachments[0]
|
||||||
assert attachment.type == "Document"
|
assert attachment.type == "Document"
|
||||||
|
@ -227,7 +227,7 @@ def test_send_create_activity__no_content_with_cw_and_attachments(
|
||||||
outbox_object = db.execute(select(models.OutboxObject)).scalar_one()
|
outbox_object = db.execute(select(models.OutboxObject)).scalar_one()
|
||||||
assert outbox_object.ap_type == "Note"
|
assert outbox_object.ap_type == "Note"
|
||||||
assert outbox_object.summary is None
|
assert outbox_object.summary is None
|
||||||
assert outbox_object.content == "<p>cw</p>"
|
assert outbox_object.content == "<p>cw</p>\n"
|
||||||
assert len(outbox_object.attachments) == 1
|
assert len(outbox_object.attachments) == 1
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue