microblog/app/utils/highlight.py

54 lines
1.8 KiB
Python
Raw Permalink Normal View History

2022-08-29 19:42:54 +00:00
import base64
import hashlib
2022-06-22 18:11:22 +00:00
from functools import lru_cache
from bs4 import BeautifulSoup # type: ignore
from pygments import highlight as phighlight # type: ignore
from pygments.formatters import HtmlFormatter # type: ignore
2022-07-12 20:24:15 +00:00
from pygments.lexers import get_lexer_by_name # type: ignore
2022-06-22 18:11:22 +00:00
from pygments.lexers import guess_lexer # type: ignore
from app.config import CODE_HIGHLIGHTING_THEME
# Pygments HTML formatter built once with the configured theme; it is reused
# for the stylesheet below and for every highlighted code block.
_FORMATTER = HtmlFormatter(style=CODE_HIGHLIGHTING_THEME)
2022-06-22 18:11:22 +00:00
# Style definitions emitted by the formatter for the configured theme.
HIGHLIGHT_CSS = _FORMATTER.get_style_defs()
2022-08-29 19:42:54 +00:00
# Base64 text of the SHA-256 digest of HIGHLIGHT_CSS.
# NOTE(review): presumably consumed as a CSP style hash -- confirm with callers.
HIGHLIGHT_CSS_HASH = base64.b64encode(
    hashlib.sha256(HIGHLIGHT_CSS.encode("utf-8")).digest()
).decode("ascii")
2022-06-22 18:11:22 +00:00
@lru_cache(256)
def highlight(html: str) -> str:
    """Syntax-highlight the ``<pre><code>`` blocks of an HTML fragment.

    Every ``<code>`` element whose direct parent is a ``<pre>`` is processed:

    * When it carries a ``data-microblogpub-lexer`` attribute, the whole
      ``<pre>`` element is replaced by the Pygments rendering of its content,
      using the named lexer (or a guessed one if that name cannot be
      resolved).
    * Otherwise the ``<code>`` element is renamed to ``<div>`` and the
      ``highlight`` class is appended to its classes.

    ``<code>`` elements outside a ``<pre>`` are left untouched. Results are
    memoised per input string by ``lru_cache``.

    Args:
        html: HTML fragment to process.

    Returns:
        The serialized content of the parsed document's ``<body>``.
    """
    # Imported by name: the ``html`` parameter shadows the stdlib module here.
    from html import unescape

    soup = BeautifulSoup(html, "html5lib")
    for code in soup.find_all("code"):
        if code.parent.name != "pre":
            continue

        # Without an explicit lexer hint we only tag the element for styling.
        if "data-microblogpub-lexer" not in code.attrs:
            code.name = "div"
            code["class"] = code.get("class", []) + ["highlight"]
            continue

        # Replace <br> tags with line breaks (Mastodon sends code like this)
        markup = (
            code.encode_contents().decode().replace("<br>", "\n").replace("<br/>", "\n")
        )
        # The serialized content is HTML-escaped and the Pygments formatter
        # escapes its input again, so decode entities first; otherwise
        # ``->``, ``<`` or ``&`` in the code end up double-escaped. This runs
        # after the <br> replacement so an escaped, literal "<br>" in the code
        # is not turned into a line break.
        code_content = unescape(markup)

        # A microblog.pub instance advertises the language through the
        # ``data-microblogpub-lexer`` attribute; fall back to guessing when
        # the advertised name is not a known lexer.
        try:
            lexer = get_lexer_by_name(code.attrs["data-microblogpub-lexer"])
        except Exception:
            lexer = guess_lexer(code_content)

        # Replace the enclosing <pre> with the Pygments output (the first node
        # of the parsed rendering's body).
        rendered = BeautifulSoup(
            phighlight(code_content, lexer, _FORMATTER), "html5lib"
        )
        code.parent.replace_with(rendered.body.next_element)

    return soup.body.encode_contents().decode()