Skip to content

Commit

Permalink
perf: Rework autorefs replacement to not re-parse the final HTML
Browse files Browse the repository at this point in the history
Define a "transport representation" for autorefs:

    <span data-mkdocstrings-identifier="SomeIdentifier">SomeHTML</span>

First a native Markdown extension translates from the usual `[SomeMarkdown][SomeIdentifier]` to the above, and then the post-process replacement mechanism (which is kept in the same place as before) doesn't need to be so careful and complicated, it just indiscriminately replaces such exact strings.

This is a very big boost in performance and I think is more future-proof.

Other mkdocstrings handlers are also free to use this mechanism: define whatever syntax for autorefs that they need and then insert this exact HTML themselves, for the postprocessing to pick up later. It used to be possible to insert the square-brackets Markdown before, but that was very fragile.

Issue #187: #187
PR #188: #188
  • Loading branch information
oprypin authored Dec 6, 2020
1 parent ac156d2 commit 22a9e4b
Show file tree
Hide file tree
Showing 7 changed files with 156 additions and 127 deletions.
2 changes: 0 additions & 2 deletions CREDITS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ These projects were used to build `mkdocstrings`. **Thank you!**

### Direct dependencies
[`autoflake`](https://github.com/myint/autoflake) |
[`beautifulsoup4`](http://www.crummy.com/software/BeautifulSoup/bs4/) |
[`black`](https://github.com/psf/black) |
[`coverage`](https://github.com/nedbat/coveragepy) |
[`failprint`](https://github.com/pawamoy/failprint) |
Expand Down Expand Up @@ -127,7 +126,6 @@ These projects were used to build `mkdocstrings`. **Thank you!**
[`smmap`](https://github.com/gitpython-developers/smmap) |
[`sniffio`](https://github.com/python-trio/sniffio) |
[`snowballstemmer`](https://github.com/snowballstem/snowball) |
[`soupsieve`](https://github.com/facelessuser/soupsieve) |
[`stevedore`](https://docs.openstack.org/stevedore/latest/) |
[`termcolor`](http://pypi.python.org/pypi/termcolor) |
[`testfixtures`](https://github.com/Simplistix/testfixtures) |
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ include = [

[tool.poetry.dependencies]
python = "^3.6"
beautifulsoup4 = "^4.8.2"
Markdown = "^3.3"
mkdocs = "^1.1"
pymdown-extensions = ">=6.3, <9.0"
pytkdocs = ">=0.2.0, <0.10.0"
Expand Down
4 changes: 4 additions & 0 deletions src/mkdocstrings/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

from mkdocstrings.handlers.base import CollectionError, get_handler
from mkdocstrings.loggers import get_logger
from mkdocstrings.references import AutoRefInlineProcessor

log = get_logger(__name__)

Expand Down Expand Up @@ -304,6 +305,7 @@ class MkdocstringsExtension(Extension):
"""

blockprocessor_priority = 75 # Right before markdown.blockprocessors.HashHeaderProcessor
inlineprocessor_priority = 168 # Right after markdown.inlinepatterns.ReferenceInlineProcessor

def __init__(self, config: dict, **kwargs) -> None:
"""
Expand All @@ -329,3 +331,5 @@ def extendMarkdown(self, md: Markdown) -> None: # noqa: N802 (casing: parent me
md.registerExtension(self)
processor = AutoDocProcessor(md.parser, md, self._config)
md.parser.blockprocessors.register(processor, "mkdocstrings", self.blockprocessor_priority)
ref_processor = AutoRefInlineProcessor(md)
md.inlinePatterns.register(ref_processor, "mkdocstrings", self.inlineprocessor_priority)
146 changes: 45 additions & 101 deletions src/mkdocstrings/references.py
Original file line number Diff line number Diff line change
@@ -1,95 +1,59 @@
"""Cross-references module."""

import random
import html
import re
import string
from typing import Callable, Dict, List, Match, Pattern, Tuple
from typing import Callable, Dict, List, Match, Tuple
from xml.etree.ElementTree import Element

from bs4 import BeautifulSoup, NavigableString
from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor

AUTO_REF: Pattern = re.compile(r"\[(?P<title>.+?)\]\[(?P<identifier>.*?)\]")
AUTO_REF_RE = re.compile(r'<span data-mkdocstrings-identifier="(?P<identifier>[^"<>]*)">(?P<title>.*?)</span>')
"""
A regular expression to match unresolved Markdown references
A regular expression to match mkdocstrings' special reference markers
in the [`on_post_page` hook][mkdocstrings.plugin.MkdocstringsPlugin.on_post_page].
"""


class Placeholder:
"""
This class is used as a placeholder store.
Placeholders are random, unique strings that temporarily replace `<code>` nodes in an HTML tree.
class AutoRefInlineProcessor(ReferenceInlineProcessor):
def __init__(self, *args, **kwargs):
super().__init__(REFERENCE_RE, *args, **kwargs)

Why do we replace these nodes with such strings? Because we want to fix cross-references that were not
resolved during Markdown conversion, and we must never touch to what's inside of a code block.
To ease the process, instead of selecting nodes in the HTML tree with complex filters (I tried, believe me),
we simply "hide" the code nodes, and bulk-replace unresolved cross-references in the whole HTML text at once,
with a regular expression substitution. Once it's done, we bulk-replace code nodes back, with a regular expression
substitution again.
"""
# Code based on https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780

def __init__(self, seed_length: int = 16) -> None:
"""
Initialize the object.
def handleMatch(self, m, data):
text, index, handled = self.getText(data, m.end(0))
if not handled:
return None, None, None

Arguments:
seed_length: Length of the seed.
"""
self._store: List[str] = []
self.seed = ""
self.seed_length = seed_length
self.set_seed()
identifier, end, handled = self.evalId(data, index, text)
if not handled:
return None, None, None

def store(self, value: str) -> str:
"""
Store a text under a unique ID, return that ID.
if re.search(r"[/ \x00-\x1f]", identifier):
# Do nothing if the matched reference contains:
# - a space, slash or control character (considered unintended);
# - specifically \x01 is used by Python-Markdown HTML stash when there's inline formatting,
# but references with Markdown formatting are not possible anyway.
return None, m.start(0), end

Arguments:
value: The text to store.
Returns:
The ID under which the text is stored.
"""
new_id = f"\x02{self.seed}{len(self._store)}\x03"
self._store.append(value)
return new_id

def set_seed(self) -> None:
"""Reset the seed in `self.seed` with a random string."""
alphabet = string.ascii_letters + string.digits
self.seed = "".join(random.choices(alphabet, k=self.seed_length))

def replace_code_tags(self, soup: BeautifulSoup) -> None:
"""
Recursively replace code nodes with navigable strings whose values are unique IDs.
return self.makeTag(identifier, text), m.start(0), end

Arguments:
soup: The root tag of a BeautifulSoup HTML tree.
"""
self._recursive_replace(soup)
def evalId(self, data, index, text):
m = self.RE_LINK.match(data, pos=index)
if not m:
return None, index, False
identifier = m.group(1) or text
end = m.end(0)
return identifier, end, True

def restore_code_tags(self, soup_str: str) -> str:
def makeTag(self, identifier, text):
"""
Restore code nodes previously replaced by unique placeholders.
Arguments:
soup_str: HTML text.
Returns:
The same HTML text with placeholders replaced by their respective original code nodes.
Creates a tag that can be matched by `AUTO_REF_RE`.
"""
return re.sub(rf"\x02{self.seed}(\d+)\x03", self._replace_id_with_value, soup_str)

def _replace_id_with_value(self, match):
return self._store[int(match.group(1))]

def _recursive_replace(self, tag):
if hasattr(tag, "contents"): # noqa: WPS421 (builtin function call, special cases only)
for index, child in enumerate(tag.contents):
if child.name == "code":
tag.contents[index] = NavigableString(self.store(str(child)))
else:
self._recursive_replace(child)
el = Element("span")
el.set("data-mkdocstrings-identifier", identifier)
el.text = text
return el


def relative_url(url_a: str, url_b: str) -> str:
Expand Down Expand Up @@ -131,9 +95,6 @@ def fix_ref(url_map: Dict[str, str], from_url: str, unmapped: List[str]) -> Call
In our context, we match Markdown references and replace them with HTML links.
When the matched reference's identifier contains a space or slash, we do nothing as we consider it
to be unintended.
When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
`unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
and rendered, making it impossible to link to it. We catch this exception in the caller to issue a warning.
Expand All @@ -149,24 +110,18 @@ def fix_ref(url_map: Dict[str, str], from_url: str, unmapped: List[str]) -> Call
"""

def inner(match: Match): # noqa: WPS430 (nested function, no other way than side-effecting the warnings)
groups = match.groupdict()
identifier = groups["identifier"]
title = groups["title"]

if title and not identifier:
identifier, title = title, identifier
identifier = match["identifier"]
title = match["title"]

try:
url = relative_url(from_url, url_map[identifier])
url = relative_url(from_url, url_map[html.unescape(identifier)])
except KeyError:
if " " not in identifier and "/" not in identifier:
unmapped.append(identifier)

if not title:
unmapped.append(identifier)
if title == identifier:
return f"[{identifier}][]"
return f"[{title}][{identifier}]"

return f'<a href="{url}">{title or identifier}</a>'
return f'<a href="{html.escape(url)}">{title}</a>'

return inner

Expand All @@ -188,16 +143,5 @@ def fix_refs(
The fixed HTML.
"""
unmapped = [] # type: ignore
if not AUTO_REF.search(html):
return html, unmapped

urls = "\n".join(set(url_map.values()))
placeholder = Placeholder()
while re.search(placeholder.seed, html) or placeholder.seed in urls:
placeholder.set_seed()

soup = BeautifulSoup(html, "html.parser")
placeholder.replace_code_tags(soup)
fixed_soup = AUTO_REF.sub(fix_ref(url_map, from_url, unmapped), str(soup))

return placeholder.restore_code_tags(fixed_soup), unmapped
html = AUTO_REF_RE.sub(fix_ref(url_map, from_url, unmapped), html)
return html, unmapped
3 changes: 3 additions & 0 deletions tests/fixtures/cross_reference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""
Link to [something.Else][].
"""
28 changes: 20 additions & 8 deletions tests/test_extension.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
"""Tests for the extension module."""

from markdown import Markdown

from mkdocstrings.extension import MkdocstringsExtension

_DEFAULT_CONFIG = {
"theme_name": "material",
"mdx": [],
"mdx_configs": {},
"mkdocstrings": {"default_handler": "python", "custom_templates": None, "watch": [], "handlers": {}},
}


def test_render_html_escaped_sequences():
"""Assert HTML-escaped sequences are correctly parsed as XML."""
config = {
"theme_name": "material",
"mdx": [],
"mdx_configs": {},
"mkdocstrings": {"default_handler": "python", "custom_templates": None, "watch": [], "handlers": {}},
}
md = Markdown(extensions=[MkdocstringsExtension(config)])
md = Markdown(extensions=[MkdocstringsExtension(_DEFAULT_CONFIG)])
md.convert("::: tests.fixtures.html_escaped_sequences")


def test_reference_inside_autodoc():
config = dict(_DEFAULT_CONFIG)
ext = MkdocstringsExtension(config)
config["mdx"].append(ext)

md = Markdown(extensions=[ext])

output = md.convert("::: tests.fixtures.cross_reference")
snippet = 'Link to <span data-mkdocstrings-identifier="something.Else">something.Else</span>.'
assert snippet in output
98 changes: 83 additions & 15 deletions tests/test_references.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Tests for the references module."""
import markdown
import pytest
from bs4 import BeautifulSoup

from mkdocstrings.references import Placeholder, relative_url
from mkdocstrings.extension import MkdocstringsExtension
from mkdocstrings.references import fix_refs, relative_url


@pytest.mark.parametrize(
Expand Down Expand Up @@ -37,19 +38,86 @@ def test_relative_url(current_url, to_url, href_url):
assert relative_url(current_url, to_url) == href_url


@pytest.mark.parametrize("html", ["foo<code>code content</code>4"])
def test_placeholder(html):
"""
Test the references "fixing" mechanism.
def run_references_test(url_map, source, output, unmapped=None, from_url="page.html"):
ext = MkdocstringsExtension({})
md = markdown.Markdown(extensions=[ext])
content = md.convert(source)
actual_output, actual_unmapped = fix_refs(content, from_url, url_map)
assert actual_output == output
assert actual_unmapped == (unmapped or [])

Arguments:
html: HTML contents in which to fix references.
"""
placeholder = Placeholder()

soup = BeautifulSoup(html, "html.parser")
placeholder.replace_code_tags(soup)
html_with_placeholders = str(soup).replace("code content", "should not be replaced")
def test_reference_implicit():
run_references_test(
url_map={"Foo": "foo.html#Foo"},
source="This [Foo][].",
output='<p>This <a href="foo.html#Foo">Foo</a>.</p>',
)


def test_reference_explicit_with_markdown_text():
run_references_test(
url_map={"Foo": "foo.html#Foo"},
source="This [`Foo`][Foo].",
output='<p>This <a href="foo.html#Foo"><code>Foo</code></a>.</p>',
)


def test_reference_with_punctuation():
run_references_test(
url_map={'Foo&"bar': 'foo.html#Foo&"bar'},
source='This [Foo&"bar][].',
output='<p>This <a href="foo.html#Foo&amp;&quot;bar">Foo&amp;"bar</a>.</p>',
)


def test_no_reference_with_space():
run_references_test(
url_map={"Foo bar": "foo.html#Foo bar"},
source="This `[Foo bar][]`.",
output="<p>This <code>[Foo bar][]</code>.</p>",
)


def test_no_reference_inside_markdown():
run_references_test(
url_map={"Foo": "foo.html#Foo"},
source="This `[Foo][]`.",
output="<p>This <code>[Foo][]</code>.</p>",
)


def test_missing_reference():
run_references_test(
url_map={"NotFoo": "foo.html#NotFoo"},
source="[Foo][]",
output="<p>[Foo][]</p>",
unmapped=["Foo"],
)


def test_missing_reference_with_markdown_text():
run_references_test(
url_map={"NotFoo": "foo.html#NotFoo"},
source="[`Foo`][Foo]",
output="<p>[<code>Foo</code>][Foo]</p>",
unmapped=["Foo"],
)


def test_missing_reference_with_markdown_id():
run_references_test(
url_map={"NotFoo": "foo.html#NotFoo"},
source="[Foo][*oh*]",
output="<p>[Foo][*oh*]</p>",
unmapped=["*oh*"],
)


html_restored = placeholder.restore_code_tags(html_with_placeholders)
assert html == html_restored
def test_missing_reference_with_markdown_implicit():
run_references_test(
url_map={"Foo": "foo.html#Foo"},
source="[`Foo`][]",
output="<p>[<code>Foo</code>][]</p>",
unmapped=[],
)

0 comments on commit 22a9e4b

Please sign in to comment.