perf: Rework autorefs replacement to not re-parse the final HTML

Define a "transport representation" for autorefs: <span data-mkdocstrings-identifier="SomeIdentifier">SomeHTML</span> First a native Markdown extension translates from the usual `[SomeMarkdown][SomeIdentifier]` to the above, and then the post-process replacement mechanism (which is kept in the same place as before) doesn't need to be so careful and complicated, it just indiscriminately replaces such exact strings. This is a very big boost in performance and I think is more future-proof. Other mkdocstrings handlers are also free to use this mechanism: define whatever syntax for autorefs that they need and then insert this exact HTML themselves, for the postprocessing to pick up later. It used to be possible to insert the square-brackets Markdown before, but that was very fragile. Issue #187: #187 PR #188: #188
mkdocstrings · Dec 6, 2020 · 22a9e4b · 22a9e4b
1 parent ac156d2
commit 22a9e4b
Show file tree

Hide file tree

Showing 7 changed files with 156 additions and 127 deletions.
diff --git a/CREDITS.md b/CREDITS.md
@@ -11,7 +11,6 @@ These projects were used to build `mkdocstrings`. **Thank you!**
 
 ### Direct dependencies
 [`autoflake`](https://github.com/myint/autoflake) |
-[`beautifulsoup4`](http://www.crummy.com/software/BeautifulSoup/bs4/) |
 [`black`](https://github.com/psf/black) |
 [`coverage`](https://github.com/nedbat/coveragepy) |
 [`failprint`](https://github.com/pawamoy/failprint) |
@@ -127,7 +126,6 @@ These projects were used to build `mkdocstrings`. **Thank you!**
 [`smmap`](https://github.com/gitpython-developers/smmap) |
 [`sniffio`](https://github.com/python-trio/sniffio) |
 [`snowballstemmer`](https://github.com/snowballstem/snowball) |
-[`soupsieve`](https://github.com/facelessuser/soupsieve) |
 [`stevedore`](https://docs.openstack.org/stevedore/latest/) |
 [`termcolor`](http://pypi.python.org/pypi/termcolor) |
 [`testfixtures`](https://github.com/Simplistix/testfixtures) |

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,7 @@ include = [
 
 [tool.poetry.dependencies]
 python = "^3.6"
-beautifulsoup4 = "^4.8.2"
+Markdown = "^3.3"
 mkdocs = "^1.1"
 pymdown-extensions = ">=6.3, <9.0"
 pytkdocs = ">=0.2.0, <0.10.0"

diff --git a/src/mkdocstrings/extension.py b/src/mkdocstrings/extension.py
@@ -37,6 +37,7 @@
 
 from mkdocstrings.handlers.base import CollectionError, get_handler
 from mkdocstrings.loggers import get_logger
+from mkdocstrings.references import AutoRefInlineProcessor
 
 log = get_logger(__name__)
 
@@ -304,6 +305,7 @@ class MkdocstringsExtension(Extension):
     """
 
     blockprocessor_priority = 75  # Right before markdown.blockprocessors.HashHeaderProcessor
+    inlineprocessor_priority = 168  # Right after markdown.inlinepatterns.ReferenceInlineProcessor
 
     def __init__(self, config: dict, **kwargs) -> None:
         """
@@ -329,3 +331,5 @@ def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802 (casing: parent me
         md.registerExtension(self)
         processor = AutoDocProcessor(md.parser, md, self._config)
         md.parser.blockprocessors.register(processor, "mkdocstrings", self.blockprocessor_priority)
+        ref_processor = AutoRefInlineProcessor(md)
+        md.inlinePatterns.register(ref_processor, "mkdocstrings", self.inlineprocessor_priority)
diff --git a/src/mkdocstrings/references.py b/src/mkdocstrings/references.py
@@ -1,95 +1,59 @@
 """Cross-references module."""
 
-import random
+import html
 import re
-import string
-from typing import Callable, Dict, List, Match, Pattern, Tuple
+from typing import Callable, Dict, List, Match, Tuple
+from xml.etree.ElementTree import Element
 
-from bs4 import BeautifulSoup, NavigableString
+from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor
 
-AUTO_REF: Pattern = re.compile(r"\[(?P<title>.+?)\]\[(?P<identifier>.*?)\]")
+AUTO_REF_RE = re.compile(r'<span data-mkdocstrings-identifier="(?P<identifier>[^"<>]*)">(?P<title>.*?)</span>')
 """
-A regular expression to match unresolved Markdown references
+A regular expression to match mkdocstrings' special reference markers
 in the [`on_post_page` hook][mkdocstrings.plugin.MkdocstringsPlugin.on_post_page].
 """
 
 
-class Placeholder:
-    """
-    This class is used as a placeholder store.
-
-    Placeholders are random, unique strings that temporarily replace `<code>` nodes in an HTML tree.
+class AutoRefInlineProcessor(ReferenceInlineProcessor):
+    def __init__(self, *args, **kwargs):
+        super().__init__(REFERENCE_RE, *args, **kwargs)
 
-    Why do we replace these nodes with such strings? Because we want to fix cross-references that were not
-    resolved during Markdown conversion, and we must never touch to what's inside of a code block.
-    To ease the process, instead of selecting nodes in the HTML tree with complex filters (I tried, believe me),
-    we simply "hide" the code nodes, and bulk-replace unresolved cross-references in the whole HTML text at once,
-    with a regular expression substitution. Once it's done, we bulk-replace code nodes back, with a regular expression
-    substitution again.
-    """
+    # Code based on https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780
 
-    def __init__(self, seed_length: int = 16) -> None:
-        """
-        Initialize the object.
+    def handleMatch(self, m, data):
+        text, index, handled = self.getText(data, m.end(0))
+        if not handled:
+            return None, None, None
 
-        Arguments:
-            seed_length: Length of the seed.
-        """
-        self._store: List[str] = []
-        self.seed = ""
-        self.seed_length = seed_length
-        self.set_seed()
+        identifier, end, handled = self.evalId(data, index, text)
+        if not handled:
+            return None, None, None
 
-    def store(self, value: str) -> str:
-        """
-        Store a text under a unique ID, return that ID.
+        if re.search(r"[/ \x00-\x1f]", identifier):
+            # Do nothing if the matched reference contains:
+            # - a space, slash or control character (considered unintended);
+            # - specifically \x01 is used by Python-Markdown HTML stash when there's inline formatting,
+            #   but references with Markdown formatting are not possible anyway.
+            return None, m.start(0), end
 
-        Arguments:
-            value: The text to store.
-
-        Returns:
-            The ID under which the text is stored.
-        """
-        new_id = f"\x02{self.seed}{len(self._store)}\x03"
-        self._store.append(value)
-        return new_id
-
-    def set_seed(self) -> None:
-        """Reset the seed in `self.seed` with a random string."""
-        alphabet = string.ascii_letters + string.digits
-        self.seed = "".join(random.choices(alphabet, k=self.seed_length))
-
-    def replace_code_tags(self, soup: BeautifulSoup) -> None:
-        """
-        Recursively replace code nodes with navigable strings whose values are unique IDs.
+        return self.makeTag(identifier, text), m.start(0), end
 
-        Arguments:
-            soup: The root tag of a BeautifulSoup HTML tree.
-        """
-        self._recursive_replace(soup)
+    def evalId(self, data, index, text):
+        m = self.RE_LINK.match(data, pos=index)
+        if not m:
+            return None, index, False
+        identifier = m.group(1) or text
+        end = m.end(0)
+        return identifier, end, True
 
-    def restore_code_tags(self, soup_str: str) -> str:
+    def makeTag(self, identifier, text):
         """
-        Restore code nodes previously replaced by unique placeholders.
-
-        Arguments:
-            soup_str: HTML text.
-
-        Returns:
-            The same HTML text with placeholders replaced by their respective original code nodes.
+        Creates a tag that can be matched by `AUTO_REF_RE`.
         """
-        return re.sub(rf"\x02{self.seed}(\d+)\x03", self._replace_id_with_value, soup_str)
-
-    def _replace_id_with_value(self, match):
-        return self._store[int(match.group(1))]
-
-    def _recursive_replace(self, tag):
-        if hasattr(tag, "contents"):  # noqa: WPS421 (builtin function call, special cases only)
-            for index, child in enumerate(tag.contents):
-                if child.name == "code":
-                    tag.contents[index] = NavigableString(self.store(str(child)))
-                else:
-                    self._recursive_replace(child)
+        el = Element("span")
+        el.set("data-mkdocstrings-identifier", identifier)
+        el.text = text
+        return el
 
 
 def relative_url(url_a: str, url_b: str) -> str:
@@ -131,9 +95,6 @@ def fix_ref(url_map: Dict[str, str], from_url: str, unmapped: List[str]) -> Call
 
     In our context, we match Markdown references and replace them with HTML links.
 
-    When the matched reference's identifier contains a space or slash, we do nothing as we consider it
-    to be unintended.
-
     When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
     `unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
     and rendered, making it impossible to link to it. We catch this exception in the caller to issue a warning.
@@ -149,24 +110,18 @@ def fix_ref(url_map: Dict[str, str], from_url: str, unmapped: List[str]) -> Call
     """
 
     def inner(match: Match):  # noqa: WPS430 (nested function, no other way than side-effecting the warnings)
-        groups = match.groupdict()
-        identifier = groups["identifier"]
-        title = groups["title"]
-
-        if title and not identifier:
-            identifier, title = title, identifier
+        identifier = match["identifier"]
+        title = match["title"]
 
         try:
-            url = relative_url(from_url, url_map[identifier])
+            url = relative_url(from_url, url_map[html.unescape(identifier)])
         except KeyError:
-            if " " not in identifier and "/" not in identifier:
-                unmapped.append(identifier)
-
-            if not title:
+            unmapped.append(identifier)
+            if title == identifier:
                 return f"[{identifier}][]"
             return f"[{title}][{identifier}]"
 
-        return f'<a href="{url}">{title or identifier}</a>'
+        return f'<a href="{html.escape(url)}">{title}</a>'
 
     return inner
 
@@ -188,16 +143,5 @@ def fix_refs(
         The fixed HTML.
     """
     unmapped = []  # type: ignore
-    if not AUTO_REF.search(html):
-        return html, unmapped
-
-    urls = "\n".join(set(url_map.values()))
-    placeholder = Placeholder()
-    while re.search(placeholder.seed, html) or placeholder.seed in urls:
-        placeholder.set_seed()
-
-    soup = BeautifulSoup(html, "html.parser")
-    placeholder.replace_code_tags(soup)
-    fixed_soup = AUTO_REF.sub(fix_ref(url_map, from_url, unmapped), str(soup))
-
-    return placeholder.restore_code_tags(fixed_soup), unmapped
+    html = AUTO_REF_RE.sub(fix_ref(url_map, from_url, unmapped), html)
+    return html, unmapped
diff --git a/tests/fixtures/cross_reference.py b/tests/fixtures/cross_reference.py
@@ -0,0 +1,3 @@
+"""
+Link to [something.Else][].
+"""
diff --git a/tests/test_extension.py b/tests/test_extension.py
@@ -1,17 +1,29 @@
 """Tests for the extension module."""
-
 from markdown import Markdown
 
 from mkdocstrings.extension import MkdocstringsExtension
 
+_DEFAULT_CONFIG = {
+    "theme_name": "material",
+    "mdx": [],
+    "mdx_configs": {},
+    "mkdocstrings": {"default_handler": "python", "custom_templates": None, "watch": [], "handlers": {}},
+}
+
 
 def test_render_html_escaped_sequences():
     """Assert HTML-escaped sequences are correctly parsed as XML."""
-    config = {
-        "theme_name": "material",
-        "mdx": [],
-        "mdx_configs": {},
-        "mkdocstrings": {"default_handler": "python", "custom_templates": None, "watch": [], "handlers": {}},
-    }
-    md = Markdown(extensions=[MkdocstringsExtension(config)])
+    md = Markdown(extensions=[MkdocstringsExtension(_DEFAULT_CONFIG)])
     md.convert("::: tests.fixtures.html_escaped_sequences")
+
+
+def test_reference_inside_autodoc():
+    config = dict(_DEFAULT_CONFIG)
+    ext = MkdocstringsExtension(config)
+    config["mdx"].append(ext)
+
+    md = Markdown(extensions=[ext])
+
+    output = md.convert("::: tests.fixtures.cross_reference")
+    snippet = 'Link to <span data-mkdocstrings-identifier="something.Else">something.Else</span>.'
+    assert snippet in output
diff --git a/tests/test_references.py b/tests/test_references.py
@@ -1,8 +1,9 @@
 """Tests for the references module."""
+import markdown
 import pytest
-from bs4 import BeautifulSoup
 
-from mkdocstrings.references import Placeholder, relative_url
+from mkdocstrings.extension import MkdocstringsExtension
+from mkdocstrings.references import fix_refs, relative_url
 
 
 @pytest.mark.parametrize(
@@ -37,19 +38,86 @@ def test_relative_url(current_url, to_url, href_url):
     assert relative_url(current_url, to_url) == href_url
 
 
-@pytest.mark.parametrize("html", ["foo<code>code content</code>4"])
-def test_placeholder(html):
-    """
-    Test the references "fixing" mechanism.
+def run_references_test(url_map, source, output, unmapped=None, from_url="page.html"):
+    ext = MkdocstringsExtension({})
+    md = markdown.Markdown(extensions=[ext])
+    content = md.convert(source)
+    actual_output, actual_unmapped = fix_refs(content, from_url, url_map)
+    assert actual_output == output
+    assert actual_unmapped == (unmapped or [])
 
-    Arguments:
-        html: HTML contents in which to fix references.
-    """
-    placeholder = Placeholder()
 
-    soup = BeautifulSoup(html, "html.parser")
-    placeholder.replace_code_tags(soup)
-    html_with_placeholders = str(soup).replace("code content", "should not be replaced")
+def test_reference_implicit():
+    run_references_test(
+        url_map={"Foo": "foo.html#Foo"},
+        source="This [Foo][].",
+        output='<p>This <a href="foo.html#Foo">Foo</a>.</p>',
+    )
+
+
+def test_reference_explicit_with_markdown_text():
+    run_references_test(
+        url_map={"Foo": "foo.html#Foo"},
+        source="This [`Foo`][Foo].",
+        output='<p>This <a href="foo.html#Foo"><code>Foo</code></a>.</p>',
+    )
+
+
+def test_reference_with_punctuation():
+    run_references_test(
+        url_map={'Foo&"bar': 'foo.html#Foo&"bar'},
+        source='This [Foo&"bar][].',
+        output='<p>This <a href="foo.html#Foo&amp;&quot;bar">Foo&amp;"bar</a>.</p>',
+    )
+
+
+def test_no_reference_with_space():
+    run_references_test(
+        url_map={"Foo bar": "foo.html#Foo bar"},
+        source="This `[Foo bar][]`.",
+        output="<p>This <code>[Foo bar][]</code>.</p>",
+    )
+
+
+def test_no_reference_inside_markdown():
+    run_references_test(
+        url_map={"Foo": "foo.html#Foo"},
+        source="This `[Foo][]`.",
+        output="<p>This <code>[Foo][]</code>.</p>",
+    )
+
+
+def test_missing_reference():
+    run_references_test(
+        url_map={"NotFoo": "foo.html#NotFoo"},
+        source="[Foo][]",
+        output="<p>[Foo][]</p>",
+        unmapped=["Foo"],
+    )
+
+
+def test_missing_reference_with_markdown_text():
+    run_references_test(
+        url_map={"NotFoo": "foo.html#NotFoo"},
+        source="[`Foo`][Foo]",
+        output="<p>[<code>Foo</code>][Foo]</p>",
+        unmapped=["Foo"],
+    )
+
+
+def test_missing_reference_with_markdown_id():
+    run_references_test(
+        url_map={"NotFoo": "foo.html#NotFoo"},
+        source="[Foo][*oh*]",
+        output="<p>[Foo][*oh*]</p>",
+        unmapped=["*oh*"],
+    )
+
 
-    html_restored = placeholder.restore_code_tags(html_with_placeholders)
-    assert html == html_restored
+def test_missing_reference_with_markdown_implicit():
+    run_references_test(
+        url_map={"Foo": "foo.html#Foo"},
+        source="[`Foo`][]",
+        output="<p>[<code>Foo</code>][]</p>",
+        unmapped=[],
+    )