Coverage for src/mkdocs_autorefs/references.py: 92.84%
263 statements
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-10 16:32 +0100
« prev ^ index » next coverage.py v7.6.10, created at 2025-01-10 16:32 +0100
1"""Cross-references module."""
3from __future__ import annotations
5import logging
6import re
7import warnings
8from abc import ABC, abstractmethod
9from dataclasses import dataclass
10from functools import lru_cache
11from html import escape, unescape
12from html.parser import HTMLParser
13from typing import TYPE_CHECKING, Any, Callable, ClassVar
14from urllib.parse import urlsplit
15from xml.etree.ElementTree import Element
17from markdown.core import Markdown
18from markdown.extensions import Extension
19from markdown.extensions.toc import slugify
20from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor
21from markdown.treeprocessors import Treeprocessor
22from markdown.util import HTML_PLACEHOLDER_RE, INLINE_PLACEHOLDER_RE
23from markupsafe import Markup
25if TYPE_CHECKING:
26 from collections.abc import Iterable
27 from pathlib import Path
28 from re import Match
30 from markdown import Markdown
32 from mkdocs_autorefs.plugin import AutorefsPlugin
# Module logger: use the helper provided by MkDocs when it can be imported,
# otherwise fall back to a standard-library logger under the same namespace.
try:
    from mkdocs.plugins import get_plugin_logger
except ImportError:
    # TODO: remove once support for MkDocs <1.5 is dropped
    log = logging.getLogger(f"mkdocs.plugins.{__name__}")  # type: ignore[assignment]
else:
    log = get_plugin_logger(__name__)
43# YORE: Bump 2: Remove block.
def __getattr__(name: str) -> Any:
    """Serve the deprecated `AutoRefInlineProcessor` module attribute.

    Python calls this module-level hook when a normal attribute lookup on the module fails.

    Parameters:
        name: The attribute name that was requested.

    Returns:
        The `AutorefsInlineProcessor` class when the old name is requested
        (a `DeprecationWarning` is emitted first).

    Raises:
        AttributeError: For any other name.
    """
    if name != "AutoRefInlineProcessor":
        raise AttributeError(f"module 'mkdocs_autorefs.references' has no attribute {name}")
    warnings.warn("AutoRefInlineProcessor was renamed AutorefsInlineProcessor", DeprecationWarning, stacklevel=2)
    return AutorefsInlineProcessor
# An HTML attribute value: either double-quoted, or a bare run without quotes, angle brackets or spaces.
_ATTR_VALUE = r'"[^"<>]+"|[^"<> ]+'  # Possibly with double quotes around

# YORE: Bump 2: Remove block.
AUTO_REF_RE = re.compile(
    rf"<span data-(?P<kind>autorefs-(?:identifier|optional|optional-hover))=(?P<identifier>{_ATTR_VALUE})"
    rf"(?: class=(?P<class>{_ATTR_VALUE}))?(?P<attrs> [^<>]+)?>(?P<title>.*?)</span>",
    flags=re.DOTALL,
)
"""Deprecated. Use [`AUTOREF_RE`][mkdocs_autorefs.references.AUTOREF_RE] instead.

Matches the legacy `<span data-autorefs-...>` markers and exposes the
`kind`, `identifier`, `class`, `attrs` and `title` groups.
"""

AUTOREF_RE = re.compile(r"<autoref (?P<attrs>.*?)>(?P<title>.*?)</autoref>", flags=re.DOTALL)
"""The autoref HTML tag regular expression.

A regular expression to match mkdocs-autorefs' special reference markers
in the [`on_post_page` hook][mkdocs_autorefs.plugin.AutorefsPlugin.on_post_page].
It exposes the raw attribute string as `attrs` and the inner HTML as `title`.
"""
class AutorefsHookInterface(ABC):
    """Abstract contract for objects that hook into how inline references are turned into auto-references.

    Implementations provide two things: a way to expand an identifier,
    and the context in which the reference currently being processed appears.
    """

    @dataclass
    class Context:
        """The context around an auto-reference."""

        domain: str
        role: str
        origin: str
        filepath: str | Path
        lineno: int

        def as_dict(self) -> dict[str, str]:
            """Convert the context to a dictionary of HTML attributes.

            Returns:
                A mapping of attribute names to values, where `filepath` and `lineno` are converted to strings.
            """
            attributes = dict(domain=self.domain, role=self.role, origin=self.origin)
            attributes["filepath"] = str(self.filepath)
            attributes["lineno"] = str(self.lineno)
            return attributes

    @abstractmethod
    def expand_identifier(self, identifier: str) -> str:
        """Expand an identifier in a given context.

        Parameters:
            identifier: The identifier to expand.

        Returns:
            The expanded identifier.
        """
        raise NotImplementedError

    @abstractmethod
    def get_context(self) -> AutorefsHookInterface.Context:
        """Get the current context.

        Returns:
            The current context.
        """
        raise NotImplementedError
class AutorefsInlineProcessor(ReferenceInlineProcessor):
    """A Markdown inline processor that turns `[title][identifier]` references into `autoref` elements."""

    name: str = "mkdocs-autorefs"
    hook: AutorefsHookInterface | None = None

    def __init__(self, *args: Any, **kwargs: Any) -> None:  # noqa: D107
        super().__init__(REFERENCE_RE, *args, **kwargs)

    # Code based on
    # https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780
    def handleMatch(self, m: Match[str], data: str) -> tuple[Element | None, int | None, int | None]:  # type: ignore[override]  # noqa: N802
        """Handle an element that matched.

        Arguments:
            m: The match object.
            data: The matched data.

        Returns:
            A tuple `(element, start, end)`. It is `(None, None, None)` when the text or identifier
            could not be read, and `(None, start, end)` when an exact identifier contains control characters.
            Otherwise, `element` is the new `autoref` element.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        identifier, slug, end, handled = self._eval_id(data, index, text)
        if not handled or identifier is None:
            return None, None, None

        if slug is None and re.search(r"[\x00-\x1f]", identifier):
            # Do nothing if the matched reference contains control characters (from 0 to 31 included).
            # Specifically `\x01` is used by Python-Markdown HTML stash when there's inline formatting,
            # but references with Markdown formatting are not possible anyway.
            return None, m.start(0), end

        return self._make_tag(identifier, text, slug=slug), m.start(0), end

    def _unstash(self, identifier: str) -> str:
        """Replace inline stash placeholders found in `identifier` by a textual form of the stashed node.

        Arguments:
            identifier: A string possibly containing inline placeholders.

        Returns:
            The string with stashed elements rendered as their text wrapped in backticks,
            and other stashed values rendered with `str()`.
        """
        stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]

        def _repl(match: Match) -> str:
            el = stashed_nodes.get(match[1])
            if isinstance(el, Element):
                return f"`{''.join(el.itertext())}`"
            # 96 is the code point of a backtick; presumably this is Python-Markdown's
            # marker for an escaped backtick - confirm against `markdown.util`.
            if el == "\x0296\x03":
                return "`"
            return str(el)

        return INLINE_PLACEHOLDER_RE.sub(_repl, identifier)

    def _eval_id(self, data: str, index: int, text: str) -> tuple[str | None, str | None, int, bool]:
        """Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`.

        Arguments:
            data: The data to evaluate.
            index: The starting position.
            text: The text to use when no identifier.

        Returns:
            A tuple containing the identifier, its optional slug, its end position, and whether it matched.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, None, index, False

        # No slug means "match the identifier exactly (later)". This is the default,
        # so that every path below leaves `slug` bound.
        slug: str | None = None

        identifier = m.group(1)
        if not identifier:
            # Only a title was provided, use it as identifier.
            identifier = text

            # Catch single stash entries, like the result of [`Foo`][].
            code_el: Element | None = None
            if match := INLINE_PLACEHOLDER_RE.fullmatch(identifier):
                stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]
                el = stashed_nodes.get(match[1])
                if isinstance(el, Element) and el.tag == "code":
                    code_el = el

            if code_el is not None:
                # The title was wrapped in backticks, we only keep the content,
                # and tell autorefs to match the identifier exactly.
                identifier = "".join(code_el.itertext())
                # Special case: allow pymdownx.inlinehilite raw <code> snippets but strip them back to unhighlighted.
                if match := HTML_PLACEHOLDER_RE.fullmatch(identifier):
                    stash_index = int(match.group(1))
                    html = self.md.htmlStash.rawHtmlBlocks[stash_index]
                    identifier = Markup(html).striptags()
                    self.md.htmlStash.rawHtmlBlocks[stash_index] = escape(identifier)
            else:
                # In any other case, unstash the title and slugify it.
                # Examples: ``[`Foo` and `Bar`]``, `[The *Foo*][]`.
                # This also covers a single stash entry that is not a `code` element,
                # which used to fall through without ever assigning `slug`.
                identifier = self._unstash(identifier)
                slug = slugify(identifier, separator="-")

        end = m.end(0)
        return identifier, slug, end, True

    def _make_tag(self, identifier: str, text: str, *, slug: str | None = None) -> Element:
        """Create an `autoref` element, which serializes to a tag that can be matched by `AUTOREF_RE`.

        When a hook is set, the identifier is first expanded by it and the hook's
        current context is added to the element as attributes.

        Arguments:
            identifier: The identifier to use in the HTML property.
            text: The text to use in the HTML tag.
            slug: An optional slug, stored in the `slug` attribute when truthy.

        Returns:
            A new element.
        """
        el = Element("autoref")
        if self.hook:
            identifier = self.hook.expand_identifier(identifier)
            el.attrib.update(self.hook.get_context().as_dict())
        el.set("identifier", identifier)
        el.text = text
        if slug:
            el.attrib["slug"] = slug
        return el
def relative_url(url_a: str, url_b: str) -> str:
    """Compute the relative path from URL A to URL B.

    URL B may carry an anchor (`path#anchor`); the anchor is preserved in the result.
    A URL B without any `#` is accepted too, in which case the result has no anchor.

    Arguments:
        url_a: URL A.
        url_b: URL B.

    Returns:
        The relative URL to go from A to B.
    """
    parts_a = url_a.split("/")
    # `partition` tolerates a missing anchor, whereas unpacking `split("#", 1)` raises `ValueError`.
    path_b, hash_sep, anchor = url_b.partition("#")
    parts_b = path_b.split("/")

    # count common left parts
    common = 0
    for part_a, part_b in zip(parts_a, parts_b):
        if part_a != part_b:
            break
        common += 1

    # go up as many times as remaining a parts' depth
    # (a negative count yields no `..` at all, since a list times a negative number is empty)
    levels = len(parts_a) - common - 1
    parts_relative = [".."] * levels + parts_b[common:]
    relative = "/".join(parts_relative)
    return f"{relative}{hash_sep}{anchor}"
260# YORE: Bump 2: Remove block.
261def _legacy_fix_ref(
262 url_mapper: Callable[[str], str],
263 unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
264) -> Callable:
265 """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).
267 In our context, we match Markdown references and replace them with HTML links.
269 When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
270 `unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
271 and rendered, making it impossible to link to it. We catch this exception in the caller to issue a warning.
273 Arguments:
274 url_mapper: A callable that gets an object's site URL by its identifier,
275 such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
276 unmapped: A list to store unmapped identifiers.
278 Returns:
279 The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
280 and returning the replacement strings.
281 """
283 def inner(match: Match) -> str:
284 identifier = match["identifier"].strip('"')
285 title = match["title"]
286 kind = match["kind"]
287 attrs = match["attrs"] or ""
288 classes = (match["class"] or "").strip('"').split()
290 try:
291 url = url_mapper(unescape(identifier))
292 except KeyError:
293 if kind == "autorefs-optional":
294 return title
295 if kind == "autorefs-optional-hover":
296 return f'<span title="{identifier}">{title}</span>'
297 unmapped.append((identifier, None))
298 if title == identifier: 298 ↛ 299line 298 didn't jump to line 299 because the condition on line 298 was never true
299 return f"[{identifier}][]"
300 return f"[{title}][{identifier}]"
302 warnings.warn(
303 "autorefs `span` elements are deprecated in favor of `autoref` elements: "
304 f'`<span data-autorefs-identifier="{identifier}">...</span>` becomes `<autoref identifer="{identifier}">...</autoref>`',
305 DeprecationWarning,
306 stacklevel=1,
307 )
308 parsed = urlsplit(url)
309 external = parsed.scheme or parsed.netloc
310 classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
311 class_attr = " ".join(classes)
312 if kind == "autorefs-optional-hover":
313 return f'<a class="{class_attr}" title="{identifier}" href="{escape(url)}"{attrs}>{title}</a>'
314 return f'<a class="{class_attr}" href="{escape(url)}"{attrs}>{title}</a>'
316 return inner
319class _AutorefsAttrs(dict):
320 _handled_attrs: ClassVar[set[str]] = {
321 "identifier",
322 "optional",
323 "hover",
324 "class",
325 "domain",
326 "role",
327 "origin",
328 "filepath",
329 "lineno",
330 "slug",
331 }
333 @property
334 def context(self) -> AutorefsHookInterface.Context | None:
335 try:
336 return AutorefsHookInterface.Context(
337 domain=self["domain"],
338 role=self["role"],
339 origin=self["origin"],
340 filepath=self["filepath"],
341 lineno=int(self["lineno"]),
342 )
343 except KeyError:
344 return None
346 @property
347 def remaining(self) -> str:
348 return " ".join(k if v is None else f'{k}="{v}"' for k, v in self.items() if k not in self._handled_attrs)
351class _HTMLAttrsParser(HTMLParser):
352 def __init__(self):
353 super().__init__()
354 self.attrs = {}
356 def parse(self, html: str) -> _AutorefsAttrs:
357 self.attrs.clear()
358 self.feed(html)
359 return _AutorefsAttrs(self.attrs)
361 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: # noqa: ARG002
362 self.attrs.update(attrs)
365_html_attrs_parser = _HTMLAttrsParser()
368def _find_url(identifiers: Iterable[str], url_mapper: Callable[[str], str]) -> str:
369 for identifier in identifiers:
370 try:
371 return url_mapper(identifier)
372 except KeyError:
373 pass
374 raise KeyError(f"None of the identifiers {identifiers} were found")
def fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    In our context, we match `autoref` elements and replace them with HTML links.

    When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
    `unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
    and rendered, making it impossible to link to it. Optional references are not reported:
    they are replaced by their title (wrapped in a `span` with a `title` attribute when `hover` is set).

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
        unmapped: A list to store unmapped identifiers (with their optional context).

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        title = match["title"]
        attrs = _html_attrs_parser.parse(f"<a {match['attrs']}>")
        identifier: str = attrs["identifier"]
        slug = attrs.get("slug", None)
        optional = "optional" in attrs
        hover = "hover" in attrs

        # The exact identifier is tried first, then the slug (when there is one).
        identifiers = (identifier, slug) if slug else (identifier,)

        try:
            url = _find_url(identifiers, url_mapper)
        except KeyError:
            if optional:
                log.debug("Unresolved optional cross-reference: %s", identifier)
                if hover:
                    # The HTML parser decoded character references in the attribute value,
                    # so the identifier must be escaped again before going back into an attribute.
                    return f'<span title="{escape(identifier)}">{title}</span>'
                return title
            unmapped.append((identifier, attrs.context))
            if title == identifier:
                return f"[{identifier}][]"
            if title == f"<code>{identifier}</code>" and not slug:
                return f"[<code>{identifier}</code>][]"
            return f"[{title}][{identifier}]"

        parsed = urlsplit(url)
        external = parsed.scheme or parsed.netloc
        classes = (attrs.get("class") or "").strip().split()
        classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
        class_attr = " ".join(classes)
        if remaining := attrs.remaining:
            remaining = f" {remaining}"
        if optional and hover:
            # Same as above: re-escape the decoded identifier for use as an attribute value.
            return f'<a class="{class_attr}" title="{escape(identifier)}" href="{escape(url)}"{remaining}>{title}</a>'
        return f'<a class="{class_attr}" href="{escape(url)}"{remaining}>{title}</a>'

    return inner
438# YORE: Bump 2: Replace `, *, _legacy_refs: bool = True` with `` within line.
def fix_refs(
    html: str,
    url_mapper: Callable[[str], str],
    *,
    _legacy_refs: bool = True,
) -> tuple[str, list[tuple[str, AutorefsHookInterface.Context | None]]]:
    """Fix all references in the given HTML text.

    `autoref` elements are replaced first; unless disabled,
    the deprecated `span`-based markers are then replaced as well.

    Arguments:
        html: The text to fix.
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].

    Returns:
        The fixed HTML, and a list of unmapped identifiers (string and optional context).
    """
    unresolved: list[tuple[str, AutorefsHookInterface.Context | None]] = []
    fixed = AUTOREF_RE.sub(fix_ref(url_mapper, unresolved), html)

    # YORE: Bump 2: Remove block.
    if _legacy_refs:
        fixed = AUTO_REF_RE.sub(_legacy_fix_ref(url_mapper, unresolved), fixed)

    return fixed, unresolved
class AnchorScannerTreeProcessor(Treeprocessor):
    """Tree processor to scan and register HTML anchors."""

    name: str = "mkdocs-autorefs-anchors-scanner"
    _htags: ClassVar[set[str]] = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def __init__(self, plugin: AutorefsPlugin, md: Markdown | None = None) -> None:
        """Initialize the tree processor.

        Parameters:
            plugin: A reference to the autorefs plugin, to use its `register_anchor` method.
            md: The Markdown instance, passed to the base class.
        """
        super().__init__(md)
        self.plugin = plugin

    def run(self, root: Element) -> None:  # noqa: D102
        if self.plugin.current_page is None:
            return
        collected = _PendingAnchors(self.plugin, self.plugin.current_page)
        self._scan_anchors(root, collected)
        collected.flush()

    @staticmethod
    def _followed_by_text(el: Element) -> bool:
        """Tell whether non-whitespace text directly follows the element."""
        return bool(el.tail and el.tail.strip())

    def _scan_anchors(self, parent: Element, pending_anchors: _PendingAnchors) -> None:
        """Walk the children of `parent`, collecting anchor ids and flushing them at each interruption."""
        for child in parent:
            tag = child.tag

            if tag == "a":
                # We found an anchor. Record its id if it has one.
                anchor_id = child.get("id")
                if anchor_id:
                    pending_anchors.append(anchor_id)
                # If the element has text or a link, it's not an alias.
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if child.text or child.get("href") or self._followed_by_text(child):
                    pending_anchors.flush()

            elif tag == "p":
                # A `p` tag is a no-op for our purposes, just recurse into it in the context
                # of the current collection of anchors.
                self._scan_anchors(child, pending_anchors)
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if self._followed_by_text(child):
                    pending_anchors.flush()

            elif tag in self._htags:
                # If the element is a heading, that turns the pending anchors into aliases.
                pending_anchors.flush(child.get("id"))

            else:
                # But if it's some other interruption, flush anchors anyway as non-aliases.
                pending_anchors.flush()
                # Recurse into sub-elements, in a *separate* context.
                self.run(child)
516class _PendingAnchors:
517 """A collection of HTML anchors that may or may not become aliased to an upcoming heading."""
519 def __init__(self, plugin: AutorefsPlugin, current_page: str):
520 self.plugin = plugin
521 self.current_page = current_page
522 self.anchors: list[str] = []
524 def append(self, anchor: str) -> None:
525 self.anchors.append(anchor)
527 def flush(self, alias_to: str | None = None) -> None:
528 for anchor in self.anchors:
529 self.plugin.register_anchor(self.current_page, anchor, alias_to, primary=True)
530 self.anchors.clear()
@lru_cache
def _log_enabling_markdown_anchors() -> None:
    """Log that the Markdown anchors feature is being enabled.

    The call is cached, so the message is emitted once per cache lifetime rather than on every call.
    """
    log.debug("Enabling Markdown anchors feature")
class AutorefsExtension(Extension):
    """Markdown extension that transforms unresolved references into auto-references.

    Auto-references are then resolved later by the MkDocs plugin.

    This extension also scans Markdown anchors (`[](){#some-id}`)
    to register them with the MkDocs plugin.
    """

    def __init__(
        self,
        plugin: AutorefsPlugin | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the Markdown extension.

        Parameters:
            plugin: An optional reference to the autorefs plugin (to pass it to the anchor scanner tree processor).
            **kwargs: Keyword arguments passed to the [base constructor][markdown.extensions.Extension].
        """
        super().__init__(**kwargs)
        self.plugin = plugin

    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802 (casing: parent method's name)
        """Register the extension.

        Add an instance of our [`AutorefsInlineProcessor`][mkdocs_autorefs.references.AutorefsInlineProcessor] to the Markdown parser.
        Also add an instance of our [`AnchorScannerTreeProcessor`][mkdocs_autorefs.references.AnchorScannerTreeProcessor]
        to the Markdown parser, but only if a reference to the autorefs plugin was passed to this extension,
        the plugin's `scan_toc` is truthy and an `attr_list` tree processor is registered.

        Arguments:
            md: A `markdown.Markdown` instance.
        """
        inline_processor = AutorefsInlineProcessor(md)
        md.inlinePatterns.register(
            inline_processor,
            AutorefsInlineProcessor.name,
            priority=168,  # Right after markdown.inlinepatterns.ReferenceInlineProcessor
        )

        plugin = self.plugin
        if plugin is None or not plugin.scan_toc or "attr_list" not in md.treeprocessors:
            return

        _log_enabling_markdown_anchors()
        md.treeprocessors.register(
            AnchorScannerTreeProcessor(plugin, md),
            AnchorScannerTreeProcessor.name,
            priority=0,
        )