Coverage for src/mkdocs_autorefs/references.py: 97.11%
235 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-01 20:28 +0200
1"""Cross-references module."""
3from __future__ import annotations
5import logging
6import re
7import warnings
8from abc import ABC, abstractmethod
9from dataclasses import dataclass
10from functools import lru_cache
11from html import escape, unescape
12from html.parser import HTMLParser
13from typing import TYPE_CHECKING, Any, Callable, ClassVar, Match
14from urllib.parse import urlsplit
15from xml.etree.ElementTree import Element
17import markupsafe
18from markdown.core import Markdown
19from markdown.extensions import Extension
20from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor
21from markdown.treeprocessors import Treeprocessor
22from markdown.util import HTML_PLACEHOLDER_RE, INLINE_PLACEHOLDER_RE
24if TYPE_CHECKING:
25 from pathlib import Path
27 from markdown import Markdown
29 from mkdocs_autorefs.plugin import AutorefsPlugin
# Module logger: use the MkDocs plugin logger factory when it can be imported,
# otherwise fall back to a standard-library logger.
try:
    from mkdocs.plugins import get_plugin_logger
except ImportError:
    # TODO: remove once support for MkDocs <1.5 is dropped
    log = logging.getLogger(f"mkdocs.plugins.{__name__}")  # type: ignore[assignment]
else:
    log = get_plugin_logger(__name__)
# YORE: Bump 2: Remove block.
def __getattr__(name: str) -> Any:
    """Resolve module attributes that are not defined normally.

    Only the deprecated `AutoRefInlineProcessor` name is served: it emits a
    `DeprecationWarning` and returns `AutorefsInlineProcessor`.

    Parameters:
        name: The attribute name being looked up on the module.

    Returns:
        The `AutorefsInlineProcessor` class, for the deprecated name.

    Raises:
        AttributeError: For any other name.
    """
    if name != "AutoRefInlineProcessor":
        raise AttributeError(f"module 'mkdocs_autorefs.references' has no attribute {name}")

    # `stacklevel=2` attributes the warning to the code accessing the old name.
    warnings.warn("AutoRefInlineProcessor was renamed AutorefsInlineProcessor", DeprecationWarning, stacklevel=2)
    return AutorefsInlineProcessor
# One HTML attribute value, possibly with double quotes around.
_ATTR_VALUE = r'"[^"<>]+"|[^"<> ]+'

# YORE: Bump 2: Remove block.
AUTO_REF_RE = re.compile(
    # One fragment per capture group; the concatenation is the full pattern.
    r"<span data-(?P<kind>autorefs-(?:identifier|optional|optional-hover))"
    rf"=(?P<identifier>{_ATTR_VALUE})"
    rf"(?: class=(?P<class>{_ATTR_VALUE}))?"
    r"(?P<attrs> [^<>]+)?"
    r">(?P<title>.*?)</span>",
    re.DOTALL,
)
"""Deprecated. Use [`AUTOREF_RE`][mkdocs_autorefs.references.AUTOREF_RE] instead."""

AUTOREF_RE = re.compile(r"<autoref (?P<attrs>.*?)>(?P<title>.*?)</autoref>", re.DOTALL)
"""The autoref HTML tag regular expression.

A regular expression to match mkdocs-autorefs' special reference markers
in the [`on_post_page` hook][mkdocs_autorefs.plugin.AutorefsPlugin.on_post_page].
"""
class AutorefsHookInterface(ABC):
    """Abstract interface for objects customizing how inline references are handled."""

    @dataclass
    class Context:
        """Information describing where an auto-reference comes from."""

        domain: str
        role: str
        origin: str
        filepath: str | Path
        lineno: int

        def as_dict(self) -> dict[str, str]:
            """Return the context as a mapping of HTML attribute names to values.

            `filepath` and `lineno` are converted to strings; the other fields are passed through.
            """
            return dict(
                domain=self.domain,
                role=self.role,
                origin=self.origin,
                filepath=str(self.filepath),
                lineno=str(self.lineno),
            )

    @abstractmethod
    def expand_identifier(self, identifier: str) -> str:
        """Expand an identifier in a given context.

        Parameters:
            identifier: The identifier to expand.

        Returns:
            The expanded identifier.
        """
        raise NotImplementedError

    @abstractmethod
    def get_context(self) -> AutorefsHookInterface.Context:
        """Get the current context.

        Returns:
            The current context.
        """
        raise NotImplementedError
class AutorefsInlineProcessor(ReferenceInlineProcessor):
    """Inline processor that turns `[title][identifier]` references into `autoref` elements."""

    name: str = "mkdocs-autorefs"
    hook: AutorefsHookInterface | None = None

    def __init__(self, *args: Any, **kwargs: Any) -> None:  # noqa: D107
        super().__init__(REFERENCE_RE, *args, **kwargs)

    # Code based on
    # https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780

    def handleMatch(self, m: Match[str], data: str) -> tuple[Element | None, int | None, int | None]:  # type: ignore[override]  # noqa: N802
        """Handle an element that matched.

        Arguments:
            m: The match object.
            data: The matched data.

        Returns:
            A tuple of the new element (or `None`) and the start and end positions
            of the handled text (both `None` when nothing was handled).
        """
        text, index, text_found = self.getText(data, m.end(0))
        if not text_found:
            return None, None, None

        identifier, end, id_found = self.evalId(data, index, text)
        if identifier is None or not id_found:
            return None, None, None

        start = m.start(0)
        if re.search(r"[\x00-\x1f]", identifier) is not None:
            # Do nothing if the matched reference contains control characters (from 0 to 31 included).
            # Specifically `\x01` is used by Python-Markdown HTML stash when there's inline formatting,
            # but references with Markdown formatting are not possible anyway.
            return None, start, end

        return self._make_tag(identifier, text), start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:  # noqa: N802 (parent's casing)
        """Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`.

        Arguments:
            data: The data to evaluate.
            index: The starting position.
            text: The text to use when no identifier.

        Returns:
            A tuple containing the identifier, its end position, and whether it matched.
        """
        link = self.RE_LINK.match(data, pos=index)
        if link is None:
            return None, index, False

        end = link.end(0)
        explicit_identifier = link.group(1)
        if explicit_identifier:
            return explicit_identifier, end, True

        # No explicit identifier: fall back to the reference text.
        identifier = text
        # Allow the entire content to be one placeholder, with the intent of catching things like [`Foo`][].
        # It doesn't catch [*Foo*][] though, just due to the priority order.
        # https://github.com/Python-Markdown/markdown/blob/1858c1b601ead62ed49646ae0d99298f41b1a271/markdown/inlinepatterns.py#L78
        inline_placeholder = INLINE_PLACEHOLDER_RE.fullmatch(identifier)
        if inline_placeholder:
            stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]
            node = stashed_nodes.get(inline_placeholder[1])
            if isinstance(node, Element) and node.tag == "code":
                identifier = "".join(node.itertext())
                # Special case: allow pymdownx.inlinehilite raw <code> snippets but strip them back to unhighlighted.
                html_placeholder = HTML_PLACEHOLDER_RE.fullmatch(identifier)
                if html_placeholder:
                    stash_index = int(html_placeholder.group(1))
                    raw_blocks = self.md.htmlStash.rawHtmlBlocks
                    identifier = markupsafe.Markup(raw_blocks[stash_index]).striptags()
                    raw_blocks[stash_index] = escape(identifier)

        return identifier, end, True

    def _make_tag(self, identifier: str, text: str) -> Element:
        """Create the `autoref` element for a reference.

        When a hook is set, the identifier is expanded through it and the hook's
        context is stored as attributes on the element.

        Arguments:
            identifier: The identifier to store in the `identifier` attribute.
            text: The text to use inside the element.

        Returns:
            A new element.
        """
        element = Element("autoref")
        hook = self.hook
        if hook:
            identifier = hook.expand_identifier(identifier)
            element.attrib.update(hook.get_context().as_dict())
        element.set("identifier", identifier)
        element.text = text
        return element
def relative_url(url_a: str, url_b: str) -> str:
    """Compute the relative path from URL A to URL B.

    URL B may carry a `#anchor` fragment, which is kept on the result.
    A URL B without any fragment is accepted too, and yields a result without fragment.

    Arguments:
        url_a: URL A.
        url_b: URL B.

    Returns:
        The relative URL to go from A to B.
    """
    parts_a = url_a.split("/")
    # `partition` never fails: without a `#`, both the separator and the anchor are empty.
    path_b, separator, anchor = url_b.partition("#")
    parts_b = path_b.split("/")

    # Count the leading path components shared by both URLs.
    common = 0
    for part_a, part_b in zip(parts_a, parts_b):
        if part_a != part_b:
            break
        common += 1

    # Go up as many times as the remaining depth of A (a non-positive count yields no `..`).
    levels = len(parts_a) - common - 1
    parts_relative = [".."] * levels + parts_b[common:]
    relative = "/".join(parts_relative)
    return f"{relative}{separator}{anchor}"
# YORE: Bump 2: Remove block.
def _legacy_fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    In our context, we match legacy `<span data-autorefs-...>` markers and replace them with HTML links.

    When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
    `unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
    and rendered, making it impossible to link to it.

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
            It must raise `KeyError` for unknown identifiers.
        unmapped: A list to store unmapped identifiers.

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        identifier = match["identifier"].strip('"')
        title = match["title"]
        kind = match["kind"]
        attrs = match["attrs"] or ""
        classes = (match["class"] or "").strip('"').split()

        try:
            url = url_mapper(unescape(identifier))
        except KeyError:
            # Optional references degrade silently: plain title, or a span with a hover title.
            if kind == "autorefs-optional":
                return title
            if kind == "autorefs-optional-hover":
                return f'<span title="{identifier}">{title}</span>'
            # Mandatory reference: record it and restore the Markdown reference syntax.
            unmapped.append((identifier, None))
            if title == identifier:
                return f"[{identifier}][]"
            return f"[{title}][{identifier}]"

        # The replacement attribute is spelled `identifier`, matching what the new `autoref` handling reads.
        warnings.warn(
            "autorefs `span` elements are deprecated in favor of `autoref` elements: "
            f'`<span data-autorefs-identifier="{identifier}">...</span>` becomes `<autoref identifier="{identifier}">...</autoref>`',
            DeprecationWarning,
            stacklevel=1,
        )
        parsed = urlsplit(url)
        # A URL with a scheme or a network location points outside of the site.
        external = parsed.scheme or parsed.netloc
        classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
        class_attr = " ".join(classes)
        if kind == "autorefs-optional-hover":
            return f'<a class="{class_attr}" title="{identifier}" href="{escape(url)}"{attrs}>{title}</a>'
        return f'<a class="{class_attr}" href="{escape(url)}"{attrs}>{title}</a>'

    return inner
class _AutorefsAttrs(dict):
    """Attributes parsed from an `autoref` tag, with helpers for the ones autorefs handles itself."""

    # Attributes consumed by autorefs, never copied as-is onto the generated link.
    _handled_attrs: ClassVar[set[str]] = {
        "identifier",
        "optional",
        "hover",
        "class",
        "domain",
        "role",
        "origin",
        "filepath",
        "lineno",
    }

    @property
    def context(self) -> AutorefsHookInterface.Context | None:
        """The hook context rebuilt from the attributes, or `None` when a context attribute is missing."""
        try:
            return AutorefsHookInterface.Context(
                domain=self["domain"],
                role=self["role"],
                origin=self["origin"],
                filepath=self["filepath"],
                lineno=int(self["lineno"]),
            )
        except KeyError:
            return None

    @property
    def remaining(self) -> str:
        """The attributes not handled by autorefs, serialized back to HTML.

        Valueless attributes are emitted as a bare name. Values are escaped again,
        because the HTML parser hands them over unescaped: writing them back verbatim
        would break the markup for values containing `"` or `&`.
        """
        return " ".join(
            k if v is None else f'{k}="{escape(v)}"' for k, v in self.items() if k not in self._handled_attrs
        )


class _HTMLAttrsParser(HTMLParser):
    """HTML parser collecting the attributes of the start tags it is fed."""

    def __init__(self) -> None:
        super().__init__()
        self.attrs: dict[str, str | None] = {}

    def parse(self, html: str) -> _AutorefsAttrs:
        """Parse the given HTML and return the attributes that were found.

        The result is a copy, independent of later calls.
        """
        # The same instance serves every call: discard input left unprocessed by a previous call
        # (for example an unterminated tag), otherwise it would be prepended to this one.
        self.reset()
        self.attrs.clear()
        self.feed(html)
        return _AutorefsAttrs(self.attrs)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:  # noqa: ARG002
        self.attrs.update(attrs)


_html_attrs_parser = _HTMLAttrsParser()


def fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    In our context, we match `autoref` elements and replace them with HTML links.

    When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
    `unmapped` list. It generally means the user is trying to cross-reference an object that was not collected
    and rendered, making it impossible to link to it.

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
            It must raise `KeyError` for unknown identifiers.
        unmapped: A list to store unmapped identifiers (with their context, when available).

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        title = match["title"]
        # Wrap the raw attributes in a dummy tag so that the HTML parser can read them.
        attrs = _html_attrs_parser.parse(f"<a {match['attrs']}>")
        identifier: str = attrs["identifier"]
        optional = "optional" in attrs
        hover = "hover" in attrs

        try:
            url = url_mapper(unescape(identifier))
        except KeyError:
            if optional:
                if hover:
                    # The identifier was unescaped by the parser: escape it for the attribute.
                    return f'<span title="{escape(identifier)}">{title}</span>'
                return title
            # Mandatory reference: record it and restore the Markdown reference syntax.
            unmapped.append((identifier, attrs.context))
            if title == identifier:
                return f"[{identifier}][]"
            return f"[{title}][{identifier}]"

        parsed = urlsplit(url)
        # A URL with a scheme or a network location points outside of the site.
        external = parsed.scheme or parsed.netloc
        classes = (attrs.get("class") or "").strip().split()
        classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
        class_attr = " ".join(classes)
        if remaining := attrs.remaining:
            remaining = f" {remaining}"
        if optional and hover:
            return (
                f'<a class="{class_attr}" title="{escape(identifier)}" href="{escape(url)}"{remaining}>{title}</a>'
            )
        return f'<a class="{class_attr}" href="{escape(url)}"{remaining}>{title}</a>'

    return inner
# YORE: Bump 2: Replace `, *, _legacy_refs: bool = True` with `` within line.
def fix_refs(
    html: str,
    url_mapper: Callable[[str], str],
    *,
    _legacy_refs: bool = True,
) -> tuple[str, list[tuple[str, AutorefsHookInterface.Context | None]]]:
    """Replace every reference marker found in the given HTML text.

    `autoref` elements are processed first; legacy `span` markers are processed
    afterwards, unless `_legacy_refs` is false.

    Arguments:
        html: The text to fix.
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].

    Returns:
        The fixed HTML, and a list of unmapped identifiers (string and optional context).
    """
    unresolved: list[tuple[str, AutorefsHookInterface.Context | None]] = []
    fixed = AUTOREF_RE.sub(fix_ref(url_mapper, unresolved), html)

    # YORE: Bump 2: Remove block.
    if _legacy_refs:
        fixed = AUTO_REF_RE.sub(_legacy_fix_ref(url_mapper, unresolved), fixed)

    return fixed, unresolved
class AnchorScannerTreeProcessor(Treeprocessor):
    """Tree processor that walks the document and registers HTML anchors with the plugin."""

    name: str = "mkdocs-autorefs-anchors-scanner"
    _htags: ClassVar[set[str]] = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def __init__(self, plugin: AutorefsPlugin, md: Markdown | None = None) -> None:
        """Initialize the tree processor.

        Parameters:
            plugin: A reference to the autorefs plugin, to use its `register_anchor` method.
            md: The Markdown instance, passed to the base class.
        """
        super().__init__(md)
        self.plugin = plugin

    def run(self, root: Element) -> None:  # noqa: D102
        if self.plugin.current_page is None:
            return
        collected = _PendingAnchors(self.plugin, self.plugin.current_page)
        self._scan_anchors(root, collected)
        collected.flush()

    def _scan_anchors(self, parent: Element, pending_anchors: _PendingAnchors) -> None:
        """Scan the children of `parent`, collecting anchors into `pending_anchors`."""
        for child in parent:
            tag = child.tag

            if tag == "a":
                # We found an anchor. Record its id if it has one.
                anchor_id = child.get("id")
                if anchor_id:
                    pending_anchors.append(anchor_id)
                # If the element has text or a link, it's not an alias.
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if child.text or child.get("href") or (child.tail and child.tail.strip()):
                    pending_anchors.flush()
                continue

            if tag == "p":
                # A `p` tag is a no-op for our purposes, just recurse into it in the context
                # of the current collection of anchors.
                self._scan_anchors(child, pending_anchors)
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if child.tail and child.tail.strip():
                    pending_anchors.flush()
                continue

            if tag in self._htags:
                # If the element is a heading, that turns the pending anchors into aliases.
                pending_anchors.flush(child.get("id"))
                continue

            # But if it's some other interruption, flush anchors anyway as non-aliases.
            pending_anchors.flush()
            # Recurse into sub-elements, in a *separate* context.
            self.run(child)
class _PendingAnchors:
    """Anchors collected on a page, waiting to be registered, possibly as aliases of a heading."""

    def __init__(self, plugin: AutorefsPlugin, current_page: str):
        self.plugin = plugin
        self.current_page = current_page
        self.anchors: list[str] = []

    def append(self, anchor: str) -> None:
        """Add an anchor to the pending collection."""
        self.anchors.append(anchor)

    def flush(self, alias_to: str | None = None) -> None:
        """Register every pending anchor with the plugin, then empty the collection.

        Parameters:
            alias_to: Passed along to the plugin's `register_anchor` for each anchor.
        """
        page = self.current_page
        register = self.plugin.register_anchor
        for pending in self.anchors:
            register(page, pending, alias_to)
        self.anchors.clear()
# The function takes no argument, so the cache holds a single entry:
# the message is logged on the first call only.
@lru_cache(maxsize=None)
def _log_enabling_markdown_anchors() -> None:
    """Log, once, that the Markdown anchors feature is being enabled."""
    log.debug("Enabling Markdown anchors feature")
class AutorefsExtension(Extension):
    """Markdown extension that transforms unresolved references into auto-references.

    Auto-references are then resolved later by the MkDocs plugin.

    This extension also scans Markdown anchors (`[](){#some-id}`)
    to register them with the MkDocs plugin.
    """

    def __init__(
        self,
        plugin: AutorefsPlugin | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the Markdown extension.

        Parameters:
            plugin: An optional reference to the autorefs plugin (to pass it to the anchor scanner tree processor).
            **kwargs: Keyword arguments passed to the [base constructor][markdown.extensions.Extension].
        """
        super().__init__(**kwargs)
        self.plugin = plugin

    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802 (casing: parent method's name)
        """Register the extension.

        Always adds an [`AutorefsInlineProcessor`][mkdocs_autorefs.references.AutorefsInlineProcessor]
        to the Markdown parser. Also adds an
        [`AnchorScannerTreeProcessor`][mkdocs_autorefs.references.AnchorScannerTreeProcessor]
        when a plugin was passed to this extension, its `scan_toc` is truthy,
        and an `attr_list` tree processor is registered.

        Arguments:
            md: A `markdown.Markdown` instance.
        """
        inline_processor = AutorefsInlineProcessor(md)
        md.inlinePatterns.register(
            inline_processor,
            AutorefsInlineProcessor.name,
            priority=168,  # Right after markdown.inlinepatterns.ReferenceInlineProcessor
        )

        plugin = self.plugin
        if plugin is None or not plugin.scan_toc or "attr_list" not in md.treeprocessors:
            return

        _log_enabling_markdown_anchors()
        anchor_scanner = AnchorScannerTreeProcessor(plugin, md)
        md.treeprocessors.register(
            anchor_scanner,
            AnchorScannerTreeProcessor.name,
            priority=0,
        )