Coverage for src/mkdocs_autorefs/references.py: 92.84%

263 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2025-01-10 16:32 +0100

1"""Cross-references module.""" 

2 

3from __future__ import annotations 

4 

5import logging 

6import re 

7import warnings 

8from abc import ABC, abstractmethod 

9from dataclasses import dataclass 

10from functools import lru_cache 

11from html import escape, unescape 

12from html.parser import HTMLParser 

13from typing import TYPE_CHECKING, Any, Callable, ClassVar 

14from urllib.parse import urlsplit 

15from xml.etree.ElementTree import Element 

16 

17from markdown.core import Markdown 

18from markdown.extensions import Extension 

19from markdown.extensions.toc import slugify 

20from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor 

21from markdown.treeprocessors import Treeprocessor 

22from markdown.util import HTML_PLACEHOLDER_RE, INLINE_PLACEHOLDER_RE 

23from markupsafe import Markup 

24 

25if TYPE_CHECKING: 

26 from collections.abc import Iterable 

27 from pathlib import Path 

28 from re import Match 

29 

30 from markdown import Markdown 

31 

32 from mkdocs_autorefs.plugin import AutorefsPlugin 

33 

# Module logger. Prefer the helper shipped by MkDocs when it can be imported,
# otherwise fall back to a standard-library logger with a hand-built name.
try:
    from mkdocs.plugins import get_plugin_logger

    log = get_plugin_logger(__name__)
except ImportError:
    # TODO: remove once support for MkDocs <1.5 is dropped
    # `get_plugin_logger` could not be imported: build the logger under the
    # `mkdocs.plugins.` namespace ourselves.
    log = logging.getLogger(f"mkdocs.plugins.{__name__}")  # type: ignore[assignment]

41 

42 

43# YORE: Bump 2: Remove block. 

def __getattr__(name: str) -> Any:
    """Serve deprecated module attributes on demand.

    Only the old `AutoRefInlineProcessor` name is supported: accessing it emits
    a `DeprecationWarning` and returns `AutorefsInlineProcessor`.

    Arguments:
        name: The attribute that was not found on the module.

    Returns:
        The object the deprecated name now maps to.

    Raises:
        AttributeError: When `name` is not a known deprecated attribute.
    """
    if name != "AutoRefInlineProcessor":
        raise AttributeError(f"module 'mkdocs_autorefs.references' has no attribute {name}")

    warnings.warn(
        "AutoRefInlineProcessor was renamed AutorefsInlineProcessor",
        DeprecationWarning,
        stacklevel=2,
    )
    return AutorefsInlineProcessor

49 

50 

# Pattern fragment for a single HTML attribute value, possibly with double quotes around.
_ATTR_VALUE = r'"[^"<>]+"|[^"<> ]+'

# YORE: Bump 2: Remove block.
AUTO_REF_RE = re.compile(
    rf"<span data-(?P<kind>autorefs-(?:identifier|optional|optional-hover))=(?P<identifier>{_ATTR_VALUE})"
    rf"(?: class=(?P<class>{_ATTR_VALUE}))?(?P<attrs> [^<>]+)?>(?P<title>.*?)</span>",
    re.DOTALL,
)
"""Deprecated. Use [`AUTOREF_RE`][mkdocs_autorefs.references.AUTOREF_RE] instead."""

AUTOREF_RE = re.compile(
    r"<autoref (?P<attrs>.*?)>(?P<title>.*?)</autoref>",
    re.DOTALL,
)
"""The autoref HTML tag regular expression.

It matches the special `<autoref ...>...</autoref>` markers left by mkdocs-autorefs,
capturing the raw attributes (`attrs`) and the inner content (`title`). It is used
in the [`on_post_page` hook][mkdocs_autorefs.plugin.AutorefsPlugin.on_post_page].
"""

67 

68 

class AutorefsHookInterface(ABC):
    """Abstract interface to customize how inline references are turned into autorefs.

    Implementations provide identifier expansion and the context
    (domain, role, origin, file path, line number) attached to each reference.
    """

    @dataclass
    class Context:
        """Where an auto-reference comes from."""

        domain: str
        role: str
        origin: str
        filepath: str | Path
        lineno: int

        def as_dict(self) -> dict[str, str]:
            """Serialize the context as HTML attributes.

            Returns:
                A mapping of attribute names to string values; `filepath` and
                `lineno` are converted with `str`.
            """
            return dict(
                domain=self.domain,
                role=self.role,
                origin=self.origin,
                filepath=str(self.filepath),
                lineno=str(self.lineno),
            )

    @abstractmethod
    def expand_identifier(self, identifier: str) -> str:
        """Expand an identifier within the current context.

        Parameters:
            identifier: The identifier as written in the reference.

        Returns:
            The expanded identifier.
        """
        raise NotImplementedError

    @abstractmethod
    def get_context(self) -> AutorefsHookInterface.Context:
        """Return the context of the reference being processed.

        Returns:
            The current context.
        """
        raise NotImplementedError

112 

113 

class AutorefsInlineProcessor(ReferenceInlineProcessor):
    """A Markdown extension to handle inline references.

    Matches `[title][identifier]` and `[title][]` references and converts them
    into `<autoref>` elements (see `_make_tag`).
    """

    name: str = "mkdocs-autorefs"
    hook: AutorefsHookInterface | None = None

    def __init__(self, *args: Any, **kwargs: Any) -> None:  # noqa: D107
        super().__init__(REFERENCE_RE, *args, **kwargs)

    # Code based on
    # https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780
    def handleMatch(self, m: Match[str], data: str) -> tuple[Element | None, int | None, int | None]:  # type: ignore[override]  # noqa: N802
        """Handle an element that matched.

        Arguments:
            m: The match object.
            data: The matched data.

        Returns:
            A tuple `(element, start, end)`. The element is `None` when the reference
            is not handled; `start` and `end` are `None` as well when nothing matched.
        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        identifier, slug, end, handled = self._eval_id(data, index, text)
        if not handled or identifier is None:
            return None, None, None

        if slug is None and re.search(r"[\x00-\x1f]", identifier):
            # Do nothing if the matched reference contains control characters (from 0 to 31 included).
            # Specifically `\x01` is used by Python-Markdown HTML stash when there's inline formatting,
            # but references with Markdown formatting are not possible anyway.
            return None, m.start(0), end

        return self._make_tag(identifier, text, slug=slug), m.start(0), end

    def _unstash(self, identifier: str) -> str:
        """Replace inline placeholders in `identifier` with a textual form of the stashed nodes.

        Stashed elements are rendered as their text wrapped in backticks, the stashed
        backtick escape becomes a literal backtick, anything else is converted with `str`.
        """
        stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]

        def _repl(match: Match) -> str:
            el = stashed_nodes.get(match[1])
            if isinstance(el, Element):
                return f"`{''.join(el.itertext())}`"
            if el == "\x0296\x03":
                return "`"
            return str(el)

        return INLINE_PLACEHOLDER_RE.sub(_repl, identifier)

    def _eval_id(self, data: str, index: int, text: str) -> tuple[str | None, str | None, int, bool]:
        """Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`.

        Arguments:
            data: The data to evaluate.
            index: The starting position.
            text: The text to use when no identifier.

        Returns:
            A tuple containing the identifier, its optional slug, its end position, and whether it matched.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, None, index, False

        # No slug means "match the identifier exactly (later)". Binding it up front
        # guarantees it is defined on every path, including when the title is a single
        # placeholder whose stash entry is not a `code` element (for instance a stashed
        # string): previously that path reached the `return` with `slug` unbound.
        slug: str | None = None

        identifier = m.group(1)
        if not identifier:
            # Only a title was provided, use it as identifier.
            identifier = text

            # Catch single stash entries, like the result of [`Foo`][].
            if match := INLINE_PLACEHOLDER_RE.fullmatch(identifier):
                stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]
                el = stashed_nodes.get(match[1])
                if isinstance(el, Element) and el.tag == "code":
                    # The title was wrapped in backticks, we only keep the content,
                    # and tell autorefs to match the identifier exactly (slug stays None).
                    identifier = "".join(el.itertext())
                    # Special case: allow pymdownx.inlinehilite raw <code> snippets but strip them back to unhighlighted.
                    if match := HTML_PLACEHOLDER_RE.fullmatch(identifier):
                        stash_index = int(match.group(1))
                        html = self.md.htmlStash.rawHtmlBlocks[stash_index]
                        identifier = Markup(html).striptags()
                        self.md.htmlStash.rawHtmlBlocks[stash_index] = escape(identifier)
                # Otherwise the identifier is left as the raw placeholder with no slug;
                # `handleMatch` skips such identifiers because they contain control characters.

            # In any other case, unstash the title and slugify it.
            # Examples: ``[`Foo` and `Bar`]``, `[The *Foo*][]`.
            else:
                identifier = self._unstash(identifier)
                slug = slugify(identifier, separator="-")

        end = m.end(0)
        return identifier, slug, end, True

    def _make_tag(self, identifier: str, text: str, *, slug: str | None = None) -> Element:
        """Create an `autoref` element that can be matched by `AUTOREF_RE` once serialized.

        Arguments:
            identifier: The identifier to use in the HTML property.
                It is first expanded by the hook, when one is set.
            text: The text to use in the HTML tag.
            slug: An optional slug, stored in the `slug` attribute when truthy.

        Returns:
            A new element.
        """
        el = Element("autoref")
        if self.hook:
            identifier = self.hook.expand_identifier(identifier)
            el.attrib.update(self.hook.get_context().as_dict())
        el.set("identifier", identifier)
        el.text = text
        if slug:
            el.attrib["slug"] = slug
        return el

232 

233 

def relative_url(url_a: str, url_b: str) -> str:
    """Compute the relative path from URL A to URL B.

    Arguments:
        url_a: URL A.
        url_b: URL B, optionally followed by `#anchor`.

    Returns:
        The relative URL to go from A to B. The `#anchor` suffix is kept when
        URL B has one; a URL B without `#` yields the bare relative path
        (previously this case raised `ValueError`).
    """
    parts_a = url_a.split("/")
    # `partition` never fails: `separator` is empty when URL B has no anchor.
    path_b, separator, anchor = url_b.partition("#")
    parts_b = path_b.split("/")

    # count common left parts
    common = 0
    for part_a, part_b in zip(parts_a, parts_b):
        if part_a != part_b:
            break
        common += 1

    # go up as many times as remaining a parts' depth
    # (a negative count multiplies to an empty list)
    levels = len(parts_a) - common - 1
    relative = "/".join([".."] * levels + parts_b[common:])
    return f"{relative}{separator}{anchor}"

258 

259 

260# YORE: Bump 2: Remove block. 

def _legacy_fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    In our context, we match legacy `<span data-autorefs-...>` markers and replace them with HTML links.

    When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
    `unmapped` list (with no context), unless the reference is optional. It generally means the user is trying
    to cross-reference an object that was not collected and rendered, making it impossible to link to it.

    Every successfully resolved legacy marker emits a `DeprecationWarning`.

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
            It must raise `KeyError` for unknown identifiers.
        unmapped: A list to store unmapped identifiers.

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        # Attribute values may or may not be wrapped in double quotes.
        identifier = match["identifier"].strip('"')
        title = match["title"]
        kind = match["kind"]
        attrs = match["attrs"] or ""
        classes = (match["class"] or "").strip('"').split()

        try:
            url = url_mapper(unescape(identifier))
        except KeyError:
            if kind == "autorefs-optional":
                return title
            if kind == "autorefs-optional-hover":
                return f'<span title="{identifier}">{title}</span>'
            unmapped.append((identifier, None))
            # Restore the original Markdown reference syntax.
            if title == identifier:
                return f"[{identifier}][]"
            return f"[{title}][{identifier}]"

        # The suggested replacement must spell the attribute `identifier`
        # (it used to read `identifer`, which is not a valid autoref attribute).
        warnings.warn(
            "autorefs `span` elements are deprecated in favor of `autoref` elements: "
            f'`<span data-autorefs-identifier="{identifier}">...</span>` becomes `<autoref identifier="{identifier}">...</autoref>`',
            DeprecationWarning,
            stacklevel=1,
        )
        parsed = urlsplit(url)
        # A URL with a scheme or a network location points outside of the site.
        external = parsed.scheme or parsed.netloc
        classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
        class_attr = " ".join(classes)
        if kind == "autorefs-optional-hover":
            return f'<a class="{class_attr}" title="{identifier}" href="{escape(url)}"{attrs}>{title}</a>'
        return f'<a class="{class_attr}" href="{escape(url)}"{attrs}>{title}</a>'

    return inner

317 

318 

class _AutorefsAttrs(dict):
    """Parsed attributes of an `autoref` element, as a dictionary.

    Values are expected to be unescaped text, or `None` for attributes without a value.
    """

    # Attributes consumed by autorefs itself; they are not forwarded to the final HTML.
    _handled_attrs: ClassVar[set[str]] = {
        "identifier",
        "optional",
        "hover",
        "class",
        "domain",
        "role",
        "origin",
        "filepath",
        "lineno",
        "slug",
    }

    @property
    def context(self) -> AutorefsHookInterface.Context | None:
        """The reference context, or `None` when any context attribute is missing."""
        try:
            return AutorefsHookInterface.Context(
                domain=self["domain"],
                role=self["role"],
                origin=self["origin"],
                filepath=self["filepath"],
                lineno=int(self["lineno"]),
            )
        except KeyError:
            return None

    @property
    def remaining(self) -> str:
        """The attributes not handled by autorefs, serialized as HTML.

        Values are HTML-escaped so that quotes, ampersands or angle brackets in a
        value cannot break the generated markup. Valueless attributes are emitted bare.
        """
        return " ".join(
            key if value is None else f'{key}="{escape(value)}"'
            for key, value in self.items()
            if key not in self._handled_attrs
        )

349 

350 

class _HTMLAttrsParser(HTMLParser):
    """HTML parser that only collects the attributes of start tags."""

    def __init__(self) -> None:
        super().__init__()
        self.attrs: dict[str, str | None] = {}

    def parse(self, html: str) -> _AutorefsAttrs:
        """Parse an HTML start tag and return its attributes.

        Arguments:
            html: The HTML to parse, typically a single start tag.

        Returns:
            A copy of the collected attributes.
        """
        # The instance is reused across calls: drop any unprocessed data left in the
        # parser buffer by a previous (e.g. incomplete) tag, so it cannot leak into this call.
        self.reset()
        self.attrs.clear()
        self.feed(html)
        return _AutorefsAttrs(self.attrs)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:  # noqa: ARG002
        self.attrs.update(attrs)


# Shared parser instance, reused for every parsed tag.
_html_attrs_parser = _HTMLAttrsParser()

366 

367 

def _find_url(identifiers: Iterable[str], url_mapper: Callable[[str], str]) -> str:
    """Return the URL of the first identifier known to `url_mapper`.

    Arguments:
        identifiers: Candidate identifiers, tried in order.
        url_mapper: A callable returning the URL of an identifier, raising `KeyError` when unknown.

    Returns:
        The URL of the first identifier that could be mapped.

    Raises:
        KeyError: When none of the identifiers could be mapped.
    """
    for candidate in identifiers:
        try:
            url = url_mapper(candidate)
        except KeyError:
            # Unknown identifier: try the next candidate.
            continue
        return url
    raise KeyError(f"None of the identifiers {identifiers} were found")

375 

376 

def fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    In our context, we match `<autoref>` markers and replace them with HTML links.

    When the matched reference's identifier was not mapped to an URL, we append the identifier
    (and its context, if any) to the outer `unmapped` list, unless the reference is optional.
    It generally means the user is trying to cross-reference an object that was not collected
    and rendered, making it impossible to link to it.

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
            It must raise `KeyError` for unknown identifiers.
        unmapped: A list to store unmapped identifiers.

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        title = match["title"]
        # Wrap the raw attributes in a dummy tag so the HTML parser can read them.
        attrs = _html_attrs_parser.parse(f"<a {match['attrs']}>")
        identifier: str = attrs["identifier"]
        slug = attrs.get("slug", None)
        optional = "optional" in attrs
        hover = "hover" in attrs

        # The exact identifier is tried first, then its slug when there is one.
        identifiers = (identifier, slug) if slug else (identifier,)

        # Parsed attribute values are unescaped text: escape the identifier again
        # before writing it into a `title` attribute, like the URL in `href`.
        tooltip = escape(identifier)

        try:
            url = _find_url(identifiers, url_mapper)
        except KeyError:
            if optional:
                log.debug("Unresolved optional cross-reference: %s", identifier)
                if hover:
                    return f'<span title="{tooltip}">{title}</span>'
                return title
            unmapped.append((identifier, attrs.context))
            # Restore the original Markdown reference syntax.
            if title == identifier:
                return f"[{identifier}][]"
            if title == f"<code>{identifier}</code>" and not slug:
                return f"[<code>{identifier}</code>][]"
            return f"[{title}][{identifier}]"

        parsed = urlsplit(url)
        # A URL with a scheme or a network location points outside of the site.
        external = parsed.scheme or parsed.netloc
        classes = (attrs.get("class") or "").strip().split()
        classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
        class_attr = " ".join(classes)
        if remaining := attrs.remaining:
            remaining = f" {remaining}"
        if optional and hover:
            return f'<a class="{class_attr}" title="{tooltip}" href="{escape(url)}"{remaining}>{title}</a>'
        return f'<a class="{class_attr}" href="{escape(url)}"{remaining}>{title}</a>'

    return inner

436 

437 

438# YORE: Bump 2: Replace `, *, _legacy_refs: bool = True` with `` within line. 

def fix_refs(
    html: str,
    url_mapper: Callable[[str], str],
    *,
    _legacy_refs: bool = True,
) -> tuple[str, list[tuple[str, AutorefsHookInterface.Context | None]]]:
    """Resolve every auto-reference found in the given HTML text.

    Arguments:
        html: The text to fix.
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
        _legacy_refs: Whether to also process deprecated `span` markers (`AUTO_REF_RE`).

    Returns:
        The fixed HTML, and a list of unmapped identifiers (string and optional context).
    """
    unresolved: list[tuple[str, AutorefsHookInterface.Context | None]] = []
    fixed = AUTOREF_RE.sub(fix_ref(url_mapper, unresolved), html)

    # YORE: Bump 2: Remove block.
    if _legacy_refs:
        fixed = AUTO_REF_RE.sub(_legacy_fix_ref(url_mapper, unresolved), fixed)

    return fixed, unresolved

463 

464 

class AnchorScannerTreeProcessor(Treeprocessor):
    """Tree processor to scan and register HTML anchors."""

    name: str = "mkdocs-autorefs-anchors-scanner"
    _htags: ClassVar[set[str]] = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def __init__(self, plugin: AutorefsPlugin, md: Markdown | None = None) -> None:
        """Initialize the tree processor.

        Parameters:
            plugin: A reference to the autorefs plugin, to use its `register_anchor` method.
            md: The Markdown instance, forwarded to the base tree processor.
        """
        super().__init__(md)
        self.plugin = plugin

    def run(self, root: Element) -> None:  # noqa: D102
        page = self.plugin.current_page
        if page is None:
            return
        collected = _PendingAnchors(self.plugin, page)
        self._scan_anchors(root, collected)
        collected.flush()

    def _scan_anchors(self, parent: Element, pending_anchors: _PendingAnchors) -> None:
        """Walk the children of `parent`, collecting anchors and flushing them when the chain breaks."""
        for child in parent:
            tag = child.tag

            if tag == "a":
                # An anchor: remember its id when it has one.
                anchor_id = child.get("id")
                if anchor_id:
                    pending_anchors.append(anchor_id)
                # Text or a link means it's not an alias; non-whitespace text
                # after the element interrupts the chain, aliases can't apply.
                if child.text or child.get("href") or (child.tail and child.tail.strip()):
                    pending_anchors.flush()
                continue

            if tag == "p":
                # Paragraphs are transparent: scan them with the same pending anchors.
                self._scan_anchors(child, pending_anchors)
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if child.tail and child.tail.strip():
                    pending_anchors.flush()
                continue

            if tag in self._htags:
                # A heading turns the pending anchors into aliases of its id.
                pending_anchors.flush(child.get("id"))
                continue

            # Any other element interrupts the chain: flush the anchors as non-aliases,
            # then scan the sub-elements in a *separate* context.
            pending_anchors.flush()
            self.run(child)

514 

515 

class _PendingAnchors:
    """HTML anchors waiting to be registered, possibly as aliases of an upcoming heading."""

    def __init__(self, plugin: AutorefsPlugin, current_page: str) -> None:
        self.plugin = plugin
        self.current_page = current_page
        self.anchors: list[str] = []

    def append(self, anchor: str) -> None:
        """Queue an anchor id."""
        self.anchors.append(anchor)

    def flush(self, alias_to: str | None = None) -> None:
        """Register all queued anchors with the plugin, then empty the queue.

        Arguments:
            alias_to: The id the anchors are aliases of, if any.
        """
        for anchor_id in self.anchors:
            self.plugin.register_anchor(self.current_page, anchor_id, alias_to, primary=True)
        del self.anchors[:]

531 

532 

@lru_cache
def _log_enabling_markdown_anchors() -> None:
    """Log that the Markdown anchors feature is enabled.

    The call is cached, so the message is only logged the first time.
    """
    log.debug("Enabling Markdown anchors feature")

536 

537 

class AutorefsExtension(Extension):
    """Markdown extension that transforms unresolved references into auto-references.

    Auto-references are then resolved later by the MkDocs plugin.

    This extension also scans Markdown anchors (`[](){#some-id}`)
    to register them with the MkDocs plugin.
    """

    def __init__(
        self,
        plugin: AutorefsPlugin | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the Markdown extension.

        Parameters:
            plugin: An optional reference to the autorefs plugin (to pass it to the anchor scanner tree processor).
            **kwargs: Keyword arguments passed to the [base constructor][markdown.extensions.Extension].
        """
        super().__init__(**kwargs)
        self.plugin = plugin

    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802 (casing: parent method's name)
        """Register the extension.

        Add an instance of our [`AutorefsInlineProcessor`][mkdocs_autorefs.references.AutorefsInlineProcessor] to the Markdown parser.
        Also add an instance of our [`AnchorScannerTreeProcessor`][mkdocs_autorefs.references.AnchorScannerTreeProcessor]
        when a plugin reference was passed to this extension, its `scan_toc` option is enabled,
        and the `attr_list` tree processor is registered.

        Arguments:
            md: A `markdown.Markdown` instance.
        """
        inline_processor = AutorefsInlineProcessor(md)
        md.inlinePatterns.register(
            inline_processor,
            AutorefsInlineProcessor.name,
            priority=168,  # Right after markdown.inlinepatterns.ReferenceInlineProcessor
        )

        plugin = self.plugin
        if plugin is None or not plugin.scan_toc or "attr_list" not in md.treeprocessors:
            return

        _log_enabling_markdown_anchors()
        anchor_scanner = AnchorScannerTreeProcessor(plugin, md)
        md.treeprocessors.register(
            anchor_scanner,
            AnchorScannerTreeProcessor.name,
            priority=0,
        )