Coverage for src/mkdocs_autorefs/references.py: 97.11%

235 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-01 20:28 +0200

1"""Cross-references module.""" 

2 

3from __future__ import annotations 

4 

5import logging 

6import re 

7import warnings 

8from abc import ABC, abstractmethod 

9from dataclasses import dataclass 

10from functools import lru_cache 

11from html import escape, unescape 

12from html.parser import HTMLParser 

13from typing import TYPE_CHECKING, Any, Callable, ClassVar, Match 

14from urllib.parse import urlsplit 

15from xml.etree.ElementTree import Element 

16 

17import markupsafe 

18from markdown.core import Markdown 

19from markdown.extensions import Extension 

20from markdown.inlinepatterns import REFERENCE_RE, ReferenceInlineProcessor 

21from markdown.treeprocessors import Treeprocessor 

22from markdown.util import HTML_PLACEHOLDER_RE, INLINE_PLACEHOLDER_RE 

23 

24if TYPE_CHECKING: 

25 from pathlib import Path 

26 

27 from markdown import Markdown 

28 

29 from mkdocs_autorefs.plugin import AutorefsPlugin 

30 

# Module logger: use MkDocs' plugin logger factory when this MkDocs version provides it.
try:
    from mkdocs.plugins import get_plugin_logger

    log = get_plugin_logger(__name__)
except ImportError:
    # `get_plugin_logger` could not be imported: fall back to a standard library logger
    # named under the `mkdocs.plugins` hierarchy.
    # TODO: remove once support for MkDocs <1.5 is dropped
    log = logging.getLogger(f"mkdocs.plugins.{__name__}")  # type: ignore[assignment]

38 

39 

# YORE: Bump 2: Remove block.
def __getattr__(name: str) -> Any:
    """Resolve module attributes that are not defined directly in this module.

    Only the deprecated name `AutoRefInlineProcessor` is supported: it emits a
    `DeprecationWarning` and resolves to `AutorefsInlineProcessor`.

    Arguments:
        name: The name of the attribute being looked up on the module.

    Returns:
        The `AutorefsInlineProcessor` class when the deprecated name is requested.

    Raises:
        AttributeError: For any other attribute name.
    """
    if name != "AutoRefInlineProcessor":
        raise AttributeError(f"module 'mkdocs_autorefs.references' has no attribute {name}")
    # `stacklevel=2` attributes the warning to the code accessing the old name.
    warnings.warn("AutoRefInlineProcessor was renamed AutorefsInlineProcessor", DeprecationWarning, stacklevel=2)
    return AutorefsInlineProcessor

46 

47 

# One HTML attribute value: either wrapped in double quotes, or bare (no quote, angle bracket or space).
_ATTR_VALUE = r'"[^"<>]+"|[^"<> ]+'

# YORE: Bump 2: Remove block.
AUTO_REF_RE = re.compile(
    "".join(
        (
            # Opening `span` with the `data-autorefs-*` attribute naming the kind of reference.
            r"<span data-(?P<kind>autorefs-(?:identifier|optional|optional-hover))",
            # The identifier is the value of that attribute.
            rf"=(?P<identifier>{_ATTR_VALUE})",
            # Optional `class` attribute, then any other attributes, kept verbatim.
            rf"(?: class=(?P<class>{_ATTR_VALUE}))?",
            r"(?P<attrs> [^<>]+)?",
            # Element contents up to the first closing `span` tag.
            r">(?P<title>.*?)</span>",
        ),
    ),
    re.DOTALL,
)
"""Deprecated. Use [`AUTOREF_RE`][mkdocs_autorefs.references.AUTOREF_RE] instead."""

AUTOREF_RE = re.compile(
    r"<autoref (?P<attrs>.*?)>(?P<title>.*?)</autoref>",
    re.DOTALL,
)
"""The autoref HTML tag regular expression.

A regular expression to match mkdocs-autorefs' special reference markers
in the [`on_post_page` hook][mkdocs_autorefs.plugin.AutorefsPlugin.on_post_page].
"""

64 

65 

class AutorefsHookInterface(ABC):
    """Abstract interface for customizing how inline references are turned into auto-references.

    `AutorefsInlineProcessor` calls both methods, when a hook is set, while building an `autoref` element.
    """

    @dataclass
    class Context:
        """Information describing where an auto-reference comes from."""

        domain: str
        role: str
        origin: str
        filepath: str | Path
        lineno: int

        def as_dict(self) -> dict[str, str]:
            """Render the context as HTML attributes.

            Returns:
                A dictionary mapping each field name to its value, with `filepath` and `lineno`
                converted to strings.
            """
            attributes = dict(domain=self.domain, role=self.role, origin=self.origin)
            attributes["filepath"] = str(self.filepath)
            attributes["lineno"] = str(self.lineno)
            return attributes

    @abstractmethod
    def expand_identifier(self, identifier: str) -> str:
        """Expand an identifier in the current context.

        Parameters:
            identifier: The identifier found in the reference.

        Returns:
            The expanded identifier.
        """
        raise NotImplementedError

    @abstractmethod
    def get_context(self) -> AutorefsHookInterface.Context:
        """Return the context that applies to the reference being processed.

        Returns:
            The current context.
        """
        raise NotImplementedError

109 

110 

class AutorefsInlineProcessor(ReferenceInlineProcessor):
    """Inline processor that turns `[title][identifier]` references into `autoref` elements."""

    name: str = "mkdocs-autorefs"
    hook: AutorefsHookInterface | None = None

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initialize the processor with Python-Markdown's `REFERENCE_RE` pattern.

        Arguments:
            *args: Positional arguments forwarded to the parent constructor, after the pattern.
            **kwargs: Keyword arguments forwarded to the parent constructor.
        """
        super().__init__(REFERENCE_RE, *args, **kwargs)

    # Code based on
    # https://github.com/Python-Markdown/markdown/blob/8e7528fa5c98bf4652deb13206d6e6241d61630b/markdown/inlinepatterns.py#L780

    def handleMatch(self, m: Match[str], data: str) -> tuple[Element | None, int | None, int | None]:  # type: ignore[override]  # noqa: N802
        """Handle an element that matched.

        Arguments:
            m: The match object.
            data: The matched data.

        Returns:
            A tuple `(element, start, end)`. All three are `None` when the text or the identifier
            could not be read. The element alone is `None` when the identifier contains control characters.
        """
        nothing = (None, None, None)

        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return nothing

        identifier, end, handled = self.evalId(data, index, text)
        if not handled or identifier is None:
            return nothing

        start = m.start(0)
        # Do nothing if the matched reference contains control characters (from 0 to 31 included).
        # Specifically `\x01` is used by Python-Markdown HTML stash when there's inline formatting,
        # but references with Markdown formatting are not possible anyway.
        has_control_chars = re.search(r"[\x00-\x1f]", identifier)
        element = None if has_control_chars else self._make_tag(identifier, text)
        return element, start, end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:  # noqa: N802 (parent's casing)
        """Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`.

        Arguments:
            data: The data to evaluate.
            index: The starting position.
            text: The text to use when no identifier.

        Returns:
            A tuple containing the identifier, its end position, and whether it matched.
        """
        link = self.RE_LINK.match(data, pos=index)
        if not link:
            return None, index, False

        end = link.end(0)
        identifier = link.group(1)
        if identifier:
            # Explicit `[ref][id]` form: use the identifier as written.
            return identifier, end, True

        # Empty `[ref][]` form: the text doubles as the identifier.
        identifier = text
        # Allow the entire content to be one placeholder, with the intent of catching things like [`Foo`][].
        # It doesn't catch [*Foo*][] though, just due to the priority order.
        # https://github.com/Python-Markdown/markdown/blob/1858c1b601ead62ed49646ae0d99298f41b1a271/markdown/inlinepatterns.py#L78
        placeholder = INLINE_PLACEHOLDER_RE.fullmatch(identifier)
        if placeholder:
            stashed_nodes: dict[str, Element | str] = self.md.treeprocessors["inline"].stashed_nodes  # type: ignore[attr-defined]
            node = stashed_nodes.get(placeholder[1])
            if isinstance(node, Element) and node.tag == "code":
                identifier = "".join(node.itertext())
                # Special case: allow pymdownx.inlinehilite raw <code> snippets but strip them back to unhighlighted.
                raw_placeholder = HTML_PLACEHOLDER_RE.fullmatch(identifier)
                if raw_placeholder:
                    stash_index = int(raw_placeholder.group(1))
                    raw_blocks = self.md.htmlStash.rawHtmlBlocks
                    identifier = markupsafe.Markup(raw_blocks[stash_index]).striptags()
                    # The stashed block is overwritten with the escaped, tag-free text.
                    raw_blocks[stash_index] = escape(identifier)

        return identifier, end, True

    def _make_tag(self, identifier: str, text: str) -> Element:
        """Create an `autoref` element, which serializes to a tag that `AUTOREF_RE` can match.

        When a hook is set, the identifier is expanded through it and the hook's context
        is stored as attributes on the element.

        Arguments:
            identifier: The identifier to use in the HTML property.
            text: The text to use in the HTML tag.

        Returns:
            A new element.
        """
        element = Element("autoref")
        hook = self.hook
        if hook:
            identifier = hook.expand_identifier(identifier)
            for key, value in hook.get_context().as_dict().items():
                element.set(key, value)
        element.set("identifier", identifier)
        element.text = text
        return element

204 

205 

def relative_url(url_a: str, url_b: str) -> str:
    """Compute the relative path from URL A to URL B.

    Arguments:
        url_a: URL A.
        url_b: URL B, optionally followed by `#` and an anchor.

    Returns:
        The relative URL to go from A to B. The anchor of URL B is appended after a `#`;
        when URL B has no `#`, the relative path is returned on its own.
    """
    parts_a = url_a.split("/")
    # `partition` tolerates a URL without an anchor, where unpacking `split("#", 1)` would raise.
    path_b, hash_sign, anchor = url_b.partition("#")
    parts_b = path_b.split("/")

    # Count the leading path segments shared by both URLs.
    common = 0
    for part_a, part_b in zip(parts_a, parts_b):
        if part_a != part_b:
            break
        common += 1

    # Go up once per remaining segment of A, except its last one.
    # A negative count (A entirely shared with B) yields no `..` at all.
    levels = len(parts_a) - common - 1
    relative = "/".join([".."] * levels + parts_b[common:])
    return f"{relative}#{anchor}" if hash_sign else relative

230 

231 

# YORE: Bump 2: Remove block.
def _legacy_fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Return a `repl` function for [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    In our context, we match deprecated `span`-based references and replace them with HTML links.

    When the matched reference's identifier was not mapped to an URL, we append the identifier to the outer
    `unmapped` list (with no context), unless the reference is optional. It generally means the user is trying
    to cross-reference an object that was not collected and rendered, making it impossible to link to it.

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
            It must raise `KeyError` for unknown identifiers.
        unmapped: A list to store unmapped identifiers.

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        identifier = match["identifier"].strip('"')
        title = match["title"]
        kind = match["kind"]
        attrs = match["attrs"] or ""
        classes = (match["class"] or "").strip('"').split()

        try:
            url = url_mapper(unescape(identifier))
        except KeyError:
            # Optional references degrade silently: plain title, or a `span` carrying a hover title.
            if kind == "autorefs-optional":
                return title
            if kind == "autorefs-optional-hover":
                return f'<span title="{identifier}">{title}</span>'
            # Mandatory references are reported and restored to their Markdown form.
            unmapped.append((identifier, None))
            if title == identifier:
                return f"[{identifier}][]"
            return f"[{title}][{identifier}]"

        # The suggested replacement must spell the `identifier` attribute correctly,
        # otherwise users copying it get an element without the attribute read for `autoref` elements.
        warnings.warn(
            "autorefs `span` elements are deprecated in favor of `autoref` elements: "
            f'`<span data-autorefs-identifier="{identifier}">...</span>` becomes `<autoref identifier="{identifier}">...</autoref>`',
            DeprecationWarning,
            stacklevel=1,
        )
        parsed = urlsplit(url)
        # A URL with a scheme or a network location points outside of the site.
        external = parsed.scheme or parsed.netloc
        classes = ["autorefs", "autorefs-external" if external else "autorefs-internal", *classes]
        class_attr = " ".join(classes)
        if kind == "autorefs-optional-hover":
            return f'<a class="{class_attr}" title="{identifier}" href="{escape(url)}"{attrs}>{title}</a>'
        return f'<a class="{class_attr}" href="{escape(url)}"{attrs}>{title}</a>'

    return inner

289 

290 

class _AutorefsAttrs(dict):
    """Attributes of an `autoref` element, as a mapping of names to values.

    A value of `None` denotes an attribute written without a value.
    """

    # Attributes interpreted by autorefs itself; they are not forwarded through `remaining`.
    _handled_attrs: ClassVar[set[str]] = {
        "identifier",
        "optional",
        "hover",
        "class",
        "domain",
        "role",
        "origin",
        "filepath",
        "lineno",
    }

    @property
    def context(self) -> AutorefsHookInterface.Context | None:
        """The context stored in the attributes.

        Returns:
            The context, or `None` when a context attribute is missing
            or `lineno` cannot be converted to an integer.
        """
        try:
            return AutorefsHookInterface.Context(
                domain=self["domain"],
                role=self["role"],
                origin=self["origin"],
                filepath=self["filepath"],
                lineno=int(self["lineno"]),
            )
        except (KeyError, TypeError, ValueError):
            # KeyError: a context attribute is absent.
            # TypeError / ValueError: `lineno` has no value or is not a number.
            return None

    @property
    def remaining(self) -> str:
        """The attributes not handled by autorefs, serialized as HTML.

        Returns:
            Space-separated `name="value"` pairs (bare `name` for attributes without a value),
            with values escaped so that quotes or ampersands cannot break the generated markup.
        """
        return " ".join(
            name if value is None else f'{name}="{escape(str(value))}"'
            for name, value in self.items()
            if name not in self._handled_attrs
        )

320 

321 

class _HTMLAttrsParser(HTMLParser):
    """HTML parser that collects the attributes of the start tags it is fed."""

    def __init__(self) -> None:
        super().__init__()
        # Attributes collected during the current `parse` call.
        self.attrs: dict[str, str | None] = {}

    def parse(self, html: str) -> _AutorefsAttrs:
        """Parse an HTML start tag and return its attributes.

        Arguments:
            html: The HTML text to parse, such as `<a identifier="foo" optional>`.

        Returns:
            A copy of the attributes found in the text.
        """
        # This instance is reused across calls: discard any input left buffered by a previous,
        # incomplete tag so that it cannot be glued to the text parsed now.
        self.reset()
        self.attrs.clear()
        self.feed(html)
        return _AutorefsAttrs(self.attrs)

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:  # noqa: ARG002
        """Record the attributes of a start tag, whatever the tag is."""
        self.attrs.update(attrs)


# Shared parser instance, reused for every parsed element.
_html_attrs_parser = _HTMLAttrsParser()

337 

338 

def fix_ref(
    url_mapper: Callable[[str], str],
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]],
) -> Callable:
    """Build the `repl` callback given to [`re.sub`](https://docs.python.org/3/library/re.html#re.sub).

    The callback receives matches of `autoref` elements and replaces each of them with an HTML link.

    If the identifier of a reference is not mapped to an URL and the reference is not optional,
    the identifier and its context are appended to the `unmapped` list and the reference is put back
    in its Markdown form. It generally means the user is trying to cross-reference an object
    that was not collected and rendered, making it impossible to link to it.

    Arguments:
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
            It must raise `KeyError` for unknown identifiers.
        unmapped: A list to store unmapped identifiers.

    Returns:
        The actual function accepting a [`Match` object](https://docs.python.org/3/library/re.html#match-objects)
        and returning the replacement strings.
    """

    def inner(match: Match) -> str:
        title = match["title"]
        # Wrap the raw attributes in a start tag so that the HTML parser can read them.
        attrs = _html_attrs_parser.parse(f"<a {match['attrs']}>")
        identifier: str = attrs["identifier"]
        optional = "optional" in attrs
        hover = "hover" in attrs

        try:
            url = url_mapper(unescape(identifier))
        except KeyError:
            if not optional:
                # Mandatory reference: report it and restore the Markdown syntax.
                unmapped.append((identifier, attrs.context))
                return f"[{identifier}][]" if title == identifier else f"[{title}][{identifier}]"
            # Optional reference: keep the title, wrapped in a `span` when a hover title is requested.
            return f'<span title="{identifier}">{title}</span>' if hover else title

        parts = urlsplit(url)
        # A URL with a scheme or a network location points outside of the site.
        location_class = "autorefs-external" if (parts.scheme or parts.netloc) else "autorefs-internal"
        user_classes = (attrs.get("class") or "").strip().split()
        class_attr = " ".join(["autorefs", location_class, *user_classes])

        # Attributes not handled by autorefs are forwarded after `href`.
        extra = attrs.remaining
        if extra:
            extra = f" {extra}"
        title_attr = f' title="{identifier}"' if optional and hover else ""
        return f'<a class="{class_attr}"{title_attr} href="{escape(url)}"{extra}>{title}</a>'

    return inner

392 

393 

# YORE: Bump 2: Replace `, *, _legacy_refs: bool = True` with `` within line.
def fix_refs(
    html: str,
    url_mapper: Callable[[str], str],
    *,
    _legacy_refs: bool = True,
) -> tuple[str, list[tuple[str, AutorefsHookInterface.Context | None]]]:
    """Replace every reference found in the given HTML text.

    Arguments:
        html: The text to fix.
        url_mapper: A callable that gets an object's site URL by its identifier,
            such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][].
        _legacy_refs: Whether to also replace the deprecated `span`-based references
            matched by `AUTO_REF_RE`.

    Returns:
        The fixed HTML, and a list of unmapped identifiers (string and optional context).
    """
    unmapped: list[tuple[str, AutorefsHookInterface.Context | None]] = []

    # `autoref` elements are replaced first; both passes append to the same `unmapped` list.
    replace_autoref = fix_ref(url_mapper, unmapped)
    html = AUTOREF_RE.sub(replace_autoref, html)

    # YORE: Bump 2: Remove block.
    if _legacy_refs:
        replace_legacy = _legacy_fix_ref(url_mapper, unmapped)
        html = AUTO_REF_RE.sub(replace_legacy, html)

    return html, unmapped

419 

420 

class AnchorScannerTreeProcessor(Treeprocessor):
    """Tree processor that walks the element tree and registers the HTML anchors it finds."""

    name: str = "mkdocs-autorefs-anchors-scanner"
    _htags: ClassVar[set[str]] = {"h1", "h2", "h3", "h4", "h5", "h6"}

    def __init__(self, plugin: AutorefsPlugin, md: Markdown | None = None) -> None:
        """Initialize the tree processor.

        Parameters:
            plugin: A reference to the autorefs plugin, to use its `register_anchor` method.
            md: The Markdown instance, forwarded to the parent constructor.
        """
        super().__init__(md)
        self.plugin = plugin

    def run(self, root: Element) -> None:
        """Scan the anchors below `root` and register them for the plugin's current page.

        Nothing happens when the plugin has no current page.
        """
        if self.plugin.current_page is None:
            return
        pending = _PendingAnchors(self.plugin, self.plugin.current_page)
        self._scan_anchors(root, pending)
        # Anchors still pending at the end were not followed by a heading: register them as is.
        pending.flush()

    @staticmethod
    def _has_trailing_text(el: Element) -> bool:
        """Tell whether non-whitespace text follows the element."""
        return bool(el.tail and el.tail.strip())

    def _scan_anchors(self, parent: Element, pending_anchors: _PendingAnchors) -> None:
        """Collect the anchors among the children of `parent`, flushing them when the chain is interrupted."""
        for child in parent:
            tag = child.tag

            if tag == "a":
                # We found an anchor. Record its id if it has one.
                anchor_id = child.get("id")
                if anchor_id:
                    pending_anchors.append(anchor_id)
                # If the element has text or a link, it's not an alias.
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if child.text or child.get("href") or self._has_trailing_text(child):
                    pending_anchors.flush()
                continue

            if tag == "p":
                # A `p` tag is a no-op for our purposes, just recurse into it in the context
                # of the current collection of anchors.
                self._scan_anchors(child, pending_anchors)
                # Non-whitespace text after the element interrupts the chain, aliases can't apply.
                if self._has_trailing_text(child):
                    pending_anchors.flush()
                continue

            if tag in self._htags:
                # If the element is a heading, that turns the pending anchors into aliases.
                pending_anchors.flush(child.get("id"))
                continue

            # But if it's some other interruption, flush anchors anyway as non-aliases.
            pending_anchors.flush()
            # Recurse into sub-elements, in a *separate* context.
            self.run(child)

470 

471 

class _PendingAnchors:
    """HTML anchors collected on a page, waiting to be registered, possibly as aliases of a heading."""

    def __init__(self, plugin: AutorefsPlugin, current_page: str):
        self.plugin = plugin
        self.current_page = current_page
        # Anchor ids collected since the last flush, in order of appearance.
        self.anchors: list[str] = []

    def append(self, anchor: str) -> None:
        """Add an anchor id to the pending collection."""
        self.anchors.append(anchor)

    def flush(self, alias_to: str | None = None) -> None:
        """Register every pending anchor with the plugin, then empty the collection.

        Arguments:
            alias_to: The value passed to the plugin's `register_anchor` as alias target,
                `None` when the anchors are not aliases.
        """
        register = self.plugin.register_anchor
        page = self.current_page
        for pending in self.anchors:
            register(page, pending, alias_to)
        # Empty the list in place so that the same list object is kept.
        del self.anchors[:]

487 

488 

@lru_cache(maxsize=1)
def _log_enabling_markdown_anchors() -> None:
    """Log, at debug level, that the Markdown anchors feature is enabled.

    The call takes no argument and is cached, so the message is emitted once
    however many times the function is called.
    """
    log.debug("Enabling Markdown anchors feature")

492 

493 

class AutorefsExtension(Extension):
    """Markdown extension that transforms unresolved references into auto-references.

    Auto-references are then resolved later by the MkDocs plugin.

    This extension also scans Markdown anchors (`[](){#some-id}`)
    to register them with the MkDocs plugin.
    """

    def __init__(
        self,
        plugin: AutorefsPlugin | None = None,
        **kwargs: Any,
    ) -> None:
        """Initialize the Markdown extension.

        Parameters:
            plugin: An optional reference to the autorefs plugin (to pass it to the anchor scanner tree processor).
            **kwargs: Keyword arguments passed to the [base constructor][markdown.extensions.Extension].
        """
        super().__init__(**kwargs)
        self.plugin = plugin

    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802 (casing: parent method's name)
        """Register the extension.

        Add an instance of our [`AutorefsInlineProcessor`][mkdocs_autorefs.references.AutorefsInlineProcessor] to the Markdown parser.
        Also add an instance of our [`AnchorScannerTreeProcessor`][mkdocs_autorefs.references.AnchorScannerTreeProcessor]
        to the Markdown parser when a reference to the autorefs plugin was passed to this extension,
        the plugin's `scan_toc` is truthy, and an `attr_list` tree processor is registered.

        Arguments:
            md: A `markdown.Markdown` instance.
        """
        inline_processor = AutorefsInlineProcessor(md)
        md.inlinePatterns.register(
            inline_processor,
            AutorefsInlineProcessor.name,
            priority=168,  # Right after markdown.inlinepatterns.ReferenceInlineProcessor
        )

        plugin = self.plugin
        if plugin is None or not plugin.scan_toc or "attr_list" not in md.treeprocessors:
            # No plugin, scanning disabled, or no `attr_list` processor: skip the anchor scanner.
            return

        _log_enabling_markdown_anchors()
        anchor_scanner = AnchorScannerTreeProcessor(plugin, md)
        md.treeprocessors.register(
            anchor_scanner,
            AnchorScannerTreeProcessor.name,
            priority=0,
        )

538 )