Coverage for src/_griffe/finder.py: 94.59%

250 statements  

« prev     ^ index     » next       coverage.py v7.6.2, created at 2024-10-12 01:34 +0200

1# This module contains the code allowing to find modules. 

2# 

3# Note: It might be possible to replace a good part of this module's logic 

4# with utilities from `importlib` (however the util in question is private): 

5# 

6# ```pycon 

7# >>> from importlib.util import _find_spec 

8# >>> _find_spec("griffe.agents", _find_spec("griffe", None).submodule_search_locations) 

9# ModuleSpec( 

10# name='griffe.agents', 

11# loader=<_frozen_importlib_external.SourceFileLoader object at 0x7fa5f34e8110>, 

12# origin='/media/data/dev/griffe/src/griffe/agents/__init__.py', 

13# submodule_search_locations=['/media/data/dev/griffe/src/griffe/agents'], 

14# ) 

15# ``` 

16 

17from __future__ import annotations 

18 

19import ast 

20import os 

21import re 

22import sys 

23from collections import defaultdict 

24from contextlib import suppress 

25from dataclasses import dataclass 

26from itertools import chain 

27from pathlib import Path 

28from typing import TYPE_CHECKING, ClassVar 

29 

30from _griffe.exceptions import UnhandledEditableModuleError 

31from _griffe.logger import logger 

32 

33if TYPE_CHECKING: 

34 from collections.abc import Iterator, Sequence 

35 from re import Pattern 

36 

37 from _griffe.models import Module 

38 

39 

40_editable_editables_patterns = [re.compile(pat) for pat in (r"^__editables_\w+\.py$", r"^_editable_impl_\w+\.py$")] 

41_editable_setuptools_patterns = [re.compile(pat) for pat in (r"^__editable__\w+\.py$",)] 

42_editable_scikit_build_core_patterns = [re.compile(pat) for pat in (r"^_\w+_editable.py$",)] 

43_editable_meson_python_patterns = [re.compile(pat) for pat in (r"^_\w+_editable_loader.py$",)] 

44 

45NamePartsType = tuple[str, ...] 

46"""Type alias for the parts of a module name.""" 

47NamePartsAndPathType = tuple[NamePartsType, Path] 

48"""Type alias for the parts of a module name and its path.""" 

49 

50 

51def _match_pattern(string: str, patterns: Sequence[Pattern]) -> bool: 

52 return any(pattern.match(string) for pattern in patterns) 

53 

54 

@dataclass
class Package:
    """Simple record describing a regular package or module located by the finder.

    Parameters:
        name: The package name.
        path: The path of the package's `__init__` module, or of the module file itself.
        stubs: An optional path to the related stubs file (.pyi).
    """

    name: str
    """Name of the package."""
    path: Path
    """Path of the package's `__init__` module, or of the single-file module."""
    stubs: Path | None = None
    """Path of the stubs file (.pyi) going with the package, if any."""

71 

72 

@dataclass
class NamespacePackage:
    """Simple record describing a namespace package located by the finder.

    Parameters:
        name: The package name.
        path: The folders contributing to the namespace package.
    """

    name: str
    """Name of the namespace package."""
    path: list[Path]
    """Folders making up the namespace package."""

86 

87 

class ModuleFinder:
    """The Griffe finder, allowing to find modules on the file system.

    The module finder is generally not used directly.
    Each [`GriffeLoader`][griffe.GriffeLoader] instance creates its own module finder instance.
    The finder can be configured when instantiating the loader
    thanks to the [loader][griffe.GriffeLoader]'s `search_paths` parameter.
    """

    accepted_py_module_extensions: ClassVar[list[str]] = [".py", ".pyc", ".pyo", ".pyd", ".pyi", ".so"]
    """List of extensions supported by the finder."""
    extensions_set: ClassVar[set[str]] = set(accepted_py_module_extensions)
    """Set of extensions supported by the finder."""

    def __init__(self, search_paths: Sequence[str | Path] | None = None) -> None:
        """Initialize the finder.

        Parameters:
            search_paths: Optional paths to search into.
                When none are given (or the sequence is empty), `sys.path` is used.
        """
        # Cache of directory listings, keyed by search path.
        self._paths_contents: dict[Path, list[Path]] = {}
        self.search_paths: list[Path] = []
        """The finder search paths."""

        # Optimization: pre-compute Paths to relieve CPU when joining paths.
        for path in search_paths or sys.path:
            self.append_search_path(Path(path))

        # Extra directories to scan for submodules, keyed by package name
        # (filled from editable modules found through `.pth` files).
        self._always_scan_for: dict[str, list[Path]] = defaultdict(list)
        self._extend_from_pth_files()

    def append_search_path(self, path: Path) -> None:
        """Append a search path.

        The path will be resolved (absolute, normalized).
        The path won't be appended if it is already in the search paths list.

        Parameters:
            path: The path to append.
        """
        path = path.resolve()
        if path not in self.search_paths:
            self.search_paths.append(path)

    def insert_search_path(self, position: int, path: Path) -> None:
        """Insert a search path at the given position.

        The path will be resolved (absolute, normalized).
        The path won't be inserted if it is already in the search paths list.

        Parameters:
            position: The insert position in the list.
            path: The path to insert.
        """
        path = path.resolve()
        if path not in self.search_paths:
            self.search_paths.insert(position, path)

    def find_spec(
        self,
        module: str | Path,
        *,
        try_relative_path: bool = True,
        find_stubs_package: bool = False,
    ) -> tuple[str, Package | NamespacePackage]:
        """Find the top-level parent module of a module.

        If a Path is passed, only try to find the module as a file path.
        If a string is passed, first try to find the module as a file path,
        then look into the search paths.

        Parameters:
            module: The module name or path.
            try_relative_path: Whether to try finding the module as a relative path,
                when the given module is not already a path.
            find_stubs_package: Whether to search for stubs-only package.
                If both the package and its stubs are found, they'll be merged together.
                If only the stubs are found, they'll be used as the package itself.

        Raises:
            FileNotFoundError: When a Path was passed and the module could not be found:

                - the directory has no `__init__.py` file in it
                - the path does not exist

            ModuleNotFoundError: When a string was passed and the module could not be found:

                - no `module/__init__.py`
                - no `module.py`
                - no `module.pth`
                - no `module` directory (namespace packages)
                - or unsupported .pth file

        Returns:
            The name of the module, and an instance representing its (namespace) package.
        """
        module_path: Path | list[Path]
        if isinstance(module, Path):
            module_name, module_path = self._module_name_path(module)
            top_module_name = self._top_module_name(module_path)
        elif try_relative_path:
            try:
                module_name, module_path = self._module_name_path(Path(module))
            except FileNotFoundError:
                # Not a file path: treat the string as a dotted module name.
                module_name = module
                top_module_name = module.split(".", 1)[0]
            else:
                top_module_name = self._top_module_name(module_path)
        else:
            module_name = module
            top_module_name = module.split(".", 1)[0]

        # Only search for actual package, let exceptions bubble up.
        if not find_stubs_package:
            return module_name, self.find_package(top_module_name)

        # Search for both package and stubs-only package.
        try:
            package = self.find_package(top_module_name)
        except ModuleNotFoundError:
            package = None
        try:
            stubs = self.find_package(top_module_name + "-stubs")
        except ModuleNotFoundError:
            stubs = None

        # None found, raise error.
        if package is None and stubs is None:
            raise ModuleNotFoundError(top_module_name)

        # Both found, assemble them to be merged later.
        if package and stubs:
            if isinstance(package, Package) and isinstance(stubs, Package):
                package.stubs = stubs.path
            elif isinstance(package, NamespacePackage) and isinstance(stubs, NamespacePackage):
                package.path += stubs.path
            # Mixed kinds (package + namespace stubs or the reverse): the package is returned as is.
            return module_name, package

        # Return either one.
        return module_name, package or stubs  # type: ignore[return-value]

    def find_package(self, module_name: str) -> Package | NamespacePackage:
        """Find a package or namespace package.

        Search paths are scanned in order. The first regular package, stubs package
        or single-file module wins; otherwise every directory named like the module
        is collected as a portion of a namespace package.

        Parameters:
            module_name: The module name.

        Raises:
            ModuleNotFoundError: When the module cannot be found.

        Returns:
            A package or namespace package wrapper.
        """
        filepaths = [
            Path(module_name),
            # TODO: Handle .py[cod] and .so files?
            # This would be needed for package that are composed
            # solely of a file with such an extension.
            Path(f"{module_name}.py"),
        ]

        # Stubs-only packages are reported under the name of the package they describe.
        real_module_name = module_name
        if real_module_name.endswith("-stubs"):
            real_module_name = real_module_name[:-6]
        namespace_dirs = []
        for path in self.search_paths:
            path_contents = self._contents(path)
            if path_contents:
                for choice in filepaths:
                    abs_path = path / choice
                    if abs_path in path_contents:
                        if abs_path.suffix:
                            stubs = abs_path.with_suffix(".pyi")
                            return Package(real_module_name, abs_path, stubs if stubs.exists() else None)
                        init_module = abs_path / "__init__.py"
                        if init_module.exists() and not _is_pkg_style_namespace(init_module):
                            stubs = init_module.with_suffix(".pyi")
                            return Package(real_module_name, init_module, stubs if stubs.exists() else None)
                        init_module = abs_path / "__init__.pyi"
                        if init_module.exists():
                            # Stubs package
                            return Package(real_module_name, init_module, None)
                        # Only directories can be portions of a namespace package:
                        # an extension-less file of the same name (for example a console
                        # script in a `bin` folder listed in the search paths) must not count.
                        if abs_path.is_dir():
                            namespace_dirs.append(abs_path)

        if namespace_dirs:
            return NamespacePackage(module_name, namespace_dirs)

        raise ModuleNotFoundError(module_name)

    def iter_submodules(
        self,
        path: Path | list[Path],
        seen: set | None = None,
    ) -> Iterator[NamePartsAndPathType]:
        """Iterate on a module's submodules, if any.

        Parameters:
            path: The module path.
            seen: If not none, this set is used to skip some files.
                The goal is to replicate the behavior of Python by
                only using the first packages (with `__init__` modules)
                of the same name found in different namespace packages.
                As soon as we find an `__init__` module, we add its parent
                path to the `seen` set, which will be reused when scanning
                the next namespace packages.

        Yields:
            name_parts (tuple[str, ...]): The parts of a submodule name.
            filepath (Path): A submodule filepath.
        """
        if isinstance(path, list):
            # We never enter this condition again in recursive calls,
            # so we just have to set `seen` once regardless of its value.
            seen = set()
            for path_elem in path:
                yield from self.iter_submodules(path_elem, seen)
            return

        if path.stem == "__init__":
            path = path.parent
        # Optimization: just check if the file name ends with .py[icod]/.so
        # (to distinguish it from a directory), not if it's an actual file.
        elif path.suffix in self.extensions_set:
            return

        # `seen` is only set when we scan a list of paths (namespace package).
        # `skip` is used to prevent yielding modules
        # of a regular subpackage that we already yielded
        # from another part of the namespace.
        # It is a snapshot: packages discovered during this scan only affect later scans.
        skip = set(seen or ())

        for subpath in self._filter_py_modules(path):
            rel_subpath = subpath.relative_to(path)
            if rel_subpath.parent in skip:
                logger.debug("Skip %s, another module took precedence", subpath)
                continue
            py_file = rel_subpath.suffix == ".py"
            stem = rel_subpath.stem
            if not py_file:
                # .py[cod] and .so files look like `name.cpython-38-x86_64-linux-gnu.ext`
                stem = stem.split(".", 1)[0]
            if stem == "__init__":
                # Optimization: since it's a relative path, if it has only one part
                # and is named __init__, it means it's the starting path
                # (no need to compare it against starting path).
                if len(rel_subpath.parts) == 1:
                    continue
                yield rel_subpath.parts[:-1], subpath
                if seen is not None:
                    seen.add(rel_subpath.parent)
            elif py_file:
                yield rel_subpath.with_suffix("").parts, subpath
            else:
                yield rel_subpath.with_name(stem).parts, subpath

    def submodules(self, module: Module) -> list[NamePartsAndPathType]:
        """Return the list of a module's submodules.

        Submodules found under the module's own file path are combined with the ones found
        in the extra directories registered for the module name, then sorted by depth.

        Parameters:
            module: The parent module.

        Returns:
            A list of tuples containing the parts of the submodule name and its path.
        """
        return sorted(
            chain(
                self.iter_submodules(module.filepath),
                self.iter_submodules(self._always_scan_for[module.name]),
            ),
            key=_module_depth,
        )

    def _module_name_path(self, path: Path) -> tuple[str, Path]:
        """Return the module name and file path corresponding to a file system path.

        Raises:
            FileNotFoundError: When the path does not exist.
        """
        # Always return absolute paths to avoid working-directory-dependent issues.
        path = path.absolute()
        if path.is_dir():
            for ext in self.accepted_py_module_extensions:
                module_path = path / f"__init__{ext}"
                if module_path.exists():
                    return path.name, module_path
            # No `__init__` module: return the directory itself.
            return path.name, path
        if path.exists():
            if path.stem == "__init__":
                return path.parent.name, path
            return path.stem, path
        # Carry the offending path so that the error is actionable.
        raise FileNotFoundError(str(path))

    def _contents(self, path: Path) -> list[Path]:
        """Return the (cached) entries of a search path, or an empty list when it cannot be listed."""
        if path not in self._paths_contents:
            try:
                self._paths_contents[path] = list(path.iterdir())
            except (FileNotFoundError, NotADirectoryError, PermissionError):
                # Missing, non-directory or unreadable search paths contribute nothing,
                # instead of aborting the whole search.
                self._paths_contents[path] = []
        return self._paths_contents[path]

    def _append_search_path(self, path: Path) -> None:
        # Same as `append_search_path`, but without resolving the path first.
        if path not in self.search_paths:
            self.search_paths.append(path)

    def _extend_from_pth_files(self) -> None:
        """Extend the search paths with the directories declared in `.pth` files."""
        # The list is iterated while it grows: search paths appended below
        # are visited by this same loop too.
        for path in self.search_paths:
            for item in self._contents(path):
                if item.suffix == ".pth":
                    for directory in _handle_pth_file(item):
                        if scan := directory.always_scan_for:
                            self._always_scan_for[scan].append(directory.path.joinpath(scan))
                        self.append_search_path(directory.path)

    def _filter_py_modules(self, path: Path) -> Iterator[Path]:
        """Walk a directory tree and yield the files having a supported module extension."""
        for root, dirs, files in os.walk(path, topdown=True):
            # Optimization: modify dirs in-place to exclude `__pycache__` directories.
            dirs[:] = [dir for dir in dirs if dir != "__pycache__"]
            for relfile in files:
                if os.path.splitext(relfile)[1] in self.extensions_set:  # noqa: PTH122
                    yield Path(root, relfile)

    def _top_module_name(self, path: Path) -> str:
        """Return the name of the top-level module containing the given path.

        When no search path contains it, the parent of the highest package directory
        is inserted at the front of the search paths.
        """
        # First find if a parent is in search paths.
        parent_path = path if path.is_dir() else path.parent
        # Always resolve parent path to compare for relativeness against resolved search paths.
        parent_path = parent_path.resolve()
        for search_path in self.search_paths:
            # ValueError: not relative to this search path.
            # IndexError: the path is the search path itself (no parts).
            with suppress(ValueError, IndexError):
                rel_path = parent_path.relative_to(search_path.resolve())
                return rel_path.parts[0]
        # If not, get the highest directory with an `__init__` module,
        # add its parent to search paths and return it.
        while parent_path.parent != parent_path and (parent_path.parent / "__init__.py").exists():
            parent_path = parent_path.parent
        self.insert_search_path(0, parent_path.parent)
        return parent_path.name

419 

420 

421_re_pkgresources = re.compile(r"(?:__import__\([\"']pkg_resources[\"']\).declare_namespace\(__name__\))") 

422_re_pkgutil = re.compile(r"(?:__path__ = __import__\([\"']pkgutil[\"']\).extend_path\(__path__, __name__\))") 

423_re_import_line = re.compile(r"^import[ \t]+\w+$") 

424 

425 

426# TODO: For more robustness, we should load and minify the AST 

427# to search for particular call statements. 

428def _is_pkg_style_namespace(init_module: Path) -> bool: 

429 code = init_module.read_text(encoding="utf8") 

430 return bool(_re_pkgresources.search(code) or _re_pkgutil.search(code)) 

431 

432 

433def _module_depth(name_parts_and_path: NamePartsAndPathType) -> int: 

434 return len(name_parts_and_path[0]) 

435 

436 

437@dataclass 

438class _SP: 

439 path: Path 

440 always_scan_for: str = "" 

441 

442 

def _handle_pth_file(path: Path) -> list[_SP]:
    """Return the search paths declared by a `.pth` file.

    Parameters:
        path: The path of the `.pth` file.

    Returns:
        The existing directories listed in the file, or, as soon as an `import name` line
        points to a supported editable module, the search paths declared by that module only.
        An unreadable or non-UTF8 file yields an empty list.
    """
    # Support for .pth files pointing to directories.
    # From https://docs.python.org/3/library/site.html:
    # A path configuration file is a file whose name has the form name.pth
    # and exists in one of the four directories mentioned above;
    # its contents are additional items (one per line) to be added to sys.path.
    # Non-existing items are never added to sys.path,
    # and no check is made that the item refers to a directory rather than a file.
    # No item is added to sys.path more than once.
    # Blank lines and lines beginning with # are skipped.
    # Lines starting with import (followed by space or tab) are executed.
    directories: list[_SP] = []
    try:
        # It turns out PyTorch recommends its users to use `.pth` as the extension
        # when saving models on the disk. These model files are not encoded in UTF8.
        # If UTF8 decoding fails, we skip the .pth file.
        # Entries that cannot be read at all (directory named `*.pth`, permissions...)
        # are skipped as well, like the `site` module does.
        text = path.read_text(encoding="utf8")
    except (OSError, UnicodeDecodeError):
        return directories
    base_dir = path.parent
    for line in text.strip().replace(";", "\n").splitlines(keepends=False):
        line = line.strip()  # noqa: PLW2901
        if _re_import_line.match(line):
            editable_module = base_dir / f"{line[len('import'):].lstrip()}.py"
            # Unsupported editable modules are ignored, and the line is then tested as a path below.
            with suppress(UnhandledEditableModuleError):
                return _handle_editable_module(editable_module)
        if line and not line.startswith("#"):
            # Relative items are relative to the directory containing the .pth file,
            # not to the current working directory (absolute items are left untouched by the join).
            candidate = base_dir / line
            if os.path.exists(candidate):  # noqa: PTH110
                directories.append(_SP(candidate))
    return directories

471 

472 

def _read_editable_module(path: Path) -> str:
    # Read an editable module as UTF-8, reporting a missing file as an unhandled editable module.
    try:
        return path.read_text(encoding="utf8")
    except FileNotFoundError as error:
        raise UnhandledEditableModuleError(path) from error


def _handle_editable_module(path: Path) -> list[_SP]:
    """Return the search paths declared by an editable module.

    The file name tells which build backend wrote the module
    ('editables', 'scikit-build-core', 'setuptools' or 'meson-python'),
    and therefore how its contents must be read.

    Parameters:
        path: The path of the editable module.

    Raises:
        UnhandledEditableModuleError: When the file name is not recognized, the file is missing,
            or its contents do not have the expected form.

    Returns:
        The search paths declared by the editable module.
    """
    if _match_pattern(path.name, (*_editable_editables_patterns, *_editable_scikit_build_core_patterns)):
        # Support for how 'editables' write these files:
        # example line: `F.map_module('griffe', '/media/data/dev/griffe/src/griffe/__init__.py')`.
        # And how 'scikit-build-core' writes these files:
        # example line: `install({'griffe': '/media/data/dev/griffe/src/griffe/__init__.py'}, {'cmake_example': ...}, None, False, True)`.
        editable_lines = _read_editable_module(path).strip().splitlines(keepends=False)
        try:
            # The path is the second single-quoted string of the last line.
            new_path = Path(editable_lines[-1].split("'")[3])
        except IndexError as error:
            # Empty file, or last line not of the expected form.
            raise UnhandledEditableModuleError(path) from error
        if new_path.name.startswith("__init__"):
            return [_SP(new_path.parent.parent)]
        return [_SP(new_path)]
    if _match_pattern(path.name, _editable_setuptools_patterns):
        # Support for how 'setuptools' writes these files:
        # example line: `MAPPING = {'griffe': '/media/data/dev/griffe/src/griffe', 'briffe': '/media/data/dev/griffe/src/briffe'}`.
        # with annotation: `MAPPING: dict[str, str] = {...}`.
        parsed_module = ast.parse(_read_editable_module(path))
        for node in parsed_module.body:
            if isinstance(node, ast.Assign):
                target = node.targets[0]
            elif isinstance(node, ast.AnnAssign):
                target = node.target
            else:
                continue
            if isinstance(target, ast.Name) and target.id == "MAPPING" and isinstance(node.value, ast.Dict):  # type: ignore[attr-defined]
                return [_SP(Path(cst.value).parent) for cst in node.value.values if isinstance(cst, ast.Constant)]  # type: ignore[attr-defined]
    if _match_pattern(path.name, _editable_meson_python_patterns):
        # Support for how 'meson-python' writes these files:
        # example line: `install({'package', 'module1'}, '/media/data/dev/griffe/build/cp311', ["path"], False)`.
        # Compiled modules then found in the cp311 folder, under src/package.
        parsed_module = ast.parse(_read_editable_module(path))
        for node in parsed_module.body:
            if (
                isinstance(node, ast.Expr)
                and isinstance(node.value, ast.Call)
                and isinstance(node.value.func, ast.Name)
                and node.value.func.id == "install"
                and len(node.value.args) > 1
                and isinstance(node.value.args[1], ast.Constant)
            ):
                build_path = Path(node.value.args[1].value, "src")
                # NOTE: What if there are multiple packages?
                try:
                    pkg_name = next(build_path.iterdir()).name
                except (OSError, StopIteration) as error:
                    # Missing, unreadable or empty build directory.
                    raise UnhandledEditableModuleError(path) from error
                return [_SP(build_path, always_scan_for=pkg_name)]
    raise UnhandledEditableModuleError(path)