From d55e7434fcce797f7fc7dfb4e8976602ef3ca9c4 Mon Sep 17 00:00:00 2001 From: Antony Lee Date: Sun, 10 Jan 2021 23:20:15 +0100 Subject: [PATCH] More speedup via mtime-based caching. Caching based on mtime is similar to what importlib's FileFinder does. Locally, on a large-ish environment, this speeds up repeated calls to `distribution("pip")` ~10x. --- importlib_metadata/__init__.py | 101 +++++++++++++++++++-------------- tests/test_api.py | 7 +++ tests/test_integration.py | 2 +- 3 files changed, 66 insertions(+), 44 deletions(-) diff --git a/importlib_metadata/__init__.py b/importlib_metadata/__init__.py index 079cc0f9..43b9ef31 100644 --- a/importlib_metadata/__init__.py +++ b/importlib_metadata/__init__.py @@ -461,6 +461,9 @@ class FastPath: def __init__(self, root): self.root = str(root) self.base = os.path.basename(self.root).lower() + self.last_mtime = -1 + self.infos = {} + self.eggs = {} def joinpath(self, child): return pathlib.Path(self.root, child) @@ -476,15 +479,46 @@ def zip_children(self): zip_path = zipp.Path(self.root) names = zip_path.root.namelist() self.joinpath = zip_path.joinpath - return dict.fromkeys(child.split(posixpath.sep, 1)[0] for child in names) - def search(self, name): - return ( - self.joinpath(child) - for child in self.children() - if name.matches(child, self.base) - ) + def update_cache(self): + root = self.root or "." + try: + mtime = os.stat(root).st_mtime + except OSError: + self.infos.clear() + self.eggs.clear() + return + if mtime == self.last_mtime: + return + self.infos.clear() + self.eggs.clear() + base_is_egg = self.base.endswith(".egg") + for child in self.children(): + low = child.lower() + if low.endswith((".dist-info", ".egg-info")): + # rpartition is faster than splitext and suitable for this purpose. 
+ name = low.rpartition(".")[0].partition("-")[0] + normalized = Prepared.normalize(name) + self.infos.setdefault(normalized, []).append(child) + elif base_is_egg and low == "egg-info": + name = self.base.rpartition(".")[0].partition("-")[0] + legacy_normalized = Prepared.legacy_normalize(name) + self.eggs.setdefault(legacy_normalized, []).append(child) + self.last_mtime = mtime + + def search(self, prepared): + self.update_cache() + if prepared.name: + infos = self.infos.get(prepared.normalized, []) + yield from map(self.joinpath, infos) + eggs = self.eggs.get(prepared.legacy_normalized, []) + yield from map(self.joinpath, eggs) + else: + for infos in self.infos.values(): + yield from map(self.joinpath, infos) + for eggs in self.eggs.values(): + yield from map(self.joinpath, eggs) class Prepared: @@ -493,22 +527,14 @@ class Prepared: """ normalized = None - suffixes = 'dist-info', 'egg-info' - exact_matches = [''][:0] - egg_prefix = '' - versionless_egg_name = '' + legacy_normalized = None def __init__(self, name): self.name = name if name is None: return self.normalized = self.normalize(name) - self.exact_matches = [ - self.normalized + '.' + suffix for suffix in self.suffixes - ] - legacy_normalized = self.legacy_normalize(self.name) - self.egg_prefix = legacy_normalized + '-' - self.versionless_egg_name = legacy_normalized + '.egg' + self.legacy_normalized = self.legacy_normalize(name) @staticmethod def normalize(name): @@ -525,27 +551,6 @@ def legacy_normalize(name): """ return name.lower().replace('-', '_') - def matches(self, cand, base): - low = cand.lower() - # rpartition is faster than splitext and suitable for this purpose. 
- pre, _, ext = low.rpartition('.') - name, _, rest = pre.partition('-') - return ( - low in self.exact_matches - or ext in self.suffixes - and (not self.normalized or name.replace('.', '_') == self.normalized) - # legacy case: - or self.is_egg(base) - and low == 'egg-info' - ) - - def is_egg(self, base): - return ( - base == self.versionless_egg_name - or base.startswith(self.egg_prefix) - and base.endswith('.egg') - ) - @install class MetadataPathFinder(NullFinder, DistributionFinder): @@ -555,6 +560,10 @@ class MetadataPathFinder(NullFinder, DistributionFinder): of Python that do not have a PathFinder find_distributions(). """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._cache = {} + def find_distributions(self, context=DistributionFinder.Context()): """ Find distributions. @@ -567,13 +576,19 @@ def find_distributions(self, context=DistributionFinder.Context()): found = self._search_paths(context.name, context.path) return map(PathDistribution, found) - @classmethod - def _search_paths(cls, name, paths): + def _search_paths(self, name, paths): """Find metadata directories in paths heuristically.""" prepared = Prepared(name) - return itertools.chain.from_iterable( - path.search(prepared) for path in map(FastPath, paths) - ) + for path in paths: + try: + fastpath = self._cache[path] + except KeyError: + fastpath = FastPath(path) + self._cache[path] = fastpath + yield from fastpath.search(prepared) + + def invalidate_caches(self): + self._cache.clear() class PathDistribution(Distribution): diff --git a/tests/test_api.py b/tests/test_api.py index a386551f..81589a81 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,6 +1,7 @@ import re import textwrap import unittest +import importlib from . 
import fixtures from importlib_metadata import ( @@ -196,3 +197,9 @@ def test_distribution_at_str(self): dist_info_path = self.site_dir / 'distinfo_pkg-1.0.0.dist-info' dist = Distribution.at(str(dist_info_path)) assert dist.version == '1.0.0' + + +class InvalidateCache(unittest.TestCase): + def test_invalidate_cache(self): + # No externally observable behavior, but ensures test coverage... + importlib.invalidate_caches() diff --git a/tests/test_integration.py b/tests/test_integration.py index 11835135..4d33303b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -57,5 +57,5 @@ def test_search_dist_dirs(self): distribution metadata dirs. Protect it for PyPA use-cases (only). Ref python/importlib_metadata#111. """ - res = MetadataPathFinder._search_paths('any-name', []) + res = MetadataPathFinder()._search_paths('any-name', []) assert list(res) == []