Skip to content

Commit

Permalink
More speedup via mtime-based caching.
Browse files Browse the repository at this point in the history
Caching based on mtime is similar to what importlib's FileFinder
does.

Locally, on a large-ish environment, this speeds up repeated calls to
`distribution("pip")` ~10x.
  • Loading branch information
anntzer committed Jan 11, 2021
1 parent c066e68 commit d55e743
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 44 deletions.
101 changes: 58 additions & 43 deletions importlib_metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,9 @@ class FastPath:
def __init__(self, root):
    """
    Record the search root and start with empty lookup tables.

    ``root`` is stored as a string; ``base`` is the lower-cased final
    path component of that string.
    """
    root_str = str(root)
    self.root = root_str
    self.base = os.path.basename(root_str).lower()
    # Lookup tables filled in by update_cache().
    self.infos, self.eggs = {}, {}
    # Sentinel compared against st_mtime in update_cache(); -1 marks
    # "nothing scanned yet".
    self.last_mtime = -1

def joinpath(self, child):
    """Return ``child`` resolved against this root as a ``pathlib.Path``."""
    base_dir = pathlib.Path(self.root)
    return base_dir / child
Expand All @@ -476,15 +479,46 @@ def zip_children(self):
zip_path = zipp.Path(self.root)
names = zip_path.root.namelist()
self.joinpath = zip_path.joinpath

return dict.fromkeys(child.split(posixpath.sep, 1)[0] for child in names)

def search(self, name):
return (
self.joinpath(child)
for child in self.children()
if name.matches(child, self.base)
)
def update_cache(self):
    """
    Rebuild the lookup tables when the root's mtime has changed.

    ``self.infos`` maps a normalized project name to the list of child
    entries whose lower-cased name ends in ``.dist-info`` or
    ``.egg-info``.  ``self.eggs`` is only filled when the root's base
    name ends in ``.egg``: it maps the legacy-normalized name taken from
    that base name to children named ``egg-info`` (compared
    case-insensitively).

    If the root cannot be stat'ed, both tables are emptied and the
    remembered mtime is reset, so a later successful stat always
    triggers a fresh scan.
    """
    # An empty root string is looked up as the current directory.
    root = self.root or "."
    try:
        mtime = os.stat(root).st_mtime
    except OSError:
        self.infos.clear()
        self.eggs.clear()
        # Forget the previous mtime as well.  Otherwise a root that
        # reappears with the same mtime would hit the shortcut below
        # and the tables would stay empty.
        self.last_mtime = -1
        return
    if mtime == self.last_mtime:
        # Unchanged since the last scan: the tables are still valid.
        return
    self.infos.clear()
    self.eggs.clear()
    base_is_egg = self.base.endswith(".egg")
    for child in self.children():
        low = child.lower()
        if low.endswith((".dist-info", ".egg-info")):
            # rpartition is faster than splitext and suitable for this purpose.
            name = low.rpartition(".")[0].partition("-")[0]
            normalized = Prepared.normalize(name)
            self.infos.setdefault(normalized, []).append(child)
        elif base_is_egg and low == "egg-info":
            # Legacy layout: the project name comes from the root's own
            # base name rather than from the child entry.
            name = self.base.rpartition(".")[0].partition("-")[0]
            legacy_normalized = Prepared.legacy_normalize(name)
            self.eggs.setdefault(legacy_normalized, []).append(child)
    # Record the mtime only after the scan has finished.
    self.last_mtime = mtime

def search(self, prepared):
    """
    Yield metadata paths under this root for ``prepared``.

    The lookup tables are refreshed first.  When ``prepared.name`` is
    falsy, every cached entry is yielded (all ``infos`` first, then all
    ``eggs``).  Otherwise only the entries stored under
    ``prepared.normalized`` and ``prepared.legacy_normalized`` are.
    """
    self.update_cache()
    join = self.joinpath
    if not prepared.name:
        # No specific name requested: walk both tables in full.
        for table in (self.infos, self.eggs):
            for children in table.values():
                for child in children:
                    yield join(child)
        return
    for child in self.infos.get(prepared.normalized, ()):
        yield join(child)
    for child in self.eggs.get(prepared.legacy_normalized, ()):
        yield join(child)


class Prepared:
Expand All @@ -493,22 +527,14 @@ class Prepared:
"""

normalized = None
suffixes = 'dist-info', 'egg-info'
exact_matches = [''][:0]
egg_prefix = ''
versionless_egg_name = ''
legacy_normalized = None

def __init__(self, name):
self.name = name
if name is None:
return
self.normalized = self.normalize(name)
self.exact_matches = [
self.normalized + '.' + suffix for suffix in self.suffixes
]
legacy_normalized = self.legacy_normalize(self.name)
self.egg_prefix = legacy_normalized + '-'
self.versionless_egg_name = legacy_normalized + '.egg'
self.legacy_normalized = self.legacy_normalize(name)

@staticmethod
def normalize(name):
Expand All @@ -525,27 +551,6 @@ def legacy_normalize(name):
"""
return name.lower().replace('-', '_')

def matches(self, cand, base):
low = cand.lower()
# rpartition is faster than splitext and suitable for this purpose.
pre, _, ext = low.rpartition('.')
name, _, rest = pre.partition('-')
return (
low in self.exact_matches
or ext in self.suffixes
and (not self.normalized or name.replace('.', '_') == self.normalized)
# legacy case:
or self.is_egg(base)
and low == 'egg-info'
)

def is_egg(self, base):
return (
base == self.versionless_egg_name
or base.startswith(self.egg_prefix)
and base.endswith('.egg')
)


@install
class MetadataPathFinder(NullFinder, DistributionFinder):
Expand All @@ -555,6 +560,10 @@ class MetadataPathFinder(NullFinder, DistributionFinder):
of Python that do not have a PathFinder find_distributions().
"""

def __init__(self, *args, **kwargs):
    """Initialise the base classes, then create an empty per-path cache."""
    super().__init__(*args, **kwargs)
    # Keyed by path entry; values are the FastPath objects built for them.
    self._cache = {}

def find_distributions(self, context=DistributionFinder.Context()):
"""
Find distributions.
Expand All @@ -567,13 +576,19 @@ def find_distributions(self, context=DistributionFinder.Context()):
found = self._search_paths(context.name, context.path)
return map(PathDistribution, found)

@classmethod
def _search_paths(cls, name, paths):
def _search_paths(self, name, paths):
"""Find metadata directories in paths heuristically."""
prepared = Prepared(name)
return itertools.chain.from_iterable(
path.search(prepared) for path in map(FastPath, paths)
)
for path in paths:
try:
fastpath = self._cache[path]
except KeyError:
fastpath = FastPath(path)
self._cache[path] = fastpath
yield from fastpath.search(prepared)

def invalidate_caches(self):
    """Empty the per-path cache in place."""
    cached_paths = self._cache
    cached_paths.clear()


class PathDistribution(Distribution):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re
import textwrap
import unittest
import importlib

from . import fixtures
from importlib_metadata import (
Expand Down Expand Up @@ -196,3 +197,9 @@ def test_distribution_at_str(self):
dist_info_path = self.site_dir / 'distinfo_pkg-1.0.0.dist-info'
dist = Distribution.at(str(dist_info_path))
assert dist.version == '1.0.0'


class InvalidateCache(unittest.TestCase):
    """Smoke test for cache invalidation."""

    def test_invalidate_cache(self):
        """Call ``importlib.invalidate_caches()`` and expect no error."""
        # There is no externally observable effect to assert on; the
        # call is made so the invalidation code path is covered.
        importlib.invalidate_caches()
2 changes: 1 addition & 1 deletion tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,5 @@ def test_search_dist_dirs(self):
distribution metadata dirs. Protect it for PyPA
use-cases (only). Ref python/importlib_metadata#111.
"""
res = MetadataPathFinder._search_paths('any-name', [])
res = MetadataPathFinder()._search_paths('any-name', [])
assert list(res) == []

0 comments on commit d55e743

Please sign in to comment.