Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More speedup via mtime-base caching. #274

Merged
merged 2 commits into from
Mar 27, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 51 additions & 39 deletions importlib_metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,9 +600,15 @@ class FastPath:
children.
"""

def __init__(self, root):
@functools.lru_cache() # type: ignore
def __new__(cls, root):
self = object().__new__(cls)
self.root = str(root)
self.base = os.path.basename(self.root).lower()
self.last_mtime = -1
self.infos = {}
self.eggs = {}
return self

def joinpath(self, child):
return pathlib.Path(self.root, child)
Expand All @@ -618,15 +624,47 @@ def zip_children(self):
zip_path = zipp.Path(self.root)
names = zip_path.root.namelist()
self.joinpath = zip_path.joinpath

return dict.fromkeys(child.split(posixpath.sep, 1)[0] for child in names)

def search(self, name):
return (
self.joinpath(child)
for child in self.children()
if name.matches(child, self.base)
)
def update_cache(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not too happy that a lot of essential behavior ("matches", "is_egg", "prepared computations") has been inlined into an "update_cache" function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's because there's no more notion of "prepared computations" (because we just parse everything and store the normalized results into the cache). "Matches" is basically just a dict lookup now (in search()).

I guess at best I could have a def is_info(self, low): return low.endswith(...) and def is_egg(self, low): return self.base_is_egg and low == "egg-info". I don't think adding these levels of indirection help legibility, but that's up to you.

I can't think of further factoring, though.

root = self.root or "."
try:
mtime = os.stat(root).st_mtime
except OSError:
self.infos.clear()
self.eggs.clear()
self.last_mtime = -1
return
if mtime == self.last_mtime:
return
self.infos.clear()
self.eggs.clear()
base_is_egg = self.base.endswith(".egg")
for child in self.children():
low = child.lower()
if low.endswith((".dist-info", ".egg-info")):
# rpartition is faster than splitext and suitable for this purpose.
name = low.rpartition(".")[0].partition("-")[0]
normalized = Prepared.normalize(name)
self.infos.setdefault(normalized, []).append(child)
elif base_is_egg and low == "egg-info":
name = self.base.rpartition(".")[0].partition("-")[0]
legacy_normalized = Prepared.legacy_normalize(name)
self.eggs.setdefault(legacy_normalized, []).append(child)
self.last_mtime = mtime

def search(self, prepared):
self.update_cache()
if prepared.name:
infos = self.infos.get(prepared.normalized, [])
yield from map(self.joinpath, infos)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm seeing a lot of repetition and branching. I'd want to see this unified.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can write something like

    def search(self, prepared):
        self.update_cache()
        infos_and_eggs = itertools.chain(
            self.infos.get(prepared.normalized, []) if prepared.name else
            itertools.chain.from_iterable(self.infos.values()),
            self.eggs.get(prepared.legacy_normalized, []) if prepared.name else
            itertools.chain.from_iterable(self.eggs.values()))
        yield from map(self.joinpath, infos_and_eggs)

(or lift the if prepared.name out) but that seems less legible IMHO...

eggs = self.eggs.get(prepared.legacy_normalized, [])
yield from map(self.joinpath, eggs)
else:
for infos in self.infos.values():
yield from map(self.joinpath, infos)
for eggs in self.eggs.values():
yield from map(self.joinpath, eggs)


class Prepared:
jaraco marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -635,22 +673,14 @@ class Prepared:
"""

normalized = None
suffixes = 'dist-info', 'egg-info'
exact_matches = [''][:0]
egg_prefix = ''
versionless_egg_name = ''
legacy_normalized = None

def __init__(self, name):
self.name = name
if name is None:
return
self.normalized = self.normalize(name)
self.exact_matches = [
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did the behavior for exact matches go?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

low.rpartition(".")[0].partition("-")[0] handles both exact and inexact matches (for exact (versionless) matches, .partition("-")[0] will be a noop).

self.normalized + '.' + suffix for suffix in self.suffixes
]
legacy_normalized = self.legacy_normalize(self.name)
self.egg_prefix = legacy_normalized + '-'
self.versionless_egg_name = legacy_normalized + '.egg'
self.legacy_normalized = self.legacy_normalize(name)

@staticmethod
def normalize(name):
Expand All @@ -667,27 +697,6 @@ def legacy_normalize(name):
"""
return name.lower().replace('-', '_')

def matches(self, cand, base):
low = cand.lower()
# rpartition is faster than splitext and suitable for this purpose.
pre, _, ext = low.rpartition('.')
name, _, rest = pre.partition('-')
return (
low in self.exact_matches
or ext in self.suffixes
and (not self.normalized or name.replace('.', '_') == self.normalized)
# legacy case:
or self.is_egg(base)
and low == 'egg-info'
)

def is_egg(self, base):
return (
base == self.versionless_egg_name
or base.startswith(self.egg_prefix)
and base.endswith('.egg')
)


@install
class MetadataPathFinder(NullFinder, DistributionFinder):
Expand Down Expand Up @@ -717,6 +726,9 @@ def _search_paths(cls, name, paths):
path.search(prepared) for path in map(FastPath, paths)
)

def invalidate_caches(cls):
FastPath.__new__.cache_clear()


class PathDistribution(Distribution):
def __init__(self, path):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import textwrap
import unittest
import warnings
import importlib

from . import fixtures
from importlib_metadata import (
Expand Down Expand Up @@ -275,3 +276,9 @@ def test_distribution_at_str(self):
dist_info_path = self.site_dir / 'distinfo_pkg-1.0.0.dist-info'
dist = Distribution.at(str(dist_info_path))
assert dist.version == '1.0.0'


class InvalidateCache(unittest.TestCase):
def test_invalidate_cache(self):
# No externally observable behavior, but ensures test coverage...
importlib.invalidate_caches()