Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[3.12] GH-105113: Improve performance of pathlib.PurePath.match() #105114

Merged
merged 1 commit into from
May 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Doc/library/pathlib.rst
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
>>> PurePath('a/b.py').match('/*.py')
False

The *pattern* may be another path object; this speeds up matching the same
pattern against multiple files::

>>> pattern = PurePath('*.py')
>>> PurePath('a/b.py').match(pattern)
True

As with other methods, case-sensitivity follows platform defaults::

>>> PurePosixPath('b.py').match('*.PY')
Expand Down
86 changes: 72 additions & 14 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,30 @@ def _ignore_error(exception):
getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)


@functools.cache
def _is_case_sensitive(flavour):
return flavour.normcase('Aa') == 'Aa'

#
# Globbing helpers
#


# fnmatch.translate() returns a regular expression that includes a prefix and
# a suffix, which enable matching newlines and ensure the end of the string is
# matched, respectively. These features are undesirable for our implementation
# of PurePatch.match(), which represents path separators as newlines and joins
# pattern segments together. As a workaround, we define a slice object that
# can remove the prefix and suffix from any translate() result. See the
# _compile_pattern_lines() function for more details.
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
_SWAP_SEP_AND_NEWLINE = {
'/': str.maketrans({'/': '\n', '\n': '/'}),
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
}


@functools.lru_cache()
def _make_selector(pattern_parts, flavour, case_sensitive):
pat = pattern_parts[0]
Expand Down Expand Up @@ -92,6 +109,38 @@ def _compile_pattern(pat, case_sensitive):
return re.compile(fnmatch.translate(pat), flags).match


@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
"""Compile the given pattern lines to an `re.Pattern` object.

The *pattern_lines* argument is a glob-style pattern (e.g. '*/*.py') with
its path separators and newlines swapped (e.g. '*\n*.py`). By using
newlines to separate path components, and not setting `re.DOTALL`, we
ensure that the `*` wildcard cannot match path separators.

The returned `re.Pattern` object may have its `match()` method called to
match a complete pattern, or `search()` to match from the right. The
argument supplied to these methods must also have its path separators and
newlines swapped.
"""

# Match the start of the path, or just after a path separator
parts = ['^']
for part in pattern_lines.splitlines(keepends=True):
# We slice off the common prefix and suffix added by translate() to
# ensure that re.DOTALL is not set, and the end of the string not
# matched, respectively. With DOTALL not set, '*' wildcards will not
# match path separators, because the '.' characters in the pattern
# will not match newlines.
parts.append(fnmatch.translate(part)[_FNMATCH_SLICE])
# Match the end of the path, always.
parts.append(r'\Z')
flags = re.MULTILINE
if not case_sensitive:
flags |= re.IGNORECASE
return re.compile(''.join(parts), flags=flags)


class _Selector:
"""A selector matches a specific glob pattern part against the children
of a given path."""
Expand Down Expand Up @@ -274,6 +323,10 @@ class PurePath(object):
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',

# The `_lines_cached` slot stores the string path with path separators
# and newlines swapped. This is used to implement `match()`.
'_lines_cached',

# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
Expand Down Expand Up @@ -439,6 +492,16 @@ def _parts_normcase(self):
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached

@property
def _lines(self):
# Path with separators and newlines swapped, for pattern matching.
try:
return self._lines_cached
except AttributeError:
trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
self._lines_cached = str(self).translate(trans)
return self._lines_cached

def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
Expand Down Expand Up @@ -695,23 +758,18 @@ def match(self, path_pattern, *, case_sensitive=None):
"""
Return True if this path matches the given pattern.
"""
if not isinstance(path_pattern, PurePath):
path_pattern = self.with_segments(path_pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self._flavour)
pat = self.with_segments(path_pattern)
if not pat.parts:
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
if path_pattern.drive or path_pattern.root:
return pattern.match(self._lines) is not None
elif path_pattern._tail:
return pattern.search(self._lines) is not None
else:
raise ValueError("empty pattern")
pat_parts = pat.parts
parts = self.parts
if pat.drive or pat.root:
if len(pat_parts) != len(parts):
return False
elif len(pat_parts) > len(parts):
return False
for part, pat in zip(reversed(parts), reversed(pat_parts)):
match = _compile_pattern(pat, case_sensitive)
if not match(part):
return False
return True


# Can't subclass os.PathLike from PurePath and keep the constructor
# optimizations in PurePath.__slots__.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve performance of :meth:`pathlib.PurePath.match` by compiling an
:class:`re.Pattern` object for the entire pattern.