From 9c8c3f3cb6f53c0d48cadeef4f2d98a5c1958b3a Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 11 Aug 2023 23:39:17 +0100 Subject: [PATCH] Move to `glob.translate()` --- Doc/library/fnmatch.rst | 18 +-- Doc/library/glob.rst | 41 +++++++ Lib/fnmatch.py | 38 +----- Lib/glob.py | 111 ++++++++++++++++++ Lib/pathlib.py | 5 +- Lib/test/test_fnmatch.py | 12 -- Lib/test/test_glob.py | 35 ++++++ ...3-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst | 6 +- 8 files changed, 197 insertions(+), 69 deletions(-) diff --git a/Doc/library/fnmatch.rst b/Doc/library/fnmatch.rst index a0778960d39cea..aed8991d44772f 100644 --- a/Doc/library/fnmatch.rst +++ b/Doc/library/fnmatch.rst @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, ``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently. -.. function:: translate(pattern, sep=None) +.. function:: translate(pattern) Return the shell-style *pattern* converted to a regular expression for using with :func:`re.match`. @@ -98,22 +98,6 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`, >>> reobj.match('foobar.txt') <re.Match object; span=(0, 10), match='foobar.txt'> - A path separator character may be supplied to the *sep* argument. If given, - the separator is used to split the pattern into segments, where: - - - A ``*`` pattern segment matches precisely one path segment. - - A ``**`` pattern segment matches any number of path segments. - - If ``**`` appears in any other position within the pattern, - :exc:`ValueError` is raised. - - ``*`` and ``?`` wildcards in other positions don't match path separators. - - These rules approximate shell recursive globbing. The :mod:`pathlib` module - calls this function and supplies *sep* to implement - :meth:`~pathlib.PurePath.match` and :meth:`~pathlib.Path.glob`. - - .. versionchanged:: 3.13 - The *sep* parameter was added. - ..
seealso:: diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst index 0e4cfe7ebed797..9a8d8f97da8a52 100644 --- a/Doc/library/glob.rst +++ b/Doc/library/glob.rst @@ -145,6 +145,47 @@ default. For example, consider a directory containing :file:`card.gif` and >>> glob.glob('.c*') ['.card.gif'] + +.. function:: translate(pathname, *, recursive=False, seps=None) + + Convert the given path specification to a regular expression for use with + :func:`re.match`. The path specification can contain shell-style wildcards. + + For example: + + >>> import glob, re + >>> + >>> regex = glob.translate('**/*.txt', recursive=True) + >>> regex + '(?s:(?:.*/)?[^/]*\\.txt)\\Z' + >>> reobj = re.compile(regex) + >>> reobj.match('foo/bar/baz.txt') + <re.Match object; span=(0, 15), match='foo/bar/baz.txt'> + + Path separators and segments are meaningful to this function, unlike + :func:`fnmatch.translate`. By default wildcards do not match path + separators, and ``*`` pattern segments match precisely one path segment. + + If *recursive* is true, the pattern segment "``**``" will match any number + of path segments. If "``**``" occurs in any position other than a full + pattern segment, :exc:`ValueError` is raised. + + A sequence of path separators may be supplied to the *seps* argument. If + not given, :data:`os.sep` and :data:`~os.altsep` (if available) are used. + + .. note:: + + Filenames that begin with a dot (``.``) are matched by wildcards, unlike + :func:`glob`. + + .. seealso:: + + :meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods, + which call this function to implement pattern matching and globbing. + + .. versionadded:: 3.13 + + .. seealso:: Module :mod:`fnmatch` diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index a08b8d7bdc2e1f..d5e296f7748c1c 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -71,24 +71,13 @@ def fnmatchcase(name, pat): return match(name) is not None -def translate(pat, sep=None): +def translate(pat): """Translate a shell PATTERN to a regular expression.
- A path separator character may be supplied to the *sep* argument. If - given, '*' and '?' wildcards will not match separators; '*' wildcards in - standalone pattern segments match precisely one path segment; and '**' - wildcards in standalone segments match any number of path segments. - There is no way to quote meta-characters. """ STAR = object() - if sep: - SEP = re.escape(sep) - DOT = f'[^{SEP}]' - else: - SEP = None - DOT = '.' res = [] add = res.append i, n = 0, len(pat) @@ -97,29 +86,10 @@ def translate(pat, sep=None): i = i+1 if c == '*': # compress consecutive `*` into one - h = i-1 - while i < n and pat[i] == '*': - i = i+1 - if sep: - star_count = i-h - is_segment = (h == 0 or pat[h-1] == sep) and (i == n or pat[i] == sep) - if star_count == 1: - if is_segment: - add(f'{DOT}+') - else: - add(f'{DOT}*') - elif star_count == 2 and is_segment: - if i == n: - add('.*') - else: - add(f'(.*{SEP})?') - i = i+1 - else: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: + if (not res) or res[-1] is not STAR: add(STAR) elif c == '?': - add(DOT) + add('.') elif c == '[': j = i if j < n and pat[j] == '!': @@ -166,7 +136,7 @@ def translate(pat, sep=None): add('(?!)') elif stuff == '!': # Negated empty range: match any character. - add(DOT) + add('.') else: if stuff[0] == '!': stuff = '^' + stuff[1:] diff --git a/Lib/glob.py b/Lib/glob.py index a7256422d520fb..59110865f182cb 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -249,3 +249,114 @@ def escape(pathname): _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0) + + +def translate(pat, *, recursive=False, seps=None): + """Translate a pathname with shell wildcards to a regular expression. + + If `recursive` is true, the pattern segment '**' will match any number of + path segments; if '**' appears outside its own segment, ValueError will be + raised. 
+ + If a sequence of separator characters is given to `seps`, they will be + used to split the pattern into segments and match path separators. If not + given, os.path.sep and os.path.altsep (where available) are used. + + Filenames beginning with a dot ('.') are NOT special in this function; they + are matched by wildcards, unlike in glob(). + """ + if not seps: + if os.path.altsep: + seps = [os.path.sep, os.path.altsep] + else: + seps = os.path.sep + escaped_seps = ''.join(re.escape(sep) for sep in seps) + any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps + not_sep = f'[^{escaped_seps}]' + res = [] + add = res.append + i, n = 0, len(pat) + while i < n: + c = pat[i] + i = i+1 + if c == '*': + # compress consecutive `*` into one + h = i-1 + while i < n and pat[i] == '*': + i = i+1 + star_count = i-h + is_segment = (h == 0 or pat[h-1] in seps) and (i == n or pat[i] in seps) + if star_count == 1 or not recursive: + if is_segment: + add(f'{not_sep}+') + else: + add(f'{not_sep}*') + elif star_count == 2 and is_segment: + if i == n: + add('.*') + else: + add(f'(?:.*{any_sep})?') + i = i+1 + else: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + elif c in seps: + add(any_sep) + elif c == '?': + add(not_sep) + elif c == '[': + j = i + if j < n and pat[j] == '!': + j = j+1 + if j < n and pat[j] == ']': + j = j+1 + while j < n and pat[j] != ']': + j = j+1 + if j >= n: + add('\\[') + else: + stuff = pat[i:j] + if '-' not in stuff: + stuff = stuff.replace('\\', r'\\') + else: + chunks = [] + k = i+2 if pat[i] == '!' else i+1 + while True: + k = pat.find('-', k, j) + if k < 0: + break + chunks.append(pat[i:k]) + i = k+1 + k = k+3 + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE.
+ for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] + # Escape backslashes and hyphens for set difference (--). + # Hyphens that create ranges shouldn't be escaped. + stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') + for s in chunks) + # Escape set operations (&&, ~~ and ||). + stuff = re.sub(r'([&~|])', r'\\\1', stuff) + i = j+1 + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add(not_sep) + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') + else: + add(re.escape(c)) + res = "".join(res) + return fr'(?s:{res})\Z' diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 3049cfe18e069a..5add4a2d40a3e9 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -5,8 +5,8 @@ operating systems. """ -import fnmatch import functools +import glob import io import ntpath import os @@ -69,7 +69,8 @@ def _compile_pattern(pat, sep, case_sensitive): """Compile given glob pattern to a re.Pattern object (observing case sensitivity).""" flags = re.NOFLAG if case_sensitive else re.IGNORECASE - return re.compile(fnmatch.translate(pat, sep), flags).match + regex = glob.translate(pat, recursive=True, seps=sep) + return re.compile(regex, flags).match def _select_children(parent_paths, dir_only, follow_symlinks, match): diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index db6b7e0bc94237..10ed496d4e2f37 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -250,18 +250,6 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) - def test_translate_sep(self): - self.assertEqual(translate('*', sep='/'), r'(?s:[^/]+)\Z') - self.assertEqual(translate('?', sep='/'), r'(?s:[^/])\Z') - self.assertEqual(translate('a?b*', sep='/'),
r'(?s:a[^/]b[^/]*)\Z') - self.assertEqual(translate('/**/*/*.*/**', sep='/'), - r'(?s:/(.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z') - self.assertEqual(translate(r'\**\*\*.*\**', sep='\\'), - r'(?s:\\(.*\\)?[^\\]+\\[^\\]*\.[^\\]*\\.*)\Z') - self.assertRaises(ValueError, translate, 'a**', sep='/') - self.assertRaises(ValueError, translate, '**b', sep='/') - - class FilterTestCase(unittest.TestCase): def test_filter(self): diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py index f4b5821f408cb4..85f82eb6fdd405 100644 --- a/Lib/test/test_glob.py +++ b/Lib/test/test_glob.py @@ -349,6 +349,41 @@ def test_glob_many_open_files(self): for it in iters: self.assertEqual(next(it), p) + def test_translate(self): + def fn(pat): + return glob.translate(pat, seps='/') + self.assertEqual(fn('foo'), r'(?s:foo)\Z') + self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z') + self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('?'), r'(?s:[^/])\Z') + self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('*a'), r'(?s:[^/]*a)\Z') + self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z') + self.assertEqual(fn('?aa'), r'(?s:[^/]aa)\Z') + self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z') + self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z') + self.assertEqual(fn('**'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('***'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z') + self.assertEqual(fn('**b'), r'(?s:[^/]*b)\Z') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/[^/]+/[^/]+/[^/]*\.[^/]*/[^/]+)\Z') + + def test_translate_recursive(self): + def fn(pat): + return glob.translate(pat, recursive=True, seps='/') + self.assertEqual(fn('*'), r'(?s:[^/]+)\Z') + self.assertEqual(fn('?'), r'(?s:[^/])\Z') + self.assertEqual(fn('**'), r'(?s:.*)\Z') + self.assertRaises(ValueError, fn, '***') + self.assertRaises(ValueError, fn, 'a**') + self.assertRaises(ValueError, fn, '**b') + self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/(?:.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z') + + def test_translate_seps(self): + def 
fn(pat): + return glob.translate(pat, recursive=True, seps=['/', '\\']) + self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z') + self.assertEqual(fn('**/**'), r'(?s:(?:.*[/\\])?.*)\Z') @skip_unless_symlink class SymlinkLoopGlobTests(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst index 7be11c7fd86cc3..edc8ab07bb06b3 100644 --- a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst +++ b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst @@ -1,4 +1,2 @@ -Add optional *sep* argument to :func:`fnmatch.translate`. If a path separator -character is given, the resulting pattern matches paths like -:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob`. For example, the -``*`` wildcard will not match path separators. +Add :func:`glob.translate`. This function converts a pathname with shell-style +wildcards to a regular expression.