Skip to content

Commit

Permalink
Move to glob.translate()
Browse files Browse the repository at this point in the history
  • Loading branch information
barneygale committed Aug 12, 2023
1 parent 51f2698 commit 9c8c3f3
Show file tree
Hide file tree
Showing 8 changed files with 197 additions and 69 deletions.
18 changes: 1 addition & 17 deletions Doc/library/fnmatch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`,
``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently.


.. function:: translate(pattern, sep=None)
.. function:: translate(pattern)

Return the shell-style *pattern* converted to a regular expression for
use with :func:`re.match`.
Expand All @@ -98,22 +98,6 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`,
>>> reobj.match('foobar.txt')
<re.Match object; span=(0, 10), match='foobar.txt'>

A path separator character may be supplied to the *sep* argument. If given,
the separator is used to split the pattern into segments, where:

- A ``*`` pattern segment matches precisely one path segment.
- A ``**`` pattern segment matches any number of path segments.
- If ``**`` appears in any other position within the pattern,
:exc:`ValueError` is raised.
- ``*`` and ``?`` wildcards in other positions don't match path separators.

These rules approximate shell recursive globbing. The :mod:`pathlib` module
calls this function and supplies *sep* to implement
:meth:`~pathlib.PurePath.match` and :meth:`~pathlib.Path.glob`.

.. versionchanged:: 3.13
The *sep* parameter was added.


.. seealso::

Expand Down
41 changes: 41 additions & 0 deletions Doc/library/glob.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,47 @@ default. For example, consider a directory containing :file:`card.gif` and
>>> glob.glob('.c*')
['.card.gif']


.. function:: translate(pathname, *, recursive=False, seps=None)

Convert the given path specification to a regular expression for use with
:func:`re.match`. The path specification can contain shell-style wildcards.

For example:

>>> import glob, re
>>>
>>> regex = glob.translate('**/*.txt', recursive=True)
>>> regex
'(?s:(?:.*/)?[^/]*\\.txt)\\Z'
>>> reobj = re.compile(regex)
>>> reobj.match('foo/bar/baz.txt')
<re.Match object; span=(0, 15), match='foo/bar/baz.txt'>

Path separators and segments are meaningful to this function, unlike
:func:`fnmatch.translate`. By default wildcards do not match path
separators, and ``*`` pattern segments match precisely one path segment.

If *recursive* is true, the pattern segment "``**``" will match any number
of path segments. If "``**``" occurs in any position other than a full
pattern segment, :exc:`ValueError` is raised.

A sequence of path separators may be supplied to the *seps* argument. If
not given, :data:`os.sep` and :data:`os.altsep` (if available) are used.

.. note::

Filenames that begin with a dot (``.``) are matched by wildcards, unlike
:func:`glob`.

.. seealso::

:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods,
which call this function to implement pattern matching and globbing.

.. versionadded:: 3.13


.. seealso::

Module :mod:`fnmatch`
Expand Down
38 changes: 4 additions & 34 deletions Lib/fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,24 +71,13 @@ def fnmatchcase(name, pat):
return match(name) is not None


def translate(pat, sep=None):
def translate(pat):
"""Translate a shell PATTERN to a regular expression.
A path separator character may be supplied to the *sep* argument. If
given, '*' and '?' wildcards will not match separators; '*' wildcards in
standalone pattern segments match precisely one path segment; and '**'
wildcards in standalone segments match any number of path segments.
There is no way to quote meta-characters.
"""

STAR = object()
if sep:
SEP = re.escape(sep)
DOT = f'[^{SEP}]'
else:
SEP = None
DOT = '.'
res = []
add = res.append
i, n = 0, len(pat)
Expand All @@ -97,29 +86,10 @@ def translate(pat, sep=None):
i = i+1
if c == '*':
# compress consecutive `*` into one
h = i-1
while i < n and pat[i] == '*':
i = i+1
if sep:
star_count = i-h
is_segment = (h == 0 or pat[h-1] == sep) and (i == n or pat[i] == sep)
if star_count == 1:
if is_segment:
add(f'{DOT}+')
else:
add(f'{DOT}*')
elif star_count == 2 and is_segment:
if i == n:
add('.*')
else:
add(f'(.*{SEP})?')
i = i+1
else:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
if (not res) or res[-1] is not STAR:
add(STAR)
elif c == '?':
add(DOT)
add('.')
elif c == '[':
j = i
if j < n and pat[j] == '!':
Expand Down Expand Up @@ -166,7 +136,7 @@ def translate(pat, sep=None):
add('(?!)')
elif stuff == '!':
# Negated empty range: match any character.
add(DOT)
add('.')
else:
if stuff[0] == '!':
stuff = '^' + stuff[1:]
Expand Down
111 changes: 111 additions & 0 deletions Lib/glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,3 +249,114 @@ def escape(pathname):


_dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)


def translate(pat, *, recursive=False, seps=None):
    """Convert a pathname pattern with shell wildcards to a regex string.

    Path separators are significant: the '*' and '?' wildcards never match
    a separator, and a '*' that forms a whole pattern segment matches
    exactly one non-empty path segment.

    If `recursive` is true, a '**' pattern segment matches any number of
    path segments, and ValueError is raised when two or more consecutive
    asterisks do not form exactly a complete '**' segment.  If `recursive`
    is false, any run of asterisks behaves like a single '*'.

    `seps` is a sequence of separator characters used both to split the
    pattern into segments and to match separators in paths.  When it is
    empty or None, os.path.sep and os.path.altsep (where set) are used.

    Names beginning with a dot ('.') get no special treatment here: they
    are matched by wildcards, unlike in glob().
    """
    if not seps:
        seps = [os.path.sep, os.path.altsep] if os.path.altsep else os.path.sep
    escaped_seps = ''.join(map(re.escape, seps))
    if len(seps) > 1:
        any_sep = f'[{escaped_seps}]'
    else:
        any_sep = escaped_seps
    not_sep = f'[^{escaped_seps}]'

    parts = []
    pos, end = 0, len(pat)
    while pos < end:
        char = pat[pos]
        pos += 1
        if char == '*':
            # Swallow the whole run of asterisks and remember where it began.
            start = pos - 1
            while pos < end and pat[pos] == '*':
                pos += 1
            star_count = pos - start
            # The run is a full segment when it is bounded on both sides by
            # a separator or by the edge of the pattern.
            whole_segment = ((start == 0 or pat[start - 1] in seps)
                             and (pos == end or pat[pos] in seps))
            if not recursive or star_count == 1:
                parts.append(f'{not_sep}+' if whole_segment else f'{not_sep}*')
            elif star_count == 2 and whole_segment:
                if pos == end:
                    parts.append('.*')
                else:
                    # The optional group already includes the separator
                    # that follows '**', so step over it in the pattern.
                    parts.append(f'(?:.*{any_sep})?')
                    pos += 1
            else:
                raise ValueError("Invalid pattern: '**' can only be an entire path component")
        elif char in seps:
            parts.append(any_sep)
        elif char == '?':
            parts.append(not_sep)
        elif char == '[':
            # Locate the closing bracket.  A leading '!' and a ']' placed
            # first in the set are part of the set, not its terminator.
            close = pos
            if close < end and pat[close] == '!':
                close += 1
            if close < end and pat[close] == ']':
                close += 1
            close = pat.find(']', close)
            if close < 0:
                # Unterminated set: treat the '[' as a literal character.
                parts.append('\\[')
                continue
            body = pat[pos:close]
            if '-' not in body:
                body = body.replace('\\', r'\\')
            else:
                # Split the set at the hyphens that form ranges.
                chunks = []
                hyphen = pos + 2 if pat[pos] == '!' else pos + 1
                while True:
                    hyphen = pat.find('-', hyphen, close)
                    if hyphen < 0:
                        break
                    chunks.append(pat[pos:hyphen])
                    pos = hyphen + 1
                    hyphen += 3
                tail = pat[pos:close]
                if tail:
                    chunks.append(tail)
                else:
                    chunks[-1] += '-'
                # Drop empty ranges (e.g. 'z-a'), which are invalid in RE.
                for idx in range(len(chunks) - 1, 0, -1):
                    if chunks[idx - 1][-1] > chunks[idx][0]:
                        chunks[idx - 1] = chunks[idx - 1][:-1] + chunks[idx][1:]
                        del chunks[idx]
                # Escape backslashes, and hyphens that would otherwise read
                # as set difference (--); range hyphens stay unescaped.
                body = '-'.join(
                    chunk.replace('\\', r'\\').replace('-', r'\-')
                    for chunk in chunks
                )
            # Escape the characters used by set operations (&&, ~~ and ||).
            body = re.sub(r'([&~|])', r'\\\1', body)
            pos = close + 1
            if not body:
                # Empty set: can never match.
                parts.append('(?!)')
            elif body == '!':
                # Negated empty set: any single non-separator character.
                parts.append(not_sep)
            else:
                if body[0] == '!':
                    body = '^' + body[1:]
                elif body[0] in ('^', '['):
                    body = '\\' + body
                parts.append(f'[{body}]')
        else:
            parts.append(re.escape(char))
    regex = ''.join(parts)
    return fr'(?s:{regex})\Z'
5 changes: 3 additions & 2 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
operating systems.
"""

import fnmatch
import functools
import glob
import io
import ntpath
import os
Expand Down Expand Up @@ -69,7 +69,8 @@ def _compile_pattern(pat, sep, case_sensitive):
"""Compile given glob pattern to a re.Pattern object (observing case
sensitivity)."""
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
return re.compile(fnmatch.translate(pat, sep), flags).match
regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep)
return re.compile(regex, flags).match


def _select_children(parent_paths, dir_only, follow_symlinks, match):
Expand Down
12 changes: 0 additions & 12 deletions Lib/test/test_fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,18 +250,6 @@ def test_translate(self):
self.assertTrue(re.match(fatre, 'cbabcaxc'))
self.assertFalse(re.match(fatre, 'dabccbad'))

def test_translate_sep(self):
self.assertEqual(translate('*', sep='/'), r'(?s:[^/]+)\Z')
self.assertEqual(translate('?', sep='/'), r'(?s:[^/])\Z')
self.assertEqual(translate('a?b*', sep='/'), r'(?s:a[^/]b[^/]*)\Z')
self.assertEqual(translate('/**/*/*.*/**', sep='/'),
r'(?s:/(.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z')
self.assertEqual(translate(r'\**\*\*.*\**', sep='\\'),
r'(?s:\\(.*\\)?[^\\]+\\[^\\]*\.[^\\]*\\.*)\Z')
self.assertRaises(ValueError, translate, 'a**', sep='/')
self.assertRaises(ValueError, translate, '**b', sep='/')


class FilterTestCase(unittest.TestCase):

def test_filter(self):
Expand Down
35 changes: 35 additions & 0 deletions Lib/test/test_glob.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,41 @@ def test_glob_many_open_files(self):
for it in iters:
self.assertEqual(next(it), p)

def test_translate(self):
    # Non-recursive translation with '/' as the only separator: any run of
    # asterisks is treated like a single '*'.
    def to_regex(pattern):
        return glob.translate(pattern, seps='/')

    expectations = [
        ('foo', r'(?s:foo)\Z'),
        ('foo/bar', r'(?s:foo/bar)\Z'),
        ('*', r'(?s:[^/]+)\Z'),
        ('?', r'(?s:[^/])\Z'),
        ('a*', r'(?s:a[^/]*)\Z'),
        ('*a', r'(?s:[^/]*a)\Z'),
        ('.*', r'(?s:\.[^/]*)\Z'),
        ('?aa', r'(?s:[^/]aa)\Z'),
        ('aa?', r'(?s:aa[^/])\Z'),
        ('aa[ab]', r'(?s:aa[ab])\Z'),
        ('**', r'(?s:[^/]+)\Z'),
        ('***', r'(?s:[^/]+)\Z'),
        ('a**', r'(?s:a[^/]*)\Z'),
        ('**b', r'(?s:[^/]*b)\Z'),
        ('/**/*/*.*/**', r'(?s:/[^/]+/[^/]+/[^/]*\.[^/]*/[^/]+)\Z'),
    ]
    for pattern, expected in expectations:
        self.assertEqual(to_regex(pattern), expected)

def test_translate_recursive(self):
    # Recursive translation with '/' as the only separator: '**' must be a
    # complete segment, anything else with repeated asterisks is rejected.
    def to_regex(pattern):
        return glob.translate(pattern, recursive=True, seps='/')

    valid = [
        ('*', r'(?s:[^/]+)\Z'),
        ('?', r'(?s:[^/])\Z'),
        ('**', r'(?s:.*)\Z'),
    ]
    for pattern, expected in valid:
        self.assertEqual(to_regex(pattern), expected)
    for pattern in ('***', 'a**', '**b'):
        with self.assertRaises(ValueError):
            to_regex(pattern)
    self.assertEqual(to_regex('/**/*/*.*/**'),
                     r'(?s:/(?:.*/)?[^/]+/[^/]*\.[^/]*/.*)\Z')

def test_translate_seps(self):
    # With two separators, each separator in the pattern becomes a character
    # class that accepts either one.
    def to_regex(pattern):
        return glob.translate(pattern, recursive=True, seps=['/', '\\'])

    expectations = [
        ('foo/bar\\baz', r'(?s:foo[/\\]bar[/\\]baz)\Z'),
        ('**/**', r'(?s:(?:.*[/\\])?.*)\Z'),
    ]
    for pattern, expected in expectations:
        self.assertEqual(to_regex(pattern), expected)

@skip_unless_symlink
class SymlinkLoopGlobTests(unittest.TestCase):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
Add optional *sep* argument to :func:`fnmatch.translate`. If a path separator
character is given, the resulting pattern matches paths like
:meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob`. For example, the
``*`` wildcard will not match path separators.
Add :func:`glob.translate`. This function converts a pathname with shell-style
wildcards to a regular expression.

0 comments on commit 9c8c3f3

Please sign in to comment.