bpo-28595: Allow shlex whitespace_split with punctuation_chars #2071

Merged · 2 commits · Jun 1, 2019
35 changes: 23 additions & 12 deletions Doc/library/shlex.rst
@@ -210,7 +210,8 @@ variables which either control lexical analysis or can be used for debugging:
appear in filename specifications and command line parameters, will also be
included in this attribute, and any characters which appear in
``punctuation_chars`` will be removed from ``wordchars`` if they are present
there.
there. If :attr:`whitespace_split` is set to ``True``, this will have no
effect.
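
To make the ``wordchars``/``punctuation_chars`` interaction above concrete, a quick interactive check (illustrative only, not part of this patch; the ``'~'`` argument is just an example override) could look like:

>>> import shlex
>>> s = shlex.shlex('~/a', punctuation_chars='~')
>>> '~' in s.wordchars
False
>>> list(s)
['~', '/a']

Here ``'~'`` has been stripped from the augmented ``wordchars``, so it is returned as its own token; and, as the paragraph above notes, once :attr:`whitespace_split` is set ``wordchars`` has no effect at all.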


.. attribute:: shlex.whitespace
@@ -243,11 +244,13 @@ variables which either control lexical analysis or can be used for debugging:

If ``True``, tokens will only be split in whitespaces. This is useful, for
example, for parsing command lines with :class:`~shlex.shlex`, getting
tokens in a similar way to shell arguments. If this attribute is ``True``,
:attr:`punctuation_chars` will have no effect, and splitting will happen
only on whitespaces. When using :attr:`punctuation_chars`, which is
intended to provide parsing closer to that implemented by shells, it is
advisable to leave ``whitespace_split`` as ``False`` (the default value).
tokens in a similar way to shell arguments. When used in combination with
:attr:`punctuation_chars`, tokens will be split on whitespace in addition to
those characters.

.. versionchanged:: 3.8
   The :attr:`punctuation_chars` attribute was made compatible with the
   :attr:`whitespace_split` attribute.
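
As a rough illustration of the combined behaviour described above (an example added here for clarity, not part of the committed docs):

>>> import shlex
>>> s = shlex.shlex('cat file.txt|wc -l', punctuation_chars=True)
>>> s.whitespace_split = True
>>> list(s)
['cat', 'file.txt', '|', 'wc', '-l']

Before 3.8 the same settings would have returned ``['cat', 'file.txt|wc', '-l']``, because the punctuation characters were swallowed into the whitespace-delimited token.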


.. attribute:: shlex.infile
@@ -383,12 +386,15 @@ otherwise. To illustrate, you can see the difference in the following snippet:

>>> import shlex
>>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
>>> list(shlex.shlex(text))
['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>',
"'abc'", ';', '(', 'def', '"ghi"', ')']
>>> list(shlex.shlex(text, punctuation_chars=True))
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'",
';', '(', 'def', '"ghi"', ')']
>>> s = shlex.shlex(text, posix=True)
>>> s.whitespace_split = True
>>> list(s)
['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
>>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
>>> s.whitespace_split = True
>>> list(s)
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
'(', 'def', 'ghi', ')']

Of course, tokens will be returned which are not valid for shells, and you'll
need to implement your own error checks on the returned tokens.
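
For instance, a run of punctuation characters that no shell would accept still comes back as a single token; a minimal sketch of such a check (the ``known_operators`` set below is illustrative, not exhaustive) might be:

>>> import shlex
>>> tokens = list(shlex.shlex('a &&& b', punctuation_chars=True))
>>> tokens
['a', '&&&', 'b']
>>> known_operators = {'&&', '||', '|', '&', ';', '(', ')', '<', '>'}
>>> [t for t in tokens if set(t) <= set('();<>|&') and t not in known_operators]
['&&&']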
@@ -413,6 +419,11 @@ which characters constitute punctuation. For example::
>>> list(s)
['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']

However, to match the shell as closely as possible, it is recommended to
always use ``posix`` and :attr:`~shlex.whitespace_split` when using
:attr:`~shlex.punctuation_chars`, which will negate
:attr:`~shlex.wordchars` entirely.

For best effect, ``punctuation_chars`` should be set in conjunction with
``posix=True``. (Note that ``posix=False`` is the default for
:class:`~shlex.shlex`.)
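
For example (an illustrative session, not part of the patch), the recommended combination strips quotes and splits operators much like a shell would:

>>> import shlex
>>> s = shlex.shlex('echo "hello world" && echo done', posix=True, punctuation_chars=True)
>>> s.whitespace_split = True
>>> list(s)
['echo', 'hello world', '&&', 'echo', 'done']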
3 changes: 2 additions & 1 deletion Lib/shlex.py
@@ -246,7 +246,8 @@ def read_token(self):
escapedstate = 'a'
self.state = nextchar
elif (nextchar in self.wordchars or nextchar in self.quotes
or self.whitespace_split):
or (self.whitespace_split and
nextchar not in self.punctuation_chars)):
self.token += nextchar
else:
if self.punctuation_chars:
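
The effect of the tightened condition above (a punctuation character is no longer absorbed into a whitespace-delimited token) can be seen with a quick check, illustrative only:

>>> import shlex
>>> s = shlex.shlex('a&&b', punctuation_chars=True)
>>> s.whitespace_split = True
>>> list(s)
['a', '&&', 'b']

With the old test (``or self.whitespace_split``), the whole string came back as the single token ``'a&&b'``.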
46 changes: 36 additions & 10 deletions Lib/test/test_shlex.py
@@ -1,4 +1,5 @@
import io
import itertools
import shlex
import string
import unittest
@@ -183,10 +184,12 @@ def testSyntaxSplitAmpersandAndPipe(self):
src = ['echo hi %s echo bye' % delimiter,
'echo hi%secho bye' % delimiter]
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
for ss in src:
for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = ws
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))

def testSyntaxSplitSemicolon(self):
"""Test handling of syntax splitting of ;"""
@@ -197,10 +200,12 @@ def testSyntaxSplitSemicolon(self):
'echo hi%s echo bye' % delimiter,
'echo hi%secho bye' % delimiter]
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
for ss in src:
for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = ws
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))

def testSyntaxSplitRedirect(self):
"""Test handling of syntax splitting of >"""
@@ -211,29 +216,37 @@ def testSyntaxSplitRedirect(self):
'echo hi%s out' % delimiter,
'echo hi%sout' % delimiter]
ref = ['echo', 'hi', delimiter, 'out']
for ss in src:
for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True)
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))

def testSyntaxSplitParen(self):
"""Test handling of syntax splitting of ()"""
# these should all parse to the same output
src = ['( echo hi )',
'(echo hi)']
ref = ['(', 'echo', 'hi', ')']
for ss in src:
for ss, ws in itertools.product(src, (False, True)):
s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = ws
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
self.assertEqual(ref, result,
"While splitting '%s' [ws=%s]" % (ss, ws))

def testSyntaxSplitCustom(self):
"""Test handling of syntax splitting with custom chars"""
ss = "~/a&&b-c --color=auto||d *.py?"
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
ss = "~/a && b-c --color=auto || d *.py?"
s = shlex.shlex(ss, punctuation_chars="|")
result = list(s)
self.assertEqual(ref, result, "While splitting '%s'" % ss)
self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
s = shlex.shlex(ss, punctuation_chars="|")
s.whitespace_split = True
result = list(s)
self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)

def testTokenTypes(self):
"""Test that tokens are split with types as expected."""
@@ -293,6 +306,19 @@ def testEmptyStringHandling(self):
s = shlex.shlex("'')abc", punctuation_chars=True)
self.assertEqual(list(s), expected)

def testUnicodeHandling(self):
"""Test punctuation_chars and whitespace_split handle unicode."""
ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
# Should be parsed as one complete token (whitespace_split=True).
ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
s = shlex.shlex(ss, punctuation_chars=True)
s.whitespace_split = True
self.assertEqual(list(s), ref)
# Without whitespace_split, uses wordchars and splits on all.
ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
s = shlex.shlex(ss, punctuation_chars=True)
self.assertEqual(list(s), ref)

def testQuote(self):
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s