Skip to content

Commit

Permalink
Todo list is done! 100% coverage!
Browse files Browse the repository at this point in the history
  • Loading branch information
larryhastings committed Sep 1, 2024
1 parent 1dfff9d commit f01cb86
Show file tree
Hide file tree
Showing 2 changed files with 295 additions and 70 deletions.
131 changes: 94 additions & 37 deletions big/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,13 +245,34 @@ def encode_strings(o, encoding='ascii'):
# False
# >>> is_newline_byte(b'\v')
# False
#
# however! with defensive programming, in case this changes in the future
# (as it should!), big will automatically still agree with Python.
#
# p.s. you have to put characters around the linebreak character,
# because str.splitlines (and bytes.splitlines) rstrips the linebreak
# characters before it splits, sigh.

_export_name('bytes_linebreaks')
bytes_linebreaks = (
b'\n' , # 10 0x000a - newline
)

if len(b'x\vx'.splitlines()) == 2: # pragma: nocover
bytes_linebreaks += (
'\v' , # 11 - 0x000b - vertical tab
)

if len(b'x\fx'.splitlines()) == 2: # pragma: nocover
bytes_linebreaks += (
'\f' , # 12 - 0x000c - form feed
)

bytes_linebreaks += (
b'\r' , # 13 0x000d - carriage return
b'\r\n' , # bonus! the classic DOS newline sequence!
)

_export_name('bytes_linebreaks_without_crlf')
bytes_linebreaks_without_crlf = tuple(s for s in bytes_linebreaks if s != b'\r\n')

Expand Down Expand Up @@ -2010,6 +2031,13 @@ def old_split_quoted_strings(s, quotes=None, *, triple_quotes=True, backslash=No
## One added benefit of this approach: it works on both str and bytes objects, you don't need to
## handle them separately.
##
## Update: OOOOPS! s.splitlines() implicitly does an s.rstrip(newline-characters) before splitting!
## Hooray for special cases breaking the rules!
##
## So now I have to do this more complicated version:
## contains_newlines = (len( s.splitlines() ) > 1) or (len( ( s[-1:] + 'x' ').splitlines() ) > 1)
## (Why the colon in [-1:] ? So it works on bytes strings. yes, we also have to use b'x' then.)
##

_sqs_quotes_str = ( '"', "'")
_sqs_quotes_bytes = (b'"', b"'")
Expand All @@ -2018,7 +2046,7 @@ def old_split_quoted_strings(s, quotes=None, *, triple_quotes=True, backslash=No
_sqs_escape_bytes = b'\\'


def split_quoted_strings(s, separators, all_quotes_set, quotes, multiline_quotes, empty, state):
def split_quoted_strings(s, separators, all_quotes_set, quotes, multiline_quotes, empty, laden, state):
"""
This is the generator function implementing the split_quoted_strings
iterator. The public split_quoted_strings analyzes its arguments,
Expand Down Expand Up @@ -2055,7 +2083,7 @@ def split_quoted_strings(s, separators, all_quotes_set, quotes, multiline_quotes
if quote or text:
if quote and text and (quote not in multiline_quotes):
# see treatise above
if len(text.splitlines()) > 1:
if (len(text.splitlines()) > 1) or (len( (text[-1:] + laden).splitlines()) > 1):
raise SyntaxError("unterminated quoted string, {s!r}")
if state:
state = None
Expand All @@ -2072,7 +2100,7 @@ def split_quoted_strings(s, separators, all_quotes_set, quotes, multiline_quotes
if text or quote:
if quote and text and (quote not in multiline_quotes):
# see treatise above
if len(text.splitlines()) > 1:
if (len(text.splitlines()) > 1) or (len( (text[-1:] + laden).splitlines()) > 1):
raise SyntaxError("unterminated quoted string, {s!r}")
if state:
state = None
Expand Down Expand Up @@ -2102,9 +2130,12 @@ def split_quoted_strings(s, quotes=_sqs_quotes_str, *, escape=_sqs_escape_str, m
quotes is an iterable of unique quote delimiters.
Quote delimiters may be any string of 1 or more characters.
They must be the same type as s, either str or bytes.
By default, quotes is ('"', "'"). (If s is bytes,
quotes defaults to (b'"', b"'").) Text delimited
inside quotes must not contain a newline.
When one of these quote delimiters is encountered in s,
it begins a quoted section, which only ends at the
next occurance of that quote delimiter. By default,
quotes is ('"', "'"). (If s is bytes, quotes defaults
to (b'"', b"'").) Text delimited inside quotes must
not contain a newline.
escape is a string of any length. If escape is not
an empty string, the string will "escape" (quote)
Expand Down Expand Up @@ -2149,9 +2180,13 @@ def split_quoted_strings(s, quotes=_sqs_quotes_str, *, escape=_sqs_escape_str, m
if s ends with an unterminated string. In that
case, the last tuple yielded will have a non-empty
leading_quote and an empty trailing_quote.
* split_quoted_strings only supports the opening and
closing marker for a string being the same string.
If you need the opening and closing markers to be
different strings, use split_delimiters.
"""

# print(f"split_quoted_strings({s=}, {quotes=}, *, {escape=}, {state=})")
# print(f"split_quoted_strings({s=}, {quotes=}, *, {escape=}, {multiline_quotes=}, {state=})")

if multiline_quotes is None:
multiline_quotes = ()
Expand All @@ -2160,6 +2195,7 @@ def split_quoted_strings(s, quotes=_sqs_quotes_str, *, escape=_sqs_escape_str, m
if is_bytes:
s_type = bytes
empty = b''
laden = b'x'
if quotes in (_sqs_quotes_str, None):
quotes = _sqs_quotes_bytes
else:
Expand All @@ -2181,6 +2217,7 @@ def split_quoted_strings(s, quotes=_sqs_quotes_str, *, escape=_sqs_escape_str, m
else:
s_type = str
empty = ""
laden = 'x'
if quotes in (_sqs_quotes_bytes, None):
quotes = _sqs_quotes_str
else:
Expand All @@ -2191,6 +2228,12 @@ def split_quoted_strings(s, quotes=_sqs_quotes_str, *, escape=_sqs_escape_str, m
raise ValueError("quotes cannot contain an empty string")
if escape in (_sqs_escape_bytes, None):
escape = _sqs_escape_str
if multiline_quotes:
for q in multiline_quotes:
if not isinstance(q, s_type):
raise TypeError(f"values in multiline_quotes must match s (str or bytes), not {q!r}")
if not q:
raise ValueError("multiline_quotes cannot contain an empty string")
elif not isinstance(escape, s_type):
raise TypeError(f"escape must match s (str or bytes), not {escape!r}")

Expand Down Expand Up @@ -2252,7 +2295,7 @@ def split_quoted_strings(s, quotes=_sqs_quotes_str, *, escape=_sqs_escape_str, m
# help multisplit work better--it memoizes the conversion to a regular expression
separators.sort()

return _split_quoted_strings(s, separators, all_quotes_set, quotes_set, multiline_quotes_set, empty, state)
return _split_quoted_strings(s, separators, all_quotes_set, quotes_set, multiline_quotes_set, empty, laden, state)


@_export
Expand Down Expand Up @@ -2283,14 +2326,22 @@ def __init__(self, close, *, escape='', multiline=True, quoting=False):
if is_bytes:
t = bytes
empty = b''
if close == b'\\':
raise ValueError("close delimiter must not be '\\'")
else:
t = str
empty = ''
if close == '\\':
raise ValueError("close delimiter must not be b'\\'")

# they can't both be false, and they can't both be true
if bool(escape) != bool(quoting):
raise ValueError("quoting and escape mismatch; they must either both be true, or both be false")

# if quoting=False, you can only have multiline=True
if not (quoting or multiline):
raise ValueError(f"multiline=False unsupported when quoting=False")

self._close = close
self._escape = escape or empty
self._quoting = quoting
Expand Down Expand Up @@ -2380,7 +2431,7 @@ def __init__(self, open={}, close=(), escape='', illegal={}, single_line_only=Fa
self.illegal = illegal
self.single_line_only = single_line_only

def __repr__(self):
def __repr__(self): # pragma: nocover
return f"DelimiterState(open={self.open!r}, close={self.close!r}, escape={self.escape!r}, illegal={self.illegal!r}, single_line_only={self.single_line_only!r})"


Expand All @@ -2401,12 +2452,10 @@ def _delimiters_to_state_machine(delimiters, is_bytes):
s_type = bytes
s_type_description = "bytes"
not_s_type_description = "str"
disallowed_delimiter = b'\\'
else:
s_type = str
s_type_description = "str"
not_s_type_description = "bytes"
disallowed_delimiter = '\\'

all_closers = set()
all_openers = set(delimiters)
Expand All @@ -2415,14 +2464,10 @@ def _delimiters_to_state_machine(delimiters, is_bytes):
for k, v in delimiters.items():
if not isinstance(k, s_type):
raise TypeError(f"open delimiter {k!r} must be {s_type_description}, not {not_s_type_description}")
if k == disallowed_delimiter:
raise ValueError(f"illegal open delimiter {k!r}")
if not isinstance(v, Delimiter):
raise TypeError(f"delimiter values must be Delimiter, not {v!r}")
if not isinstance(v.close, s_type):
raise TypeError(f"close delimiter {v.close!r} must be {s_type_description}, not {not_s_type_description}")
if v.close == disallowed_delimiter:
raise ValueError(f"Delimiter: illegal close delimiter {v.close!r}")
all_closers.add(v.close)
if not isinstance(v.escape, s_type):
raise TypeError(f"Delimiter: escape {v.escape!r} must be {s_type_description}, not {not_s_type_description}")
Expand Down Expand Up @@ -2465,7 +2510,7 @@ def _delimiters_to_state_machine(delimiters, is_bytes):
return initial_state, all_tokens


def split_delimiters(s, all_tokens, current, stack, empty):
def split_delimiters(s, all_tokens, current, stack, empty, laden):
"Internal generator function returned by the real split_delimiters."
push = stack.append
pop = stack.pop
Expand All @@ -2492,9 +2537,9 @@ def split_delimiters(s, all_tokens, current, stack, empty):
# flush open delimiter
s = join(text)
clear()
# see treatise above
if current.single_line_only and (len(s.splitlines()) > 1):
raise SyntaxError("unterminated quoted string, {s!r}")
# we don't need to test to see if s contains a newline here.
# if we have an open delimiter, that means quoting is False.
# if quoting is False, multiline must be True. this is a Delimiter invariant.
yield s, delimiter, empty

push(current)
Expand All @@ -2505,9 +2550,10 @@ def split_delimiters(s, all_tokens, current, stack, empty):
# flush close delimiter
s = join(text)
clear()
# see treatise above
if current.single_line_only and (len(s.splitlines()) > 1):
raise SyntaxError("unterminated quoted string, {s!r}")
if s:
# see treatise above
if current.single_line_only and ((len(s.splitlines()) > 1) or (len( (s[-1:] + laden).splitlines()) > 1)):
raise SyntaxError(f"unterminated quoted string, {s!r}")
yield s, empty, delimiter

current = pop()
Expand All @@ -2527,8 +2573,8 @@ def split_delimiters(s, all_tokens, current, stack, empty):
s = join(text)
if s:
# see treatise above
if current.single_line_only and (len(s.splitlines()) > 1):
raise SyntaxError("unterminated quoted string, {s!r}")
if current.single_line_only and ((len(s.splitlines()) > 1) or (len( (s[-1:] + laden).splitlines()) > 1)):
raise SyntaxError(f"unterminated quoted string, {s!r}")
yield s, empty, empty


Expand Down Expand Up @@ -2572,15 +2618,19 @@ def split_delimiters(s, delimiters=split_delimiters_default_delimiters, *, state
If s doesn't end with a closing delimiter, in the final tuple
yielded, both open and close will be empty strings.
(Tip: Use a list as a stack to track the state of split_delimiters.
Every time split_delimiters yields a tuple, first process text.
Then, if open is true, push that string with stack.append.
Else, if close is true, pop the stack with stack.pop.)
split_delimiters doesn't publish its internal state, but it's
easy to track. Use a list as a stack to track the state,
like so:
* Create an empty list to store the state.
* Every time split_delimiters yields a tuple, first,
process the text.
* Then, if open is true, push that string with
stack.append.
* Else, if close is true, pop the stack with stack.pop.
You may reuse a particular character as a closing
delimiter multiple times.
You may use multiple Delimiter objects with the same close string.
You may not specify backslash ('\\') as a delimiter.
You may not specify backslash ('\\') as an open or close delimiter.
parse_delimiter doesn't complain if a string ends with unclosed
delimiters.
Expand All @@ -2592,18 +2642,25 @@ def split_delimiters(s, delimiters=split_delimiters_default_delimiters, *, state

is_bytes = isinstance(s, bytes)
if is_bytes:
empty = b''
laden = b'x'
if delimiters in (None, split_delimiters_default_delimiters):
initial_state, all_tokens = _split_delimiters_default_delimiters_bytes_cache
empty = b''
elif not delimiters:
raise ValueError("invalid delimiters")
elif b'\\' in delimiters:
raise ValueError("open delimiter must not be b'\\'")
else:
empty = ''
laden = 'x'
if delimiters in (None, split_delimiters_default_delimiters_bytes):
initial_state, all_tokens = _split_delimiters_default_delimiters_cache
empty = ''

if not initial_state:
if not delimiters:
elif not delimiters:
raise ValueError("invalid delimiters")
elif '\\' in delimiters:
raise ValueError("open delimiter must not be '\\'")

if not initial_state:
initial_state, all_tokens = _delimiters_to_state_machine(delimiters, is_bytes)

stack = []
Expand All @@ -2624,7 +2681,7 @@ def split_delimiters(s, delimiters=split_delimiters_default_delimiters, *, state
current = next
last_open = open

return _split_delimiters(s, all_tokens, current, stack, empty)
return _split_delimiters(s, all_tokens, current, stack, empty, laden)



Expand Down
Loading

0 comments on commit f01cb86

Please sign in to comment.