gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively #105070
Merged

Changes from all commits (7 commits)
7caac01  gh-105069: Add a readline-like callable to the tokenizer to consume i…  (pablogsal)
8903d0d  fixup! gh-105069: Add a readline-like callable to the tokenizer to co…  (pablogsal)
d370087  fixup! fixup! gh-105069: Add a readline-like callable to the tokenize…  (pablogsal)
2d6f0a6  fixup! fixup! fixup! gh-105069: Add a readline-like callable to the t…  (pablogsal)
0935371  fixup! fixup! fixup! fixup! gh-105069: Add a readline-like callable t…  (pablogsal)
9990b7e  fixup! fixup! fixup! fixup! fixup! gh-105069: Add a readline-like cal…  (pablogsal)
0598127  fixup! fixup! fixup! fixup! fixup! fixup! gh-105069: Add a readline-l…  (pablogsal)
Lib/test/test_tokenize.py
@@ -1,6 +1,6 @@
 from test import support
 from test.support import os_helper
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
@@ -51,6 +51,25 @@ def check_tokenize(self, s, expected):
                          [" ENCODING 'utf-8' (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_invalid_readline(self):
+        def gen():
+            yield "sdfosdg"
+            yield "sdfosdg"
+        with self.assertRaises(TypeError):
+            list(tokenize(gen().__next__))
+
+        def gen():
+            yield b"sdfosdg"
+            yield b"sdfosdg"
+        with self.assertRaises(TypeError):
+            list(generate_tokens(gen().__next__))
+
+        def gen():
+            yield "sdfosdg"
+            1/0
+        with self.assertRaises(ZeroDivisionError):
+            list(generate_tokens(gen().__next__))
+
     def test_implicit_newline(self):
         # Make sure that the tokenizer puts in an implicit NEWLINE
         # when the input lacks a trailing new line.
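The new tests above pin down the type contract of the readline callable: tokenize() wants a callable returning bytes, generate_tokens() wants one returning str, and exceptions raised inside the callable propagate to the caller. A minimal usage sketch of both public entry points with readline callables (standard library API, shown here only for illustration):

```python
import io
from tokenize import tokenize, generate_tokens

source = "x = 1 + 2\n"

# tokenize() expects a readline callable that returns bytes.
for tok in tokenize(io.BytesIO(source.encode("utf-8")).readline):
    print(tok)

# generate_tokens() expects a readline callable that returns str.
for tok in generate_tokens(io.StringIO(source).readline):
    print(tok)
```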
@@ -1154,7 +1173,8 @@ class TestTokenizerAdheresToPep0263(TestCase):
 
     def _testFile(self, filename):
         path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        with open(path, 'rb') as f:
+            TestRoundtrip.check_roundtrip(self, f)
 
     def test_utf8_coding_cookie_and_no_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1199,7 +1219,8 @@ def readline():
             yield b''
 
         # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+        tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
+                                                        extra_tokens=True))[:-2]
         expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1468,13 +1489,13 @@ def test_tokenize(self):
         def mock_detect_encoding(readline):
             return encoding, [b'first', b'second']
 
-        def mock__tokenize(readline, encoding):
+        def mock__tokenize(readline, encoding, **kwargs):
             nonlocal encoding_used
             encoding_used = encoding
             out = []
             while True:
                 try:
-                    next_line = next(readline)
+                    next_line = readline()
                 except StopIteration:
                     return out
                 if next_line:
@@ -1491,16 +1512,16 @@ def mock_readline():
             return str(counter).encode()
 
         orig_detect_encoding = tokenize_module.detect_encoding
-        orig__tokenize = tokenize_module._tokenize
+        orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer
         tokenize_module.detect_encoding = mock_detect_encoding
-        tokenize_module._tokenize = mock__tokenize
+        tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize
        try:
             results = tokenize(mock_readline)
             self.assertEqual(list(results)[1:],
                              [b'first', b'second', b'1', b'2', b'3', b'4'])
         finally:
             tokenize_module.detect_encoding = orig_detect_encoding
-            tokenize_module._tokenize = orig__tokenize
+            tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token
 
         self.assertEqual(encoding_used, encoding)
 
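The mocks work because tokenize() is a thin composition of detect_encoding() and the C-backed generator: it re-feeds the lines detect_encoding() already consumed, then keeps calling the original readline. Roughly like the following sketch, which is a simplification for illustration and not the exact CPython implementation:

```python
import itertools
from tokenize import (detect_encoding, TokenInfo, ENCODING,
                      _generate_tokens_from_c_tokenizer)

def tokenize_sketch(readline):
    # detect_encoding() consumes up to two lines looking for a BOM or a
    # coding cookie and returns them so that they are not lost.
    encoding, consumed_lines = detect_encoding(readline)
    rl_gen = itertools.chain(consumed_lines, iter(readline, b""))
    yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding,
                                                 extra_tokens=True)
```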
@@ -1827,12 +1848,33 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())
 
+    def test_encoding(self):
+        def readline(encoding):
+            yield "1+1".encode(encoding)
+
+        expected = [
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
+            TokenInfo(type=NEWLINE, string='\n', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+        for encoding in ["utf-8", "latin-1", "utf-16"]:
+            with self.subTest(encoding=encoding):
+                tokens = list(_generate_tokens_from_c_tokenizer(
+                    readline(encoding).__next__,
+                    extra_tokens=True,
+                    encoding=encoding,
+                ))
+                self.assertEqual(tokens, expected)
+
     def test_int(self):
 
         self.check_tokenize('0xff <= 255', """\
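test_encoding shows the point of the new encoding parameter: when the readline callable yields bytes, the C tokenizer decodes each line itself instead of requiring the caller to join and decode the whole source up front. A small sketch against the private helper added in this PR (private API, so subject to change):

```python
import io
from tokenize import _generate_tokens_from_c_tokenizer

data = "width = 5\n".encode("latin-1")
tokens = list(_generate_tokens_from_c_tokenizer(
    io.BytesIO(data).readline,   # a bytes-returning readline callable
    encoding="latin-1",          # tells the C tokenizer how to decode each line
    extra_tokens=True,
))
```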
@@ -2668,43 +2710,44 @@ def test_unicode(self):
 
     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100

[Inline review comment on the removed block of repeated assertRaises calls: "This was bothering me 😅"]
@@ -2715,20 +2758,24 @@ def generate_source(indents):
             return source
 
         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
 
     def test_continuation_lines_indentation(self):
         def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]
 
         code = dedent("""
             def fib(n):
Lib/tokenize.py
@@ -34,6 +34,7 @@
 import sys
 from token import *
 from token import EXACT_TOKEN_TYPES
+import _tokenize
 
 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -443,29 +444,15 @@ def tokenize(readline):
         # BOM will already have been stripped.
         encoding = "utf-8"
     yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        yield token
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
     This has the same API as tokenize(), except that it expects the *readline*
     callable to return str objects instead of bytes.
     """
-    def _gen():
-        while True:
-            try:
-                line = readline()
-            except StopIteration:
-                return
-            if not line:
-                return
-            yield line.encode()
-    return _tokenize(_gen(), 'utf-8')
+    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
 
 def main():
     import argparse

[Inline review comment on the removed _gen() helper: "Now that we are taking callables all of these can go :)"]
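With the private _tokenize() helper gone, both public entry points hand a readline callable straight to the C tokenizer, so anything that previously built a generator of lines just passes the generator's bound __next__ instead, which is exactly the adaptation the tests above make. A small sketch of that pattern:

```python
from tokenize import generate_tokens

def lines():
    yield "def f():\n"
    yield "    return 1\n"

# A generator's bound __next__ is a zero-argument, readline-like callable:
# each call returns the next line, and exhausting the generator (StopIteration)
# ends tokenization, as the new test_encoding test relies on.
readline = lines().__next__
for tok in generate_tokens(readline):
    print(tok)
```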
@@ -502,9 +489,9 @@ def error(message, filename=None, location=None):
             tokens = list(tokenize(f.readline))
         else:
             filename = "<stdin>"
-            tokens = _tokenize(
+            tokens = _generate_tokens_from_c_tokenizer(
                 (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
-                ), "utf-8")
+                ), "utf-8", extra_tokens=True)
 
 
         # Output the tokenization
@@ -531,10 +518,13 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
-    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    if encoding is None:
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    for info in it:
         yield TokenInfo._make(info)
 
 
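Each item produced by _tokenize.TokenizerIter is a plain 5-tuple, and TokenInfo._make (the standard namedtuple constructor, used in the diff above) turns it into the named token object callers expect. A small illustration:

```python
from tokenize import TokenInfo
from token import NUMBER

# TokenInfo is a namedtuple subclass, so _make() builds one from a raw
# (type, string, start, end, line) tuple like those yielded by TokenizerIter.
raw = (NUMBER, '1', (1, 0), (1, 1), '1+1\n')
info = TokenInfo._make(raw)
assert info.string == '1' and info.start == (1, 0) and info.end == (1, 1)
```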
Review comment: This change is needed because the inspect module relies on the fact that lines yielded by the generator which do not end in
\n
are concatenated together. That is wrong, because the contract says the callable "should yield one line at a time", so if a line does not end in a newline we now always add one.
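A hedged sketch of the guarantee described in that comment, written as a hypothetical wrapper around a readline callable; the real fix lives in the C tokenizer, which appends the missing newline itself:

```python
def ensure_trailing_newline(readline):
    # Illustrative only: make every non-empty line returned by a readline
    # callable end with '\n', so the tokenizer still sees one line per call
    # even when the callable forgets the trailing newline.
    def wrapped():
        line = readline()
        if line and not line.endswith("\n"):
            line += "\n"
        return line
    return wrapped
```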