From 1b8ff7e86a6625aff4651b00b9fe77b53bf705b0 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 29 May 2023 20:13:45 +0100 Subject: [PATCH] gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively Signed-off-by: Pablo Galindo --- Lib/test/test_tokenize.py | 139 +++++++++++++++++++----------- Lib/tokenize.py | 32 +++---- Parser/tokenizer.c | 110 ++++++++++++++++++++++- Parser/tokenizer.h | 2 + Python/Python-tokenize.c | 11 +-- Python/clinic/Python-tokenize.c.h | 43 +++++---- 6 files changed, 240 insertions(+), 97 deletions(-) diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index cd11dddd0fe51ab..1669d98b1035d92 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1,6 +1,6 @@ from test import support from test.support import os_helper -from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP, +from tokenize import (tokenize, untokenize, NUMBER, NAME, OP, STRING, ENDMARKER, ENCODING, tok_name, detect_encoding, open as tokenize_open, Untokenizer, generate_tokens, NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo) @@ -51,6 +51,19 @@ def check_tokenize(self, s, expected): [" ENCODING 'utf-8' (0, 0) (0, 0)"] + expected.rstrip().splitlines()) + def test_invalid_readline(self): + def gen(): + yield "sdfosdg" + yield "sdfosdg" + with self.assertRaises(TypeError): + list(tokenize(gen().__next__)) + + def gen(): + yield b"sdfosdg" + yield b"sdfosdg" + with self.assertRaises(TypeError): + list(generate_tokens(gen().__next__)) + def test_implicit_newline(self): # Make sure that the tokenizer puts in an implicit NEWLINE # when the input lacks a trailing new line. @@ -1154,7 +1167,8 @@ class TestTokenizerAdheresToPep0263(TestCase): def _testFile(self, filename): path = os.path.join(os.path.dirname(__file__), filename) - TestRoundtrip.check_roundtrip(self, open(path, 'rb')) + with open(path, 'rb') as f: + TestRoundtrip.check_roundtrip(self, f) def test_utf8_coding_cookie_and_no_utf8_bom(self): f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt' @@ -1199,7 +1213,8 @@ def readline(): yield b'' # skip the initial encoding token and the end tokens - tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2] + tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8', + extra_tokens=True))[:-2] expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") @@ -1468,13 +1483,13 @@ def test_tokenize(self): def mock_detect_encoding(readline): return encoding, [b'first', b'second'] - def mock__tokenize(readline, encoding): + def mock__tokenize(readline, encoding, **kwargs): nonlocal encoding_used encoding_used = encoding out = [] while True: try: - next_line = next(readline) + next_line = readline() except StopIteration: return out if next_line: @@ -1491,16 +1506,16 @@ def mock_readline(): return str(counter).encode() orig_detect_encoding = tokenize_module.detect_encoding - orig__tokenize = tokenize_module._tokenize + orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer tokenize_module.detect_encoding = mock_detect_encoding - tokenize_module._tokenize = mock__tokenize + tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize try: results = tokenize(mock_readline) self.assertEqual(list(results)[1:], [b'first', b'second', b'1', b'2', b'3', b'4']) finally: tokenize_module.detect_encoding = orig_detect_encoding - tokenize_module._tokenize = orig__tokenize + tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token self.assertEqual(encoding_used, encoding) @@ -1827,12 +1842,33 @@ class CTokenizeTest(TestCase): def check_tokenize(self, s, expected): # Format the tokens in s in a table format. # The ENDMARKER and final NEWLINE are omitted. + f = StringIO(s) with self.subTest(source=s): result = stringify_tokens_from_source( - _generate_tokens_from_c_tokenizer(s), s + _generate_tokens_from_c_tokenizer(f.readline), s ) self.assertEqual(result, expected.rstrip().splitlines()) + def test_encoding(self): + def readline(encoding): + yield "1+1".encode(encoding) + + expected = [ + TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'), + TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'), + TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'), + TokenInfo(type=NEWLINE, string='\n', start=(1, 3), end=(1, 4), line='1+1\n'), + TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='') + ] + for encoding in ["utf-8", "latin-1", "utf-16"]: + with self.subTest(encoding=encoding): + tokens = list(_generate_tokens_from_c_tokenizer( + readline(encoding).__next__, + extra_tokens=True, + encoding=encoding, + )) + self.assertEqual(tokens, expected) + def test_int(self): self.check_tokenize('0xff <= 255', """\ @@ -2668,43 +2704,44 @@ def test_unicode(self): def test_invalid_syntax(self): def get_tokens(string): - return list(_generate_tokens_from_c_tokenizer(string)) - - self.assertRaises(SyntaxError, get_tokens, "(1+2]") - self.assertRaises(SyntaxError, get_tokens, "(1+2}") - self.assertRaises(SyntaxError, get_tokens, "{1+2]") - - self.assertRaises(SyntaxError, get_tokens, "1_") - self.assertRaises(SyntaxError, get_tokens, "1.2_") - self.assertRaises(SyntaxError, get_tokens, "1e2_") - self.assertRaises(SyntaxError, get_tokens, "1e+") - - self.assertRaises(SyntaxError, get_tokens, "\xa0") - self.assertRaises(SyntaxError, get_tokens, "€") - - self.assertRaises(SyntaxError, get_tokens, "0b12") - self.assertRaises(SyntaxError, get_tokens, "0b1_2") - self.assertRaises(SyntaxError, get_tokens, "0b2") - self.assertRaises(SyntaxError, get_tokens, "0b1_") - self.assertRaises(SyntaxError, get_tokens, "0b") - self.assertRaises(SyntaxError, get_tokens, "0o18") - self.assertRaises(SyntaxError, get_tokens, "0o1_8") - self.assertRaises(SyntaxError, get_tokens, "0o8") - self.assertRaises(SyntaxError, get_tokens, "0o1_") - self.assertRaises(SyntaxError, get_tokens, "0o") - self.assertRaises(SyntaxError, get_tokens, "0x1_") - self.assertRaises(SyntaxError, get_tokens, "0x") - self.assertRaises(SyntaxError, get_tokens, "1_") - self.assertRaises(SyntaxError, get_tokens, "012") - self.assertRaises(SyntaxError, get_tokens, "1.2_") - self.assertRaises(SyntaxError, get_tokens, "1e2_") - self.assertRaises(SyntaxError, get_tokens, "1e+") - - self.assertRaises(SyntaxError, get_tokens, "'sdfsdf") - self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''") - - self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000) - self.assertRaises(SyntaxError, get_tokens, "]") + the_string = StringIO(string) + return list(_generate_tokens_from_c_tokenizer(the_string.readline)) + + for case in [ + "(1+2]", + "(1+2}", + "{1+2]", + "1_", + "1.2_", + "1e2_", + "1e+", + + "\xa0", + "€", + "0b12", + "0b1_2", + "0b2", + "0b1_", + "0b", + "0o18", + "0o1_8", + "0o8", + "0o1_", + "0o", + "0x1_", + "0x", + "1_", + "012", + "1.2_", + "1e2_", + "1e+", + "'sdfsdf", + "'''sdfsdf''", + "("*1000+"a"+")"*1000, + "]", + ]: + with self.subTest(case=case): + self.assertRaises(SyntaxError, get_tokens, case) def test_max_indent(self): MAXINDENT = 100 @@ -2715,20 +2752,24 @@ def generate_source(indents): return source valid = generate_source(MAXINDENT - 1) - tokens = list(_generate_tokens_from_c_tokenizer(valid)) + the_input = StringIO(valid) + tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline)) self.assertEqual(tokens[-2].type, DEDENT) self.assertEqual(tokens[-1].type, ENDMARKER) compile(valid, "", "exec") invalid = generate_source(MAXINDENT) - self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid))) + the_input = StringIO(invalid) + self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline))) self.assertRaises( IndentationError, compile, invalid, "", "exec" ) def test_continuation_lines_indentation(self): def get_tokens(string): - return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)] + the_string = StringIO(string) + return [(kind, string) for (kind, string, *_) + in _generate_tokens_from_c_tokenizer(the_string.readline)] code = dedent(""" def fib(n): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 4895e94d1dfda7b..f0b9845aea0ffc3 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -34,6 +34,7 @@ import sys from token import * from token import EXACT_TOKEN_TYPES +import _tokenize as c_tokenizer cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) @@ -443,12 +444,7 @@ def tokenize(readline): # BOM will already have been stripped. encoding = "utf-8" yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') - yield from _tokenize(rl_gen, encoding) - -def _tokenize(rl_gen, encoding): - source = b"".join(rl_gen).decode(encoding) - for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True): - yield token + yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True) def generate_tokens(readline): """Tokenize a source reading Python code as unicode strings. @@ -456,16 +452,7 @@ def generate_tokens(readline): This has the same API as tokenize(), except that it expects the *readline* callable to return str objects instead of bytes. """ - def _gen(): - while True: - try: - line = readline() - except StopIteration: - return - if not line: - return - yield line.encode() - return _tokenize(_gen(), 'utf-8') + return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True) def main(): import argparse @@ -502,9 +489,9 @@ def error(message, filename=None, location=None): tokens = list(tokenize(f.readline)) else: filename = "" - tokens = _tokenize( + tokens = _generate_tokens_from_c_tokenizer( (x.encode('utf-8') for x in iter(sys.stdin.readline, "") - ), "utf-8") + ), "utf-8", extra_tokens=True) # Output the tokenization @@ -531,10 +518,13 @@ def error(message, filename=None, location=None): perror("unexpected error: %s" % err) raise -def _generate_tokens_from_c_tokenizer(source, extra_tokens=False): +def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False): """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" - import _tokenize as c_tokenizer - for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens): + if encoding is None: + it = c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens) + else: + it = c_tokenizer.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens) + for info in it: yield TokenInfo._make(info) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 59c817293fbfcd9..9848d33c66cdeb5 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -103,6 +103,7 @@ tok_new(void) tok->filename = NULL; tok->decoding_readline = NULL; tok->decoding_buffer = NULL; + tok->readline = NULL; tok->type_comments = 0; tok->async_hacks = 0; tok->async_def = 0; @@ -900,6 +901,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf) return tok; } +struct tok_state * +_PyTokenizer_FromReadline(PyObject* readline, const char* enc, + int exec_input, int preserve_crlf) +{ + struct tok_state *tok = tok_new(); + if (tok == NULL) + return NULL; + if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { + _PyTokenizer_Free(tok); + return NULL; + } + tok->cur = tok->inp = tok->buf; + tok->end = tok->buf + BUFSIZ; + tok->fp = NULL; + if (enc != NULL) { + tok->encoding = new_string(enc, strlen(enc), tok); + if (!tok->encoding) { + _PyTokenizer_Free(tok); + return NULL; + } + } + tok->decoding_state = STATE_NORMAL; + Py_INCREF(readline); + tok->readline = readline; + return tok; +} + /* Set up tokenizer for UTF-8 string */ struct tok_state * @@ -969,8 +997,9 @@ _PyTokenizer_Free(struct tok_state *tok) } Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); + Py_XDECREF(tok->readline); Py_XDECREF(tok->filename); - if (tok->fp != NULL && tok->buf != NULL) { + if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) { PyMem_Free(tok->buf); } if (tok->input) { @@ -1021,6 +1050,71 @@ tok_readline_raw(struct tok_state *tok) return 1; } +static int +tok_readline_string(struct tok_state* tok) { + PyObject* line = NULL; + PyObject* raw_line = PyObject_CallNoArgs(tok->readline); + if (raw_line == NULL) { + if (PyErr_ExceptionMatches(PyExc_StopIteration)) { + PyErr_Clear(); + return 1; + } + error_ret(tok); + goto error; + } + if(tok->encoding != NULL) { + if (!PyBytes_Check(raw_line)) { + PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object"); + error_ret(tok); + goto error; + } + line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line), + tok->encoding, "replace"); + Py_CLEAR(raw_line); + if (line == NULL) { + error_ret(tok); + goto error; + } + } else { + if(!PyUnicode_Check(raw_line)) { + PyErr_Format(PyExc_TypeError, "readline() returned a non-string object"); + error_ret(tok); + goto error; + } + line = raw_line; + raw_line = NULL; + } + Py_ssize_t buflen; + const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen); + if (buf == NULL) { + error_ret(tok); + goto error; + } + + // Make room for the null terminator *and* potentially + // an extra newline character that we may need to artificially + // add. + size_t buffer_size = buflen + 2; + if (!tok_reserve_buf(tok, buffer_size)) { + goto error; + } + memcpy(tok->inp, buf, buflen); + tok->inp += buflen; + *tok->inp = '\0'; + + if (tok->start == NULL) { + tok->buf = tok->cur; + } + tok->line_start = tok->cur; + + Py_DECREF(line); + return 1; +error: + Py_XDECREF(raw_line); + Py_XDECREF(line); + return 0; +} + static int tok_underflow_string(struct tok_state *tok) { char *end = strchr(tok->inp, '\n'); @@ -1136,7 +1230,7 @@ tok_underflow_interactive(struct tok_state *tok) { } static int -tok_underflow_file(struct tok_state *tok) { +tok_underflow_file(struct tok_state *tok, int use_readline) { if (tok->start == NULL && !INSIDE_FSTRING(tok)) { tok->cur = tok->inp = tok->buf; } @@ -1157,6 +1251,11 @@ tok_underflow_file(struct tok_state *tok) { return 0; } } + else if(use_readline) { + if (!tok_readline_string(tok)) { + return 0; + } + } else { /* We want a 'raw' read. */ if (!tok_readline_raw(tok)) { @@ -1238,14 +1337,17 @@ tok_nextc(struct tok_state *tok) if (tok->done != E_OK) { return EOF; } - if (tok->fp == NULL) { + if (tok->readline) { + rc = tok_underflow_file(tok, 1); + } + else if (tok->fp == NULL) { rc = tok_underflow_string(tok); } else if (tok->prompt != NULL) { rc = tok_underflow_interactive(tok); } else { - rc = tok_underflow_file(tok); + rc = tok_underflow_file(tok, 0); } #if defined(Py_DEBUG) if (tok->debug) { diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 02749e355da8124..600d4297b6865af 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -109,6 +109,7 @@ struct tok_state { expression (cf. issue 16806) */ PyObject *decoding_readline; /* open(...).readline */ PyObject *decoding_buffer; + PyObject *readline; /* readline() function */ const char* enc; /* Encoding for the current str. */ char* str; /* Source string being tokenized (if tokenizing from a string)*/ char* input; /* Tokenizer's newline translated copy of the string. */ @@ -137,6 +138,7 @@ struct tok_state { extern struct tok_state *_PyTokenizer_FromString(const char *, int, int); extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int); +extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int); extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*, const char *, const char *); extern void _PyTokenizer_Free(struct tok_state *); diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 4eced66b6177085..3356317b58db682 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -37,15 +37,16 @@ typedef struct @classmethod _tokenizer.tokenizeriter.__new__ as tokenizeriter_new - source: str + readline: object * extra_tokens: bool + encoding: str = NULL [clinic start generated code]*/ static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source, - int extra_tokens) -/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/ +tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline, + int extra_tokens, const char *encoding) +/*[clinic end generated code: output=7501a1211683ce16 input=92c429aa8f2e6714]*/ { tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0); if (self == NULL) { @@ -55,7 +56,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source, if (filename == NULL) { return NULL; } - self->tok = _PyTokenizer_FromUTF8(source, 1, 1); + self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1); if (self->tok == NULL) { Py_DECREF(filename); return NULL; diff --git a/Python/clinic/Python-tokenize.c.h b/Python/clinic/Python-tokenize.c.h index 7e779388a92dbf3..a083f22cac5f744 100644 --- a/Python/clinic/Python-tokenize.c.h +++ b/Python/clinic/Python-tokenize.c.h @@ -9,8 +9,8 @@ preserve static PyObject * -tokenizeriter_new_impl(PyTypeObject *type, const char *source, - int extra_tokens); +tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline, + int extra_tokens, const char *encoding); static PyObject * tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) @@ -18,14 +18,14 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyObject *return_value = NULL; #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) - #define NUM_KEYWORDS 2 + #define NUM_KEYWORDS 3 static struct { PyGC_Head _this_is_not_used; PyObject_VAR_HEAD PyObject *ob_item[NUM_KEYWORDS]; } _kwtuple = { .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) - .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), }, + .ob_item = { &_Py_ID(readline), &_Py_ID(extra_tokens), &_Py_ID(encoding), }, }; #undef NUM_KEYWORDS #define KWTUPLE (&_kwtuple.ob_base.ob_base) @@ -34,43 +34,50 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) # define KWTUPLE NULL #endif // !Py_BUILD_CORE - static const char * const _keywords[] = {"source", "extra_tokens", NULL}; + static const char * const _keywords[] = {"readline", "extra_tokens", "encoding", NULL}; static _PyArg_Parser _parser = { .keywords = _keywords, .fname = "tokenizeriter", .kwtuple = KWTUPLE, }; #undef KWTUPLE - PyObject *argsbuf[2]; + PyObject *argsbuf[3]; PyObject * const *fastargs; Py_ssize_t nargs = PyTuple_GET_SIZE(args); - const char *source; + Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 2; + PyObject *readline; int extra_tokens; + const char *encoding = NULL; fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf); if (!fastargs) { goto exit; } - if (!PyUnicode_Check(fastargs[0])) { - _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]); + readline = fastargs[0]; + extra_tokens = PyObject_IsTrue(fastargs[1]); + if (extra_tokens < 0) { goto exit; } - Py_ssize_t source_length; - source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length); - if (source == NULL) { + if (!noptargs) { + goto skip_optional_kwonly; + } + if (!PyUnicode_Check(fastargs[2])) { + _PyArg_BadArgument("tokenizeriter", "argument 'encoding'", "str", fastargs[2]); goto exit; } - if (strlen(source) != (size_t)source_length) { - PyErr_SetString(PyExc_ValueError, "embedded null character"); + Py_ssize_t encoding_length; + encoding = PyUnicode_AsUTF8AndSize(fastargs[2], &encoding_length); + if (encoding == NULL) { goto exit; } - extra_tokens = PyObject_IsTrue(fastargs[1]); - if (extra_tokens < 0) { + if (strlen(encoding) != (size_t)encoding_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); goto exit; } - return_value = tokenizeriter_new_impl(type, source, extra_tokens); +skip_optional_kwonly: + return_value = tokenizeriter_new_impl(type, readline, extra_tokens, encoding); exit: return return_value; } -/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=293f2b270ef7a7e1 input=a9049054013a1b77]*/