gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively

Signed-off-by: Pablo Galindo <pablogsal@gmail.com>
pablogsal committed May 29, 2023
1 parent 39f6a04 commit bc6d2da
Showing 7 changed files with 196 additions and 73 deletions.
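
For orientation, a minimal usage sketch of the new calling convention, based on the updated tests below (the helper is private and the input bytes are made up): the C tokenizer is now driven by a zero-argument readline-style callable rather than a pre-built source string.

    from io import BytesIO
    from tokenize import _generate_tokens_from_c_tokenizer

    source = b"x = 1\nprint(x)\n"        # hypothetical input
    readline = BytesIO(source).readline  # any zero-argument callable returning bytes lines works

    # With the default encoding="utf-8", each line returned by readline() must be bytes.
    for tok in _generate_tokens_from_c_tokenizer(readline):
        print(tok)
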
98 changes: 56 additions & 42 deletions Lib/test/test_tokenize.py
@@ -50,6 +50,13 @@ def check_tokenize(self, s, expected):
self.assertEqual(result,
[" ENCODING 'utf-8' (0, 0) (0, 0)"] +
expected.rstrip().splitlines())

def test_invalid_readline(self):
def gen():
yield "sdfosdg"
yield "sdfosdg"
with self.assertRaises(TypeError):
list(tokenize(gen().__next__))

def test_implicit_newline(self):
# Make sure that the tokenizer puts in an implicit NEWLINE
@@ -1154,7 +1161,8 @@ class TestTokenizerAdheresToPep0263(TestCase):

def _testFile(self, filename):
path = os.path.join(os.path.dirname(__file__), filename)
TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
with open(path, 'rb') as f:
TestRoundtrip.check_roundtrip(self, f)

def test_utf8_coding_cookie_and_no_utf8_bom(self):
f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1827,9 +1835,10 @@ class CTokenizeTest(TestCase):
def check_tokenize(self, s, expected):
# Format the tokens in s in a table format.
# The ENDMARKER and final NEWLINE are omitted.
f = BytesIO(s.encode())
with self.subTest(source=s):
result = stringify_tokens_from_source(
_generate_tokens_from_c_tokenizer(s), s
_generate_tokens_from_c_tokenizer(f.readline), s
)
self.assertEqual(result, expected.rstrip().splitlines())

@@ -2668,43 +2677,44 @@ def test_unicode(self):

def test_invalid_syntax(self):
def get_tokens(string):
return list(_generate_tokens_from_c_tokenizer(string))

self.assertRaises(SyntaxError, get_tokens, "(1+2]")
self.assertRaises(SyntaxError, get_tokens, "(1+2}")
self.assertRaises(SyntaxError, get_tokens, "{1+2]")

self.assertRaises(SyntaxError, get_tokens, "1_")
self.assertRaises(SyntaxError, get_tokens, "1.2_")
self.assertRaises(SyntaxError, get_tokens, "1e2_")
self.assertRaises(SyntaxError, get_tokens, "1e+")

self.assertRaises(SyntaxError, get_tokens, "\xa0")
self.assertRaises(SyntaxError, get_tokens, "€")

self.assertRaises(SyntaxError, get_tokens, "0b12")
self.assertRaises(SyntaxError, get_tokens, "0b1_2")
self.assertRaises(SyntaxError, get_tokens, "0b2")
self.assertRaises(SyntaxError, get_tokens, "0b1_")
self.assertRaises(SyntaxError, get_tokens, "0b")
self.assertRaises(SyntaxError, get_tokens, "0o18")
self.assertRaises(SyntaxError, get_tokens, "0o1_8")
self.assertRaises(SyntaxError, get_tokens, "0o8")
self.assertRaises(SyntaxError, get_tokens, "0o1_")
self.assertRaises(SyntaxError, get_tokens, "0o")
self.assertRaises(SyntaxError, get_tokens, "0x1_")
self.assertRaises(SyntaxError, get_tokens, "0x")
self.assertRaises(SyntaxError, get_tokens, "1_")
self.assertRaises(SyntaxError, get_tokens, "012")
self.assertRaises(SyntaxError, get_tokens, "1.2_")
self.assertRaises(SyntaxError, get_tokens, "1e2_")
self.assertRaises(SyntaxError, get_tokens, "1e+")

self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")

self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
self.assertRaises(SyntaxError, get_tokens, "]")
the_string = BytesIO(string.encode())
return list(_generate_tokens_from_c_tokenizer(the_string.readline))

for case in [
"(1+2]",
"(1+2}",
"{1+2]",
"1_",
"1.2_",
"1e2_",
"1e+",

"\xa0",
"€",
"0b12",
"0b1_2",
"0b2",
"0b1_",
"0b",
"0o18",
"0o1_8",
"0o8",
"0o1_",
"0o",
"0x1_",
"0x",
"1_",
"012",
"1.2_",
"1e2_",
"1e+",
"'sdfsdf",
"'''sdfsdf''",
"("*1000+"a"+")"*1000,
"]",
]:
with self.subTest(case=case):
self.assertRaises(SyntaxError, get_tokens, case)

def test_max_indent(self):
MAXINDENT = 100
@@ -2715,20 +2725,24 @@ def generate_source(indents):
return source

valid = generate_source(MAXINDENT - 1)
tokens = list(_generate_tokens_from_c_tokenizer(valid))
the_input = BytesIO(valid.encode())
tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
self.assertEqual(tokens[-2].type, DEDENT)
self.assertEqual(tokens[-1].type, ENDMARKER)
compile(valid, "<string>", "exec")

invalid = generate_source(MAXINDENT)
self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
the_input = BytesIO(invalid.encode())
self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
self.assertRaises(
IndentationError, compile, invalid, "<string>", "exec"
)

def test_continuation_lines_indentation(self):
def get_tokens(string):
return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
the_string = BytesIO(string.encode())
return [(kind, string) for (kind, string, *_)
in _generate_tokens_from_c_tokenizer(the_string.readline)]

code = dedent("""
def fib(n):
9 changes: 5 additions & 4 deletions Lib/tokenize.py
@@ -446,8 +446,9 @@ def tokenize(readline):
yield from _tokenize(rl_gen, encoding)

def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
for token in _generate_tokens_from_c_tokenizer(rl_gen.__next__,
encoding=encoding,
extra_tokens=True):
yield token

def generate_tokens(readline):
@@ -531,10 +532,10 @@ def error(message, filename=None, location=None):
perror("unexpected error: %s" % err)
raise

def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
def _generate_tokens_from_c_tokenizer(source, encoding="utf-8", extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
import _tokenize as c_tokenizer
for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
for info in c_tokenizer.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens):
yield TokenInfo._make(info)


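
The practical effect of the change above is that _tokenize() no longer joins the whole input into one decoded string before tokenizing; the readline generator is handed to the C tokenizer and consumed one line at a time. A small sketch of the public entry point driven by a generator (hypothetical bytes lines, mirroring the gen().__next__ pattern used in the new test):

    import tokenize

    def lines():
        yield b"def f():\n"
        yield b"    return 42\n"

    # tokenize() detects the encoding from the first lines, then streams the rest
    # through the C tokenizer; StopIteration from the callable is treated as EOF.
    for tok in tokenize.tokenize(lines().__next__):
        print(tok.type, tok.string)
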
105 changes: 101 additions & 4 deletions Parser/tokenizer.c
@@ -103,6 +103,7 @@ tok_new(void)
tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
tok->readline = NULL;
tok->type_comments = 0;
tok->async_hacks = 0;
tok->async_def = 0;
@@ -900,6 +901,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
return tok;
}

struct tok_state *
_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
int exec_input, int preserve_crlf)
{
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
_PyTokenizer_Free(tok);
return NULL;
}
tok->cur = tok->inp = tok->buf;
tok->end = tok->buf + BUFSIZ;
tok->fp = NULL;
if (enc != NULL) {
tok->encoding = new_string(enc, strlen(enc), tok);
if (!tok->encoding) {
_PyTokenizer_Free(tok);
return NULL;
}
}
tok->decoding_state = STATE_NORMAL;
Py_INCREF(readline);
tok->readline = readline;
return tok;
}

/* Set up tokenizer for UTF-8 string */

struct tok_state *
@@ -969,8 +997,9 @@ _PyTokenizer_Free(struct tok_state *tok)
}
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
Py_XDECREF(tok->readline);
Py_XDECREF(tok->filename);
if (tok->fp != NULL && tok->buf != NULL) {
if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
PyMem_Free(tok->buf);
}
if (tok->input) {
@@ -1021,6 +1050,66 @@ tok_readline_raw(struct tok_state *tok)
return 1;
}

static int
tok_readline_string(struct tok_state* tok) {
PyObject* line = NULL;
PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
if (raw_line == NULL) {
if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
PyErr_Clear();
return 1;
}
error_ret(tok);
goto error;
}
if(tok->encoding != NULL) {
if (!PyBytes_Check(raw_line)) {
PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
error_ret(tok);
goto error;
}
line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
tok->encoding, "replace");
Py_CLEAR(raw_line);
if (line == NULL) {
error_ret(tok);
goto error;
}
} else {
line = raw_line;
raw_line = NULL;
}
Py_ssize_t buflen;
const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
if (buf == NULL) {
error_ret(tok);
goto error;
}

// Make room for the null terminator *and* potentially
// an extra newline character that we may need to artificially
// add.
size_t buffer_size = buflen + 2;
if (!tok_reserve_buf(tok, buffer_size)) {
goto error;
}
memcpy(tok->inp, buf, buflen);
tok->inp += buflen;
*tok->inp = '\0';

if (tok->start == NULL) {
tok->buf = tok->cur;
}
tok->line_start = tok->cur;

Py_DECREF(line);
return 1;
error:
Py_XDECREF(raw_line);
Py_XDECREF(line);
return 0;
}

static int
tok_underflow_string(struct tok_state *tok) {
char *end = strchr(tok->inp, '\n');
@@ -1136,7 +1225,7 @@ tok_underflow_interactive(struct tok_state *tok) {
}

static int
tok_underflow_file(struct tok_state *tok) {
tok_underflow_file(struct tok_state *tok, int use_readline) {
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
tok->cur = tok->inp = tok->buf;
}
@@ -1157,6 +1246,11 @@ tok_underflow_file(struct tok_state *tok) {
return 0;
}
}
else if(use_readline) {
if (!tok_readline_string(tok)) {
return 0;
}
}
else {
/* We want a 'raw' read. */
if (!tok_readline_raw(tok)) {
@@ -1238,14 +1332,17 @@ tok_nextc(struct tok_state *tok)
if (tok->done != E_OK) {
return EOF;
}
if (tok->fp == NULL) {
if (tok->readline) {
rc = tok_underflow_file(tok, 1);
}
else if (tok->fp == NULL) {
rc = tok_underflow_string(tok);
}
else if (tok->prompt != NULL) {
rc = tok_underflow_interactive(tok);
}
else {
rc = tok_underflow_file(tok);
rc = tok_underflow_file(tok, 0);
}
#if defined(Py_DEBUG)
if (tok->debug) {
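
For readers who don't want to trace the C, the following Python pseudocode roughly mirrors the control flow of the new tok_readline_string() helper (names are illustrative; the real code writes into the tok_state buffer and reports errors through error_ret()):

    def read_one_line(readline, encoding):
        """Rough sketch of tok_readline_string(): one call per buffer underflow."""
        try:
            raw_line = readline()
        except StopIteration:
            return ""    # EOF: the C code clears the exception and returns success
        if encoding is not None:
            # An explicit encoding means the callable must produce bytes.
            if not isinstance(raw_line, bytes):
                raise TypeError("readline() returned a non-bytes object")
            return raw_line.decode(encoding, errors="replace")
        # Without an encoding, the returned line is used as a str directly.
        return raw_line
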
2 changes: 2 additions & 0 deletions Parser/tokenizer.h
@@ -109,6 +109,7 @@ struct tok_state {
expression (cf. issue 16806) */
PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer;
PyObject *readline; /* readline() function */
const char* enc; /* Encoding for the current str. */
char* str; /* Source string being tokenized (if tokenizing from a string)*/
char* input; /* Tokenizer's newline translated copy of the string. */
@@ -137,6 +138,7 @@ struct tok_state {

extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
const char *, const char *);
extern void _PyTokenizer_Free(struct tok_state *);
11 changes: 6 additions & 5 deletions Python/Python-tokenize.c
@@ -37,15 +37,16 @@ typedef struct
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
source: str
readline: object
*
extra_tokens: bool
encoding: str = NULL
[clinic start generated code]*/

static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source,
int extra_tokens)
/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
int extra_tokens, const char *encoding)
/*[clinic end generated code: output=7501a1211683ce16 input=92c429aa8f2e6714]*/
{
tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
if (self == NULL) {
@@ -55,7 +56,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
if (filename == NULL) {
return NULL;
}
self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
if (self->tok == NULL) {
Py_DECREF(filename);
return NULL;
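
Put together with the Python side, _generate_tokens_from_c_tokenizer() now constructs the iterator roughly like this (a sketch of the regenerated signature implied by the Clinic block above; the BytesIO input is made up):

    import _tokenize as c_tokenizer
    from io import BytesIO

    readline = BytesIO(b"a + b\n").readline
    # readline is positional; extra_tokens and encoding are keyword-only.
    for info in c_tokenizer.TokenizerIter(readline, extra_tokens=False, encoding="utf-8"):
        print(info)
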
