From 8ca29573a89f78f87380210d02f48410763f507d Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Fri, 26 May 2023 08:25:46 -0700 Subject: [PATCH] [3.12] gh-104972: Ensure that line attributes in tokens in the tokenize module are correct (GH-104975) (#104982) gh-104972: Ensure that line attributes in tokens in the tokenize module are correct (GH-104975) (cherry picked from commit 3fdb55c48291a459fb1e33edb5140ec0383222df) Co-authored-by: Pablo Galindo Salgado --- Lib/idlelib/idle_test/test_editor.py | 4 ++-- Lib/test/test_tokenize.py | 15 +++++++++++++-- ...2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst | 2 ++ Python/Python-tokenize.c | 9 ++++----- 4 files changed, 21 insertions(+), 9 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst diff --git a/Lib/idlelib/idle_test/test_editor.py b/Lib/idlelib/idle_test/test_editor.py index ba59c40dc6dde5..9296a6d235fbbe 100644 --- a/Lib/idlelib/idle_test/test_editor.py +++ b/Lib/idlelib/idle_test/test_editor.py @@ -201,8 +201,8 @@ def test_searcher(self): test_info = (# text, (block, indent)) ("", (None, None)), ("[1,", (None, None)), # TokenError - ("if 1:\n", ('if 1:', None)), - ("if 1:\n 2\n 3\n", ('if 1:', ' 2')), + ("if 1:\n", ('if 1:\n', None)), + ("if 1:\n 2\n 3\n", ('if 1:\n', ' 2\n')), ) for code, expected_pair in test_info: with self.subTest(code=code): diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 251ce2b864a9d8..0b7c25838d6782 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1174,7 +1174,7 @@ def readline(): # skip the initial encoding token and the end tokens tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2] - expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')] + expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')] self.assertEqual(tokens, expected_tokens, "bytes not decoded with encoding") @@ -1657,7 +1657,6 @@ def check_roundtrip(self, f): code = f.encode('utf-8') else: code = f.read() - f.close() readline = iter(code.splitlines(keepends=True)).__next__ tokens5 = list(tokenize(readline)) tokens2 = [tok[:2] for tok in tokens5] @@ -1672,6 +1671,17 @@ def check_roundtrip(self, f): tokens2_from5 = [tok[:2] for tok in tokenize(readline5)] self.assertEqual(tokens2_from5, tokens2) + def check_line_extraction(self, f): + if isinstance(f, str): + code = f.encode('utf-8') + else: + code = f.read() + readline = iter(code.splitlines(keepends=True)).__next__ + for tok in tokenize(readline): + if tok.type in {ENCODING, ENDMARKER}: + continue + self.assertEqual(tok.string, tok.line[tok.start[1]: tok.end[1]]) + def test_roundtrip(self): # There are some standard formatting practices that are easy to get right. @@ -1768,6 +1778,7 @@ def test_random_files(self): with open(testfile, 'rb') as f: # with self.subTest(file=testfile): self.check_roundtrip(f) + self.check_line_extraction(f) def roundtrip(self, code): diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst new file mode 100644 index 00000000000000..05d50c108c7b77 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-26-14-09-47.gh-issue-104972.El2UjE.rst @@ -0,0 +1,2 @@ +Ensure that the ``line`` attribute in :class:`tokenize.TokenInfo` objects in +the :mod:`tokenize` module are always correct. Patch by Pablo Galindo diff --git a/Python/Python-tokenize.c b/Python/Python-tokenize.c index 0023e303b96e83..88087c12562413 100644 --- a/Python/Python-tokenize.c +++ b/Python/Python-tokenize.c @@ -194,15 +194,14 @@ tokenizeriter_next(tokenizeriterobject *it) goto exit; } - Py_ssize_t size = it->tok->inp - it->tok->buf; - assert(it->tok->buf[size-1] == '\n'); - size -= 1; // Remove the newline character from the end of the line - PyObject *line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace"); + const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; + Py_ssize_t size = it->tok->inp - line_start; + PyObject *line = PyUnicode_DecodeUTF8(line_start, size, "replace"); if (line == NULL) { Py_DECREF(str); goto exit; } - const char *line_start = ISSTRINGLIT(type) ? it->tok->multi_line_start : it->tok->line_start; + Py_ssize_t lineno = ISSTRINGLIT(type) ? it->tok->first_lineno : it->tok->lineno; Py_ssize_t end_lineno = it->tok->lineno; Py_ssize_t col_offset = -1;