From 9d72066ab1c538ffa5eed74cbaa67385d5491324 Mon Sep 17 00:00:00 2001 From: jx124 <64946984+jx124@users.noreply.github.com> Date: Tue, 2 May 2023 00:31:36 +0800 Subject: [PATCH 1/8] Fixed off by 1 error in f string tokenizer --- Parser/tokenizer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 8fb9be7bfd0182..889c046d90f4eb 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -43,12 +43,12 @@ #ifdef Py_DEBUG static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); - assert(tok->tok_mode_stack_index < MAXLEVEL); + assert(tok->tok_mode_stack_index + 1 < MAXLEVEL); return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); } static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); - assert(tok->tok_mode_stack_index < MAXLEVEL); + assert(tok->tok_mode_stack_index + 1 < MAXLEVEL); return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); } #else @@ -2413,7 +2413,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t case '(': case '[': case '{': - if (tok->level >= MAXLEVEL) { + if (tok->level >= MAXLEVEL || tok->tok_mode_stack_index + 1 >= MAXLEVEL) { return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; From ab38fd2471c5fcf45fb5bc812a797f3dd01e7c59 Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Mon, 1 May 2023 16:35:03 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20blu?= =?UTF-8?q?rb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst new file mode 100644 index 00000000000000..5e721564f0cb42 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst @@ -0,0 +1 @@ +Fixed off-by-1 error in f-string tokenizer. From eb4b729b35dcae8fe0c46906e181159b65d73f31 Mon Sep 17 00:00:00 2001 From: jx124 <64946984+jx124@users.noreply.github.com> Date: Tue, 2 May 2023 00:46:43 +0800 Subject: [PATCH 3/8] Update test_fstring.py Co-Authored-By: sunmy2019 <59365878+sunmy2019@users.noreply.github.com> Co-Authored-By: Ken Jin --- Lib/test/test_fstring.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 5e94c99ae65af1..5925f098f93bcd 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1548,6 +1548,10 @@ def test_syntax_error_after_debug(self): "f'{1=}{1;'", "f'{1=}{1;}'", ]) + + def test_nested_fstring_max_stack_level(self): + with self.assertRaises(SyntaxError): + compile('f"{1 1:' + ('{f"1:' * 199), "?", "exec") if __name__ == '__main__': unittest.main() From d5f4a3d8079a8ccce77857779eb16e34fe83189f Mon Sep 17 00:00:00 2001 From: jx124 <64946984+jx124@users.noreply.github.com> Date: Tue, 2 May 2023 00:58:57 +0800 Subject: [PATCH 4/8] Fixed tabs and assert --- Lib/test/test_fstring.py | 8 +- Parser/tokenizer.c | 1597 ++++++++++++++++++++++++-------------- 2 files changed, 1013 insertions(+), 592 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 5925f098f93bcd..81a64ce1becf4f 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1548,10 +1548,10 @@ def test_syntax_error_after_debug(self): "f'{1=}{1;'", "f'{1=}{1;}'", ]) - - def test_nested_fstring_max_stack_level(self): - with self.assertRaises(SyntaxError): - compile('f"{1 1:' + ('{f"1:' * 199), "?", "exec") + + def test_nested_fstring_max_stack_level(self): + with self.assertRaises(SyntaxError): + compile('f"{1 1:' + ('{f"1:' * 199), "?", "exec") if __name__ == '__main__': unittest.main() diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 889c046d90f4eb..6a10bf9c2cad5f 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -3,7 +3,7 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "pycore_call.h" // _PyObject_CallNoArgs() +#include "pycore_call.h" // _PyObject_CallNoArgs() #include #include @@ -14,39 +14,33 @@ /* Alternate tab spacing */ #define ALTTABSIZE 1 -#define is_potential_identifier_start(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || c == '_'\ - || (c >= 128)) - -#define is_potential_identifier_char(c) (\ - (c >= 'a' && c <= 'z')\ - || (c >= 'A' && c <= 'Z')\ - || (c >= '0' && c <= '9')\ - || c == '_'\ - || (c >= 128)) +#define is_potential_identifier_start(c) ( \ + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c >= 128)) +#define is_potential_identifier_char(c) ( \ + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || (c >= 128)) /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) -#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ - type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) +#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) ( \ + type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) #define ADVANCE_LINENO() \ - tok->lineno++; \ - tok->col_offset = 0; + tok->lineno++; \ + tok->col_offset = 0; #define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) #define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) #ifdef Py_DEBUG -static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { +static inline tokenizer_mode *TOK_GET_MODE(struct tok_state *tok) +{ assert(tok->tok_mode_stack_index >= 0); - assert(tok->tok_mode_stack_index + 1 < MAXLEVEL); + assert(tok->tok_mode_stack_index < MAXLEVEL); return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); } -static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { +static inline tokenizer_mode *TOK_NEXT_MODE(struct tok_state *tok) +{ assert(tok->tok_mode_stack_index >= 0); assert(tok->tok_mode_stack_index + 1 < MAXLEVEL); return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); @@ -64,7 +58,7 @@ static int syntaxerror(struct tok_state *tok, const char *format, ...); /* Spaces in this constant are treated as "zero or more spaces or tabs" when tokenizing. */ -static const char* type_comment_prefix = "# type: "; +static const char *type_comment_prefix = "# type: "; /* Create and initialize a new tok_state structure */ @@ -72,7 +66,7 @@ static struct tok_state * tok_new(void) { struct tok_state *tok = (struct tok_state *)PyMem_Malloc( - sizeof(struct tok_state)); + sizeof(struct tok_state)); if (tok == NULL) return NULL; tok->buf = tok->cur = tok->inp = NULL; @@ -111,7 +105,7 @@ tok_new(void) tok->interactive_underflow = IUNDERFLOW_NORMAL; tok->str = NULL; tok->report_warnings = 1; - tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0}; + tok->tok_mode_stack[0] = (tokenizer_mode){.kind = TOK_REGULAR_MODE, .f_string_quote = '\0', .f_string_quote_size = 0, .f_string_debug = 0}; tok->tok_mode_stack_index = 0; tok->tok_report_warnings = 1; #ifdef Py_DEBUG @@ -123,8 +117,9 @@ tok_new(void) static char * new_string(const char *s, Py_ssize_t len, struct tok_state *tok) { - char* result = (char *)PyMem_Malloc(len + 1); - if (!result) { + char *result = (char *)PyMem_Malloc(len + 1); + if (!result) + { tok->done = E_NOMEM; return NULL; } @@ -143,16 +138,16 @@ error_ret(struct tok_state *tok) /* XXX */ tok->start = NULL; tok->end = NULL; tok->done = E_DECODE; - return NULL; /* as if it were EOF */ + return NULL; /* as if it were EOF */ } - static const char * -get_normal_name(const char *s) /* for utf-8 and latin-1 */ +get_normal_name(const char *s) /* for utf-8 and latin-1 */ { char buf[13]; int i; - for (i = 0; i < 12; i++) { + for (i = 0; i < 12; i++) + { int c = s[i]; if (c == '\0') break; @@ -185,20 +180,24 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t *spec = NULL; /* Coding spec must be in a comment, and that comment must be * the only statement on the source code line. */ - for (i = 0; i < size - 6; i++) { + for (i = 0; i < size - 6; i++) + { if (s[i] == '#') break; if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') return 1; } - for (; i < size - 6; i++) { /* XXX inefficient search */ - const char* t = s + i; - if (memcmp(t, "coding", 6) == 0) { - const char* begin = NULL; + for (; i < size - 6; i++) + { /* XXX inefficient search */ + const char *t = s + i; + if (memcmp(t, "coding", 6) == 0) + { + const char *begin = NULL; t += 6; if (t[0] != ':' && t[0] != '=') continue; - do { + do + { t++; } while (t[0] == ' ' || t[0] == '\t'); @@ -207,13 +206,15 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t t[0] == '-' || t[0] == '_' || t[0] == '.') t++; - if (begin < t) { - char* r = new_string(begin, t - begin, tok); - const char* q; + if (begin < t) + { + char *r = new_string(begin, t - begin, tok); + const char *q; if (!r) return 0; q = get_normal_name(r); - if (r != q) { + if (r != q) + { PyMem_Free(r); r = new_string(q, strlen(q), tok); if (!r) @@ -233,24 +234,29 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t Return 1 on success, 0 on failure. */ static int -check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, +check_coding_spec(const char *line, Py_ssize_t size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)) { char *cs; - if (tok->cont_line) { + if (tok->cont_line) + { /* It's a continuation line, so it can't be a coding spec. */ tok->decoding_state = STATE_NORMAL; return 1; } - if (!get_coding_spec(line, &cs, size, tok)) { + if (!get_coding_spec(line, &cs, size, tok)) + { return 0; } - if (!cs) { + if (!cs) + { Py_ssize_t i; - for (i = 0; i < size; i++) { + for (i = 0; i < size; i++) + { if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') break; - if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') + { /* Stop checking coding spec after a line containing * anything except a comment. */ tok->decoding_state = STATE_NORMAL; @@ -260,17 +266,22 @@ check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, return 1; } tok->decoding_state = STATE_NORMAL; - if (tok->encoding == NULL) { + if (tok->encoding == NULL) + { assert(tok->decoding_readline == NULL); - if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { + if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) + { error_ret(tok); PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); PyMem_Free(cs); return 0; } tok->encoding = cs; - } else { /* then, compare cs with BOM */ - if (strcmp(tok->encoding, cs) != 0) { + } + else + { /* then, compare cs with BOM */ + if (strcmp(tok->encoding, cs) != 0) + { error_ret(tok); PyErr_Format(PyExc_SyntaxError, "encoding problem: %s with BOM", cs); @@ -295,23 +306,30 @@ check_bom(int get_char(struct tok_state *), int ch1, ch2, ch3; ch1 = get_char(tok); tok->decoding_state = STATE_SEEK_CODING; - if (ch1 == EOF) { + if (ch1 == EOF) + { return 1; - } else if (ch1 == 0xEF) { + } + else if (ch1 == 0xEF) + { ch2 = get_char(tok); - if (ch2 != 0xBB) { + if (ch2 != 0xBB) + { unget_char(ch2, tok); unget_char(ch1, tok); return 1; } ch3 = get_char(tok); - if (ch3 != 0xBF) { + if (ch3 != 0xBF) + { unget_char(ch3, tok); unget_char(ch2, tok); unget_char(ch1, tok); return 1; } - } else { + } + else + { unget_char(ch1, tok); return 1; } @@ -325,24 +343,29 @@ check_bom(int get_char(struct tok_state *), } static int -tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { +tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) +{ assert(tok->fp_interactive); - if (!line) { + if (!line) + { return 0; } Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; Py_ssize_t line_size = strlen(line); char last_char = line[line_size > 0 ? line_size - 1 : line_size]; - if (last_char != '\n') { + if (last_char != '\n') + { line_size += 1; } - char* new_str = tok->interactive_src_start; + char *new_str = tok->interactive_src_start; new_str = PyMem_Realloc(new_str, current_size + line_size + 1); - if (!new_str) { - if (tok->interactive_src_start) { + if (!new_str) + { + if (tok->interactive_src_start) + { PyMem_Free(tok->interactive_src_start); } tok->interactive_src_start = NULL; @@ -351,7 +374,8 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { return -1; } strcpy(new_str + current_size, line); - if (last_char != '\n') { + if (last_char != '\n') + { /* Last line does not end in \n, fake one */ new_str[current_size + line_size - 1] = '\n'; new_str[current_size + line_size] = '\0'; @@ -369,7 +393,8 @@ remember_fstring_buffers(struct tok_state *tok) int index; tokenizer_mode *mode; - for (index = tok->tok_mode_stack_index; index >= 0; --index) { + for (index = tok->tok_mode_stack_index; index >= 0; --index) + { mode = &(tok->tok_mode_stack[index]); mode->f_string_start_offset = mode->f_string_start - tok->buf; mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf; @@ -383,7 +408,8 @@ restore_fstring_buffers(struct tok_state *tok) int index; tokenizer_mode *mode; - for (index = tok->tok_mode_stack_index; index >= 0; --index) { + for (index = tok->tok_mode_stack_index; index >= 0; --index) + { mode = &(tok->tok_mode_stack[index]); mode->f_string_start = tok->buf + mode->f_string_start_offset; mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset; @@ -391,21 +417,23 @@ restore_fstring_buffers(struct tok_state *tok) } static int -set_fstring_expr(struct tok_state* tok, struct token *token, char c) { +set_fstring_expr(struct tok_state *tok, struct token *token, char c) +{ assert(token != NULL); assert(c == '}' || c == ':' || c == '!'); tokenizer_mode *tok_mode = TOK_GET_MODE(tok); - if (!tok_mode->f_string_debug || token->metadata) { + if (!tok_mode->f_string_debug || token->metadata) + { return 0; } PyObject *res = PyUnicode_DecodeUTF8( tok_mode->last_expr_buffer, tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL - ); - if (!res) { + NULL); + if (!res) + { return -1; } token->metadata = res; @@ -420,44 +448,49 @@ update_fstring_expr(struct tok_state *tok, char cur) Py_ssize_t size = strlen(tok->cur); tokenizer_mode *tok_mode = TOK_GET_MODE(tok); - switch (cur) { - case 0: - if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { - return 1; - } - char *new_buffer = PyMem_Realloc( - tok_mode->last_expr_buffer, - tok_mode->last_expr_size + size - ); - if (new_buffer == NULL) { - PyMem_Free(tok_mode->last_expr_buffer); - goto error; - } - tok_mode->last_expr_buffer = new_buffer; - strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); - tok_mode->last_expr_size += size; - break; - case '{': - if (tok_mode->last_expr_buffer != NULL) { - PyMem_Free(tok_mode->last_expr_buffer); - } - tok_mode->last_expr_buffer = PyMem_Malloc(size); - if (tok_mode->last_expr_buffer == NULL) { - goto error; - } - tok_mode->last_expr_size = size; - tok_mode->last_expr_end = -1; - strncpy(tok_mode->last_expr_buffer, tok->cur, size); - break; - case '}': - case '!': - case ':': - if (tok_mode->last_expr_end == -1) { - tok_mode->last_expr_end = strlen(tok->start); - } - break; - default: - Py_UNREACHABLE(); + switch (cur) + { + case 0: + if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) + { + return 1; + } + char *new_buffer = PyMem_Realloc( + tok_mode->last_expr_buffer, + tok_mode->last_expr_size + size); + if (new_buffer == NULL) + { + PyMem_Free(tok_mode->last_expr_buffer); + goto error; + } + tok_mode->last_expr_buffer = new_buffer; + strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); + tok_mode->last_expr_size += size; + break; + case '{': + if (tok_mode->last_expr_buffer != NULL) + { + PyMem_Free(tok_mode->last_expr_buffer); + } + tok_mode->last_expr_buffer = PyMem_Malloc(size); + if (tok_mode->last_expr_buffer == NULL) + { + goto error; + } + tok_mode->last_expr_size = size; + tok_mode->last_expr_end = -1; + strncpy(tok_mode->last_expr_buffer, tok->cur, size); + break; + case '}': + case '!': + case ':': + if (tok_mode->last_expr_end == -1) + { + tok_mode->last_expr_end = strlen(tok->start); + } + break; + default: + Py_UNREACHABLE(); } return 1; error: @@ -471,9 +504,11 @@ free_fstring_expressions(struct tok_state *tok) int index; tokenizer_mode *mode; - for (index = tok->tok_mode_stack_index; index >= 0; --index) { + for (index = tok->tok_mode_stack_index; index >= 0; --index) + { mode = &(tok->tok_mode_stack[index]); - if (mode->last_expr_buffer != NULL) { + if (mode->last_expr_buffer != NULL) + { PyMem_Free(mode->last_expr_buffer); mode->last_expr_buffer = NULL; mode->last_expr_size = 0; @@ -503,14 +538,16 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) Py_ssize_t cur = tok->cur - tok->buf; Py_ssize_t oldsize = tok->inp - tok->buf; Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1); - if (newsize > tok->end - tok->buf) { + if (newsize > tok->end - tok->buf) + { char *newbuf = tok->buf; Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf; Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf; Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf; remember_fstring_buffers(tok); newbuf = (char *)PyMem_Realloc(newbuf, newsize); - if (newbuf == NULL) { + if (newbuf == NULL) + { tok->done = E_NOMEM; return 0; } @@ -527,28 +564,34 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) } static inline int -contains_null_bytes(const char* str, size_t size) { +contains_null_bytes(const char *str, size_t size) +{ return memchr(str, 0, size) != NULL; } static int -tok_readline_recode(struct tok_state *tok) { +tok_readline_recode(struct tok_state *tok) +{ PyObject *line; - const char *buf; + const char *buf; Py_ssize_t buflen; line = tok->decoding_buffer; - if (line == NULL) { + if (line == NULL) + { line = PyObject_CallNoArgs(tok->decoding_readline); - if (line == NULL) { + if (line == NULL) + { error_ret(tok); goto error; } } - else { + else + { tok->decoding_buffer = NULL; } buf = PyUnicode_AsUTF8AndSize(line, &buflen); - if (buf == NULL) { + if (buf == NULL) + { error_ret(tok); goto error; } @@ -556,14 +599,16 @@ tok_readline_recode(struct tok_state *tok) { // an extra newline character that we may need to artificially // add. size_t buffer_size = buflen + 2; - if (!tok_reserve_buf(tok, buffer_size)) { + if (!tok_reserve_buf(tok, buffer_size)) + { goto error; } memcpy(tok->inp, buf, buflen); tok->inp += buflen; *tok->inp = '\0'; if (tok->fp_interactive && - tok_concatenate_interactive_new_line(tok, buf) == -1) { + tok_concatenate_interactive_new_line(tok, buf) == -1) + { goto error; } Py_DECREF(line); @@ -584,7 +629,7 @@ tok_readline_recode(struct tok_state *tok) { Return 1 on success, 0 on failure. */ static int -fp_setreadl(struct tok_state *tok, const char* enc) +fp_setreadl(struct tok_state *tok, const char *enc) { PyObject *readline, *open, *stream; int fd; @@ -598,32 +643,38 @@ fp_setreadl(struct tok_state *tok, const char* enc) * the end of line.*/ pos = ftell(tok->fp); if (pos == -1 || - lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { + lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) + { PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); return 0; } open = _PyImport_GetModuleAttrString("io", "open"); - if (open == NULL) { + if (open == NULL) + { return 0; } stream = PyObject_CallFunction(open, "isisOOO", - fd, "r", -1, enc, Py_None, Py_None, Py_False); + fd, "r", -1, enc, Py_None, Py_None, Py_False); Py_DECREF(open); - if (stream == NULL) { + if (stream == NULL) + { return 0; } readline = PyObject_GetAttr(stream, &_Py_ID(readline)); Py_DECREF(stream); - if (readline == NULL) { + if (readline == NULL) + { return 0; } Py_XSETREF(tok->decoding_readline, readline); - if (pos > 0) { + if (pos > 0) + { PyObject *bufobj = _PyObject_CallNoArgs(readline); - if (bufobj == NULL) { + if (bufobj == NULL) + { return 0; } Py_DECREF(bufobj); @@ -634,13 +685,15 @@ fp_setreadl(struct tok_state *tok, const char* enc) /* Fetch the next byte from TOK. */ -static int fp_getc(struct tok_state *tok) { +static int fp_getc(struct tok_state *tok) +{ return getc(tok->fp); } /* Unfetch the last byte back into TOK. */ -static void fp_ungetc(int c, struct tok_state *tok) { +static void fp_ungetc(int c, struct tok_state *tok) +{ ungetc(c, tok->fp); } @@ -650,17 +703,20 @@ static void fp_ungetc(int c, struct tok_state *tok) { those in stringlib/codecs.h:utf8_decode. */ static int -valid_utf8(const unsigned char* s) +valid_utf8(const unsigned char *s) { int expected = 0; int length; - if (*s < 0x80) { + if (*s < 0x80) + { /* single-byte code */ return 1; } - else if (*s < 0xE0) { + else if (*s < 0xE0) + { /* \xC2\x80-\xDF\xBF -- 0080-07FF */ - if (*s < 0xC2) { + if (*s < 0xC2) + { /* invalid sequence \x80-\xBF -- continuation byte \xC0-\xC1 -- fake 0000-007F */ @@ -668,14 +724,17 @@ valid_utf8(const unsigned char* s) } expected = 1; } - else if (*s < 0xF0) { + else if (*s < 0xF0) + { /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ - if (*s == 0xE0 && *(s + 1) < 0xA0) { + if (*s == 0xE0 && *(s + 1) < 0xA0) + { /* invalid sequence \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ return 0; } - else if (*s == 0xED && *(s + 1) >= 0xA0) { + else if (*s == 0xED && *(s + 1) >= 0xA0) + { /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF will result in surrogates in range D800-DFFF. Surrogates are not valid UTF-8 so they are rejected. @@ -685,9 +744,11 @@ valid_utf8(const unsigned char* s) } expected = 2; } - else if (*s < 0xF5) { + else if (*s < 0xF5) + { /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ - if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { + if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) + { /* invalid sequence -- one of: \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF \xF4\x90\x80\x80- -- 110000- overflow */ @@ -695,7 +756,8 @@ valid_utf8(const unsigned char* s) } expected = 3; } - else { + else + { /* invalid start byte */ return 0; } @@ -712,13 +774,16 @@ ensure_utf8(char *line, struct tok_state *tok) int badchar = 0; unsigned char *c; int length; - for (c = (unsigned char *)line; *c; c += length) { - if (!(length = valid_utf8(c))) { + for (c = (unsigned char *)line; *c; c += length) + { + if (!(length = valid_utf8(c))) + { badchar = *c; break; } } - if (badchar) { + if (badchar) + { PyErr_Format(PyExc_SyntaxError, "Non-UTF-8 code starting with '\\x%.2x' " "in file %U on line %i, " @@ -733,23 +798,26 @@ ensure_utf8(char *line, struct tok_state *tok) /* Fetch a byte from TOK, using the string buffer. */ static int -buf_getc(struct tok_state *tok) { +buf_getc(struct tok_state *tok) +{ return Py_CHARMASK(*tok->str++); } /* Unfetch a byte from TOK, using the string buffer. */ static void -buf_ungetc(int c, struct tok_state *tok) { +buf_ungetc(int c, struct tok_state *tok) +{ tok->str--; - assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ + assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ } /* Set the readline function for TOK to ENC. For the string-based tokenizer, this means to just record the encoding. */ static int -buf_setreadl(struct tok_state *tok, const char* enc) { +buf_setreadl(struct tok_state *tok, const char *enc) +{ tok->enc = enc; return 1; } @@ -758,9 +826,10 @@ buf_setreadl(struct tok_state *tok, const char* enc) { C byte string STR, which is encoded with ENC. */ static PyObject * -translate_into_utf8(const char* str, const char* enc) { +translate_into_utf8(const char *str, const char *enc) +{ PyObject *utf8; - PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); + PyObject *buf = PyUnicode_Decode(str, strlen(str), enc, NULL); if (buf == NULL) return NULL; utf8 = PyUnicode_AsUTF8String(buf); @@ -768,29 +837,34 @@ translate_into_utf8(const char* str, const char* enc) { return utf8; } - static char * -translate_newlines(const char *s, int exec_input, struct tok_state *tok) { +translate_newlines(const char *s, int exec_input, struct tok_state *tok) +{ int skip_next_lf = 0; size_t needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; buf = PyMem_Malloc(needed_length); - if (buf == NULL) { + if (buf == NULL) + { tok->done = E_NOMEM; return NULL; } - for (current = buf; *s; s++, current++) { + for (current = buf; *s; s++, current++) + { c = *s; - if (skip_next_lf) { + if (skip_next_lf) + { skip_next_lf = 0; - if (c == '\n') { + if (c == '\n') + { c = *++s; if (!c) break; } } - if (c == '\r') { + if (c == '\r') + { skip_next_lf = 1; c = '\n'; } @@ -798,16 +872,19 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { } /* If this is exec input, add a newline to the end of the string if there isn't one already. */ - if (exec_input && c != '\n') { + if (exec_input && c != '\n') + { *current = '\n'; current++; } *current = '\0'; final_length = current - buf + 1; - if (final_length < needed_length && final_length) { + if (final_length < needed_length && final_length) + { /* should never fail */ - char* result = PyMem_Realloc(buf, final_length); - if (result == NULL) { + char *result = PyMem_Realloc(buf, final_length); + if (result == NULL) + { PyMem_Free(buf); } buf = result; @@ -822,7 +899,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) { static char * decode_str(const char *input, int single, struct tok_state *tok) { - PyObject* utf8 = NULL; + PyObject *utf8 = NULL; char *str; const char *s; const char *newl[2] = {NULL, NULL}; @@ -834,37 +911,46 @@ decode_str(const char *input, int single, struct tok_state *tok) tok->str = str; if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) return error_ret(tok); - str = tok->str; /* string after BOM if any */ + str = tok->str; /* string after BOM if any */ assert(str); - if (tok->enc != NULL) { + if (tok->enc != NULL) + { utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); str = PyBytes_AsString(utf8); } - for (s = str;; s++) { - if (*s == '\0') break; - else if (*s == '\n') { + for (s = str;; s++) + { + if (*s == '\0') + break; + else if (*s == '\n') + { assert(lineno < 2); newl[lineno] = s; lineno++; - if (lineno == 2) break; + if (lineno == 2) + break; } } tok->enc = NULL; /* need to check line 1 and 2 separately since check_coding_spec assumes a single line as input */ - if (newl[0]) { - if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { + if (newl[0]) + { + if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) + { return NULL; } - if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { - if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], + if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) + { + if (!check_coding_spec(newl[0] + 1, newl[1] - newl[0], tok, buf_setreadl)) return NULL; } } - if (tok->enc != NULL) { + if (tok->enc != NULL) + { assert(utf8 == NULL); utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) @@ -887,7 +973,8 @@ _PyTokenizer_FromString(const char *str, int exec_input) if (tok == NULL) return NULL; decoded = decode_str(str, exec_input, tok); - if (decoded == NULL) { + if (decoded == NULL) + { _PyTokenizer_Free(tok); return NULL; } @@ -907,7 +994,8 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input) if (tok == NULL) return NULL; tok->input = translated = translate_newlines(str, exec_input, tok); - if (translated == NULL) { + if (translated == NULL) + { _PyTokenizer_Free(tok); return NULL; } @@ -915,7 +1003,8 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input) tok->enc = NULL; tok->str = translated; tok->encoding = new_string("utf-8", 5, tok); - if (!tok->encoding) { + if (!tok->encoding) + { _PyTokenizer_Free(tok); return NULL; } @@ -928,13 +1017,14 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input) /* Set up tokenizer for file */ struct tok_state * -_PyTokenizer_FromFile(FILE *fp, const char* enc, +_PyTokenizer_FromFile(FILE *fp, const char *enc, const char *ps1, const char *ps2) { struct tok_state *tok = tok_new(); if (tok == NULL) return NULL; - if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { + if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) + { _PyTokenizer_Free(tok); return NULL; } @@ -943,11 +1033,13 @@ _PyTokenizer_FromFile(FILE *fp, const char* enc, tok->fp = fp; tok->prompt = ps1; tok->nextprompt = ps2; - if (enc != NULL) { + if (enc != NULL) + { /* Must copy encoding declaration since it gets copied into the parse tree. */ tok->encoding = new_string(enc, strlen(enc), tok); - if (!tok->encoding) { + if (!tok->encoding) + { _PyTokenizer_Free(tok); return NULL; } @@ -958,22 +1050,25 @@ _PyTokenizer_FromFile(FILE *fp, const char* enc, /* Free a tok_state structure */ -void -_PyTokenizer_Free(struct tok_state *tok) +void _PyTokenizer_Free(struct tok_state *tok) { - if (tok->encoding != NULL) { + if (tok->encoding != NULL) + { PyMem_Free(tok->encoding); } Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); Py_XDECREF(tok->filename); - if (tok->fp != NULL && tok->buf != NULL) { + if (tok->fp != NULL && tok->buf != NULL) + { PyMem_Free(tok->buf); } - if (tok->input) { + if (tok->input) + { PyMem_Free(tok->input); } - if (tok->interactive_src_start != NULL) { + if (tok->interactive_src_start != NULL) + { PyMem_Free(tok->interactive_src_start); } free_fstring_expressions(tok); @@ -983,25 +1078,31 @@ _PyTokenizer_Free(struct tok_state *tok) static int tok_readline_raw(struct tok_state *tok) { - do { - if (!tok_reserve_buf(tok, BUFSIZ)) { + do + { + if (!tok_reserve_buf(tok, BUFSIZ)) + { return 0; } int n_chars = (int)(tok->end - tok->inp); size_t line_size = 0; char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size); - if (line == NULL) { + if (line == NULL) + { return 1; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) + { return 0; } if (tok->fp_interactive && - tok_concatenate_interactive_new_line(tok, line) == -1) { + tok_concatenate_interactive_new_line(tok, line) == -1) + { return 0; } tok->inp += line_size; - if (tok->inp == tok->buf) { + if (tok->inp == tok->buf) + { return 0; } } while (tok->inp[-1] != '\n'); @@ -1009,19 +1110,24 @@ tok_readline_raw(struct tok_state *tok) } static int -tok_underflow_string(struct tok_state *tok) { +tok_underflow_string(struct tok_state *tok) +{ char *end = strchr(tok->inp, '\n'); - if (end != NULL) { + if (end != NULL) + { end++; } - else { + else + { end = strchr(tok->inp, '\0'); - if (end == tok->inp) { + if (end == tok->inp) + { tok->done = E_EOF; return 0; } } - if (tok->start == NULL) { + if (tok->start == NULL) + { tok->buf = tok->cur; } tok->line_start = tok->cur; @@ -1031,34 +1137,41 @@ tok_underflow_string(struct tok_state *tok) { } static int -tok_underflow_interactive(struct tok_state *tok) { - if (tok->interactive_underflow == IUNDERFLOW_STOP) { +tok_underflow_interactive(struct tok_state *tok) +{ + if (tok->interactive_underflow == IUNDERFLOW_STOP) + { tok->done = E_INTERACT_STOP; return 1; } char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt); - if (newtok != NULL) { + if (newtok != NULL) + { char *translated = translate_newlines(newtok, 0, tok); PyMem_Free(newtok); - if (translated == NULL) { + if (translated == NULL) + { return 0; } newtok = translated; } - if (tok->encoding && newtok && *newtok) { + if (tok->encoding && newtok && *newtok) + { /* Recode to UTF-8 */ Py_ssize_t buflen; - const char* buf; + const char *buf; PyObject *u = translate_into_utf8(newtok, tok->encoding); PyMem_Free(newtok); - if (u == NULL) { + if (u == NULL) + { tok->done = E_DECODE; return 0; } buflen = PyBytes_GET_SIZE(u); buf = PyBytes_AS_STRING(u); - newtok = PyMem_Malloc(buflen+1); - if (newtok == NULL) { + newtok = PyMem_Malloc(buflen + 1); + if (newtok == NULL) + { Py_DECREF(u); tok->done = E_NOMEM; return 0; @@ -1067,26 +1180,32 @@ tok_underflow_interactive(struct tok_state *tok) { Py_DECREF(u); } if (tok->fp_interactive && - tok_concatenate_interactive_new_line(tok, newtok) == -1) { + tok_concatenate_interactive_new_line(tok, newtok) == -1) + { PyMem_Free(newtok); return 0; } - if (tok->nextprompt != NULL) { + if (tok->nextprompt != NULL) + { tok->prompt = tok->nextprompt; } - if (newtok == NULL) { + if (newtok == NULL) + { tok->done = E_INTR; } - else if (*newtok == '\0') { + else if (*newtok == '\0') + { PyMem_Free(newtok); tok->done = E_EOF; } - else if (tok->start != NULL) { + else if (tok->start != NULL) + { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; remember_fstring_buffers(tok); size_t size = strlen(newtok); ADVANCE_LINENO(); - if (!tok_reserve_buf(tok, size + 1)) { + if (!tok_reserve_buf(tok, size + 1)) + { PyMem_Free(tok->buf); tok->buf = NULL; PyMem_Free(newtok); @@ -1098,7 +1217,8 @@ tok_underflow_interactive(struct tok_state *tok) { tok->multi_line_start = tok->buf + cur_multi_line_start; restore_fstring_buffers(tok); } - else { + else + { remember_fstring_buffers(tok); ADVANCE_LINENO(); PyMem_Free(tok->buf); @@ -1109,52 +1229,65 @@ tok_underflow_interactive(struct tok_state *tok) { tok->end = tok->inp + 1; restore_fstring_buffers(tok); } - if (tok->done != E_OK) { - if (tok->prompt != NULL) { + if (tok->done != E_OK) + { + if (tok->prompt != NULL) + { PySys_WriteStderr("\n"); } return 0; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) + { return 0; } return 1; } static int -tok_underflow_file(struct tok_state *tok) { - if (tok->start == NULL) { +tok_underflow_file(struct tok_state *tok) +{ + if (tok->start == NULL) + { tok->cur = tok->inp = tok->buf; } - if (tok->decoding_state == STATE_INIT) { + if (tok->decoding_state == STATE_INIT) + { /* We have not yet determined the encoding. If an encoding is found, use the file-pointer reader functions from now on. */ - if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) { + if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) + { error_ret(tok); return 0; } assert(tok->decoding_state != STATE_INIT); } /* Read until '\n' or EOF */ - if (tok->decoding_readline != NULL) { + if (tok->decoding_readline != NULL) + { /* We already have a codec associated with this input. */ - if (!tok_readline_recode(tok)) { + if (!tok_readline_recode(tok)) + { return 0; } } - else { + else + { /* We want a 'raw' read. */ - if (!tok_readline_raw(tok)) { + if (!tok_readline_raw(tok)) + { return 0; } } - if (tok->inp == tok->cur) { + if (tok->inp == tok->cur) + { tok->done = E_EOF; return 0; } - if (tok->inp[-1] != '\n') { + if (tok->inp[-1] != '\n') + { assert(tok->inp + 1 < tok->end); /* Last line does not end in \n, fake one */ *tok->inp++ = '\n'; @@ -1162,8 +1295,10 @@ tok_underflow_file(struct tok_state *tok) { } ADVANCE_LINENO(); - if (tok->decoding_state != STATE_NORMAL) { - if (tok->lineno > 2) { + if (tok->decoding_state != STATE_NORMAL) + { + if (tok->lineno > 2) + { tok->decoding_state = STATE_NORMAL; } else if (!check_coding_spec(tok->cur, strlen(tok->cur), @@ -1174,7 +1309,8 @@ tok_underflow_file(struct tok_state *tok) { } /* The default encoding is UTF-8, so make sure we don't have any non-UTF-8 sequences in it. */ - if (!tok->encoding && !ensure_utf8(tok->cur, tok)) { + if (!tok->encoding && !ensure_utf8(tok->cur, tok)) + { error_ret(tok); return 0; } @@ -1186,25 +1322,40 @@ tok_underflow_file(struct tok_state *tok) { static void print_escape(FILE *f, const char *s, Py_ssize_t size) { - if (s == NULL) { + if (s == NULL) + { fputs("NULL", f); return; } putc('"', f); - while (size-- > 0) { + while (size-- > 0) + { unsigned char c = *s++; - switch (c) { - case '\n': fputs("\\n", f); break; - case '\r': fputs("\\r", f); break; - case '\t': fputs("\\t", f); break; - case '\f': fputs("\\f", f); break; - case '\'': fputs("\\'", f); break; - case '"': fputs("\\\"", f); break; - default: - if (0x20 <= c && c <= 0x7f) - putc(c, f); - else - fprintf(f, "\\x%02x", c); + switch (c) + { + case '\n': + fputs("\\n", f); + break; + case '\r': + fputs("\\r", f); + break; + case '\t': + fputs("\\t", f); + break; + case '\f': + fputs("\\f", f); + break; + case '\'': + fputs("\\'", f); + break; + case '"': + fputs("\\\"", f); + break; + default: + if (0x20 <= c && c <= 0x7f) + putc(c, f); + else + fprintf(f, "\\x%02x", c); } } putc('"', f); @@ -1217,37 +1368,46 @@ static int tok_nextc(struct tok_state *tok) { int rc; - for (;;) { - if (tok->cur != tok->inp) { + for (;;) + { + if (tok->cur != tok->inp) + { tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } - if (tok->done != E_OK) { + if (tok->done != E_OK) + { return EOF; } - if (tok->fp == NULL) { + if (tok->fp == NULL) + { rc = tok_underflow_string(tok); } - else if (tok->prompt != NULL) { + else if (tok->prompt != NULL) + { rc = tok_underflow_interactive(tok); } - else { + else + { rc = tok_underflow_file(tok); } #if defined(Py_DEBUG) - if (tok->debug) { + if (tok->debug) + { fprintf(stderr, "line[%d] = ", tok->lineno); print_escape(stderr, tok->cur, tok->inp - tok->cur); fprintf(stderr, " tok->done = %d\n", tok->done); } #endif - if (!rc) { + if (!rc) + { tok->cur = tok->inp; return EOF; } tok->line_start = tok->cur; - if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { + if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) + { syntaxerror(tok, "source code cannot contain null bytes"); tok->cur = tok->inp; return EOF; @@ -1261,11 +1421,14 @@ tok_nextc(struct tok_state *tok) static void tok_backup(struct tok_state *tok, int c) { - if (c != EOF) { - if (--tok->cur < tok->buf) { + if (c != EOF) + { + if (--tok->cur < tok->buf) + { Py_FatalError("tokenizer beginning of buffer"); } - if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { + if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) + { Py_FatalError("tok_backup: wrong character"); } tok->col_offset--; @@ -1279,36 +1442,43 @@ _syntaxerror_range(struct tok_state *tok, const char *format, { PyObject *errmsg, *errtext, *args; errmsg = PyUnicode_FromFormatV(format, vargs); - if (!errmsg) { + if (!errmsg) + { goto error; } errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, "replace"); - if (!errtext) { + if (!errtext) + { goto error; } - if (col_offset == -1) { + if (col_offset == -1) + { col_offset = (int)PyUnicode_GET_LENGTH(errtext); } - if (end_col_offset == -1) { + if (end_col_offset == -1) + { end_col_offset = col_offset; } Py_ssize_t line_len = strcspn(tok->line_start, "\n"); - if (line_len != tok->cur - tok->line_start) { + if (line_len != tok->cur - tok->line_start) + { Py_DECREF(errtext); errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, "replace"); } - if (!errtext) { + if (!errtext) + { goto error; } args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, col_offset, errtext, tok->lineno, end_col_offset); - if (args) { + if (args) + { PyErr_SetObject(PyExc_SyntaxError, args); Py_DECREF(args); } @@ -1353,7 +1523,8 @@ indenterror(struct tok_state *tok) static int parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) { - if (!tok->report_warnings) { + if (!tok->report_warnings) + { return 0; } @@ -1362,13 +1533,16 @@ parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) va_start(vargs, format); errmsg = PyUnicode_FromFormatV(format, vargs); va_end(vargs); - if (!errmsg) { + if (!errmsg) + { goto error; } if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, - tok->lineno, NULL, NULL) < 0) { - if (PyErr_ExceptionMatches(category)) { + tok->lineno, NULL, NULL) < 0) + { + if (PyErr_ExceptionMatches(category)) + { /* Replace the DeprecationWarning exception with a SyntaxError to get a more accurate error report */ PyErr_Clear(); @@ -1389,28 +1563,31 @@ static int warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char) { - if (!tok->tok_report_warnings) { + if (!tok->tok_report_warnings) + { return 0; } PyObject *msg = PyUnicode_FromFormat( "invalid escape sequence '\\%c'", - (char) first_invalid_escape_char - ); + (char)first_invalid_escape_char); - if (msg == NULL) { + if (msg == NULL) + { return -1; } if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, tok->filename, - tok->lineno, NULL, NULL) < 0) { + tok->lineno, NULL, NULL) < 0) + { Py_DECREF(msg); - if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) { + if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) + { /* Replace the DeprecationWarning exception with a SyntaxError to get a more accurate error report */ PyErr_Clear(); - return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char); + return syntaxerror(tok, "invalid escape sequence '\\%c'", (char)first_invalid_escape_char); } return -1; @@ -1425,18 +1602,22 @@ lookahead(struct tok_state *tok, const char *test) { const char *s = test; int res = 0; - while (1) { + while (1) + { int c = tok_nextc(tok); - if (*s == 0) { + if (*s == 0) + { res = !is_potential_identifier_char(c); } - else if (c == *s) { + else if (c == *s) + { s++; continue; } tok_backup(tok, c); - while (s != test) { + while (s != test) + { tok_backup(tok, *--s); } return res; @@ -1457,43 +1638,52 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) * other keyword or identifier. */ int r = 0; - if (c == 'a') { + if (c == 'a') + { r = lookahead(tok, "nd"); } - else if (c == 'e') { + else if (c == 'e') + { r = lookahead(tok, "lse"); } - else if (c == 'f') { + else if (c == 'f') + { r = lookahead(tok, "or"); } - else if (c == 'i') { + else if (c == 'i') + { int c2 = tok_nextc(tok); - if (c2 == 'f' || c2 == 'n' || c2 == 's') { + if (c2 == 'f' || c2 == 'n' || c2 == 's') + { r = 1; } tok_backup(tok, c2); } - else if (c == 'o') { + else if (c == 'o') + { r = lookahead(tok, "r"); } - else if (c == 'n') { + else if (c == 'n') + { r = lookahead(tok, "ot"); } - if (r) { + if (r) + { tok_backup(tok, c); if (parser_warn(tok, PyExc_SyntaxWarning, - "invalid %s literal", kind)) + "invalid %s literal", kind)) { return 0; } tok_nextc(tok); } else /* In future releases, only error will remain. */ - if (is_potential_identifier_char(c)) { - tok_backup(tok, c); - syntaxerror(tok, "invalid %s literal", kind); - return 0; - } + if (is_potential_identifier_char(c)) + { + tok_backup(tok, c); + syntaxerror(tok, "invalid %s literal", kind); + return 0; + } return 1; } @@ -1507,31 +1697,39 @@ verify_identifier(struct tok_state *tok) if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); - if (s == NULL) { - if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { + if (s == NULL) + { + if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) + { tok->done = E_DECODE; } - else { + else + { tok->done = E_ERROR; } return 0; } Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); - if (invalid < 0) { + if (invalid < 0) + { Py_DECREF(s); tok->done = E_ERROR; return 0; } assert(PyUnicode_GET_LENGTH(s) > 0); - if (invalid < PyUnicode_GET_LENGTH(s)) { + if (invalid < PyUnicode_GET_LENGTH(s)) + { Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); - if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) + { /* Determine the offset in UTF-8 encoded input */ Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); - if (s != NULL) { + if (s != NULL) + { Py_SETREF(s, PyUnicode_AsUTF8String(s)); } - if (s == NULL) { + if (s == NULL) + { tok->done = E_ERROR; return 0; } @@ -1541,10 +1739,12 @@ verify_identifier(struct tok_state *tok) // PyUnicode_FromFormatV() does not support %X char hex[9]; (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch); - if (Py_UNICODE_ISPRINTABLE(ch)) { + if (Py_UNICODE_ISPRINTABLE(ch)) + { syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); } - else { + else + { syntaxerror(tok, "invalid non-printable character U+%s", hex); } return 0; @@ -1558,15 +1758,19 @@ tok_decimal_tail(struct tok_state *tok) { int c; - while (1) { - do { + while (1) + { + do + { c = tok_nextc(tok); } while (isdigit(c)); - if (c != '_') { + if (c != '_') + { break; } c = tok_nextc(tok); - if (!isdigit(c)) { + if (!isdigit(c)) + { tok_backup(tok, c); syntaxerror(tok, "invalid decimal literal"); return 0; @@ -1575,20 +1779,24 @@ tok_decimal_tail(struct tok_state *tok) return c; } - static inline int -tok_continuation_line(struct tok_state *tok) { +tok_continuation_line(struct tok_state *tok) +{ int c = tok_nextc(tok); - if (c != '\n') { + if (c != '\n') + { tok->done = E_LINECONT; return -1; } c = tok_nextc(tok); - if (c == EOF) { + if (c == EOF) + { tok->done = E_EOF; tok->cur = tok->inp; return -1; - } else { + } + else + { tok_backup(tok, c); } return c; @@ -1612,10 +1820,12 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st { assert((start == NULL && end == NULL) || (start != NULL && end != NULL)); token->level = tok->level; - if (ISSTRINGLIT(type)) { + if (ISSTRINGLIT(type)) + { token->lineno = tok->first_lineno; } - else { + else + { token->lineno = tok->lineno; } token->end_lineno = tok->lineno; @@ -1623,7 +1833,8 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st token->start = start; token->end = end; - if (start != NULL && end != NULL) { + if (start != NULL && end != NULL) + { token->col_offset = tok->starting_col_offset; token->end_col_offset = tok->col_offset; } @@ -1631,108 +1842,130 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st } static int -tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) +tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct token *token) { int c; int blankline, nonascii; const char *p_start = NULL; const char *p_end = NULL; - nextline: +nextline: tok->start = NULL; tok->starting_col_offset = -1; blankline = 0; /* Get indentation level */ - if (tok->atbol) { + if (tok->atbol) + { int col = 0; int altcol = 0; tok->atbol = 0; int cont_line_col = 0; - for (;;) { + for (;;) + { c = tok_nextc(tok); - if (c == ' ') { + if (c == ' ') + { col++, altcol++; } - else if (c == '\t') { + else if (c == '\t') + { col = (col / tok->tabsize + 1) * tok->tabsize; altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; } - else if (c == '\014') {/* Control-L (formfeed) */ + else if (c == '\014') + { /* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ } - else if (c == '\\') { + else if (c == '\\') + { // Indentation cannot be split over multiple physical lines // using backslashes. This means that if we found a backslash // preceded by whitespace, **the first one we find** determines // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; - if ((c = tok_continuation_line(tok)) == -1) { + if ((c = tok_continuation_line(tok)) == -1) + { return MAKE_TOKEN(ERRORTOKEN); } } - else { + else + { break; } } tok_backup(tok, c); - if (c == '#' || c == '\n') { + if (c == '#' || c == '\n') + { /* Lines with only whitespace and/or comments shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) { + if (col == 0 && c == '\n' && tok->prompt != NULL) + { blankline = 0; /* Let it through */ } - else if (tok->prompt != NULL && tok->lineno == 1) { + else if (tok->prompt != NULL && tok->lineno == 1) + { /* In interactive mode, if the first line contains only spaces and/or a comment, let it through. */ blankline = 0; col = altcol = 0; } - else { + else + { blankline = 1; /* Ignore completely */ } /* We can't jump back right here since we still may need to skip to the end of a comment */ } - if (!blankline && tok->level == 0) { + if (!blankline && tok->level == 0) + { col = cont_line_col ? cont_line_col : col; altcol = cont_line_col ? cont_line_col : altcol; - if (col == tok->indstack[tok->indent]) { + if (col == tok->indstack[tok->indent]) + { /* No change */ - if (altcol != tok->altindstack[tok->indent]) { + if (altcol != tok->altindstack[tok->indent]) + { return MAKE_TOKEN(indenterror(tok)); } } - else if (col > tok->indstack[tok->indent]) { + else if (col > tok->indstack[tok->indent]) + { /* Indent -- always one */ - if (tok->indent+1 >= MAXINDENT) { + if (tok->indent + 1 >= MAXINDENT) + { tok->done = E_TOODEEP; tok->cur = tok->inp; return MAKE_TOKEN(ERRORTOKEN); } - if (altcol <= tok->altindstack[tok->indent]) { + if (altcol <= tok->altindstack[tok->indent]) + { return MAKE_TOKEN(indenterror(tok)); } tok->pendin++; tok->indstack[++tok->indent] = col; tok->altindstack[tok->indent] = altcol; } - else /* col < tok->indstack[tok->indent] */ { + else /* col < tok->indstack[tok->indent] */ + { /* Dedent -- any number, must be consistent */ while (tok->indent > 0 && - col < tok->indstack[tok->indent]) { + col < tok->indstack[tok->indent]) + { tok->pendin--; tok->indent--; } - if (col != tok->indstack[tok->indent]) { + if (col != tok->indstack[tok->indent]) + { tok->done = E_DEDENT; tok->cur = tok->inp; return MAKE_TOKEN(ERRORTOKEN); } - if (altcol != tok->altindstack[tok->indent]) { + if (altcol != tok->altindstack[tok->indent]) + { return MAKE_TOKEN(indenterror(tok)); } } @@ -1743,12 +1976,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ - if (tok->pendin != 0) { - if (tok->pendin < 0) { + if (tok->pendin != 0) + { + if (tok->pendin < 0) + { tok->pendin++; return MAKE_TOKEN(DEDENT); } - else { + else + { tok->pendin--; return MAKE_TOKEN(INDENT); } @@ -1758,15 +1994,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t c = tok_nextc(tok); tok_backup(tok, c); /* Check if we are closing an async function */ - if (tok->async_def - && !blankline + if (tok->async_def && !blankline /* Due to some implementation artifacts of type comments, * a TYPE_COMMENT at the start of a function won't set an * indentation level and it will produce a NEWLINE after it. * To avoid spuriously ending an async function due to this, * wait until we have some non-newline char in front of us. */ - && c != '\n' - && tok->level == 0 + && c != '\n' && tok->level == 0 /* There was a NEWLINE after ASYNC DEF, so we're past the signature. */ && tok->async_def_nl @@ -1779,10 +2013,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->async_def_nl = 0; } - again: +again: tok->start = NULL; /* Skip spaces */ - do { + do + { c = tok_nextc(tok); } while (c == ' ' || c == '\t' || c == '\014'); @@ -1791,33 +2026,44 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t tok->starting_col_offset = tok->col_offset - 1; /* Skip comment, unless it's a type comment */ - if (c == '#') { + if (c == '#') + { - if (INSIDE_FSTRING(tok)) { + if (INSIDE_FSTRING(tok)) + { return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'")); } const char *prefix, *p, *type_start; int current_starting_col_offset; - while (c != EOF && c != '\n') { + while (c != EOF && c != '\n') + { c = tok_nextc(tok); } - if (tok->type_comments) { + if (tok->type_comments) + { p = tok->start; current_starting_col_offset = tok->starting_col_offset; prefix = type_comment_prefix; - while (*prefix && p < tok->cur) { - if (*prefix == ' ') { - while (*p == ' ' || *p == '\t') { + while (*prefix && p < tok->cur) + { + if (*prefix == ' ') + { + while (*p == ' ' || *p == '\t') + { p++; current_starting_col_offset++; } - } else if (*prefix == *p) { + } + else if (*prefix == *p) + { p++; current_starting_col_offset++; - } else { + } + else + { break; } @@ -1825,33 +2071,35 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* This is a type comment if we matched all of type_comment_prefix. */ - if (!*prefix) { + if (!*prefix) + { int is_type_ignore = 1; // +6 in order to skip the word 'ignore' const char *ignore_end = p + 6; const int ignore_end_col_offset = current_starting_col_offset + 6; - tok_backup(tok, c); /* don't eat the newline or EOF */ + tok_backup(tok, c); /* don't eat the newline or EOF */ type_start = p; /* A TYPE_IGNORE is "type: ignore" followed by the end of the token * or anything ASCII and non-alphanumeric. */ - is_type_ignore = ( - tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 - && !(tok->cur > ignore_end - && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); + is_type_ignore = (tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 && !(tok->cur > ignore_end && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); - if (is_type_ignore) { + if (is_type_ignore) + { p_start = ignore_end; p_end = tok->cur; /* If this type ignore is the only thing on the line, consume the newline also. */ - if (blankline) { + if (blankline) + { tok_nextc(tok); tok->atbol = 1; } return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); - } else { + } + else + { p_start = type_start; p_end = tok->cur; return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); @@ -1860,13 +2108,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } } - if (tok->done == E_INTERACT_STOP) { + if (tok->done == E_INTERACT_STOP) + { return MAKE_TOKEN(ENDMARKER); } /* Check for EOF and errors now */ - if (c == EOF) { - if (tok->level) { + if (c == EOF) + { + if (tok->level) + { return MAKE_TOKEN(ERRORTOKEN); } return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); @@ -1874,44 +2125,54 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Identifier (most frequent token!) */ nonascii = 0; - if (is_potential_identifier_start(c)) { + if (is_potential_identifier_start(c)) + { /* Process the various legal combinations of b"", r"", u"", and f"". */ int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; - while (1) { + while (1) + { if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) saw_b = 1; /* Since this is a backwards compatibility support literal we don't want to support it in arbitrary order like byte literals. */ - else if (!(saw_b || saw_u || saw_r || saw_f) - && (c == 'u'|| c == 'U')) { + else if (!(saw_b || saw_u || saw_r || saw_f) && (c == 'u' || c == 'U')) + { saw_u = 1; } /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { + else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) + { saw_r = 1; } - else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { + else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) + { saw_f = 1; } - else { + else + { break; } c = tok_nextc(tok); - if (c == '"' || c == '\'') { - if (saw_f) { + if (c == '"' || c == '\'') + { + if (saw_f) + { goto f_string_quote; } goto letter_quote; } } - while (is_potential_identifier_char(c)) { - if (c >= 128) { + while (is_potential_identifier_char(c)) + { + if (c >= 128) + { nonascii = 1; } c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) { + if (nonascii && !verify_identifier(tok)) + { return MAKE_TOKEN(ERRORTOKEN); } @@ -1919,7 +2180,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t p_end = tok->cur; /* async/await parsing block. */ - if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { + if (tok->cur - tok->start == 5 && tok->start[0] == 'a') + { /* May be an 'async' or 'await' token. For Python 3.7 or later we recognize them unconditionally. For Python 3.5 or 3.6 we recognize 'async' in front of 'def', and @@ -1928,16 +2190,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t but there's no *valid* Python 3.4 code that would be rejected, and async functions will be rejected in a later phase.) */ - if (!tok->async_hacks || tok->async_def) { + if (!tok->async_hacks || tok->async_def) + { /* Always recognize the keywords. */ - if (memcmp(tok->start, "async", 5) == 0) { + if (memcmp(tok->start, "async", 5) == 0) + { return MAKE_TOKEN(ASYNC); } - if (memcmp(tok->start, "await", 5) == 0) { + if (memcmp(tok->start, "await", 5) == 0) + { return MAKE_TOKEN(AWAIT); } } - else if (memcmp(tok->start, "async", 5) == 0) { + else if (memcmp(tok->start, "async", 5) == 0) + { /* The current token is 'async'. Look ahead one token to see if that is 'def'. */ @@ -1950,9 +2216,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t current_tok, &ahead_token); - if (ahead_tok_kind == NAME - && ahead_tok.cur - ahead_tok.start == 3 - && memcmp(ahead_tok.start, "def", 3) == 0) + if (ahead_tok_kind == NAME && ahead_tok.cur - ahead_tok.start == 3 && memcmp(ahead_tok.start, "def", 3) == 0) { /* The next token is going to be 'def', so instead of returning a plain NAME token, return ASYNC. */ @@ -1967,15 +2231,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Newline */ - if (c == '\n') { + if (c == '\n') + { tok->atbol = 1; - if (blankline || tok->level > 0) { + if (blankline || tok->level > 0) + { goto nextline; } p_start = tok->start; p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; - if (tok->async_def) { + if (tok->async_def) + { /* We're somewhere inside an 'async def' function, and we've encountered a NEWLINE after its signature. */ tok->async_def_nl = 1; @@ -1984,23 +2251,30 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Period or number starting with period? */ - if (c == '.') { + if (c == '.') + { c = tok_nextc(tok); - if (isdigit(c)) { + if (isdigit(c)) + { goto fraction; - } else if (c == '.') { + } + else if (c == '.') + { c = tok_nextc(tok); - if (c == '.') { + if (c == '.') + { p_start = tok->start; p_end = tok->cur; return MAKE_TOKEN(ELLIPSIS); } - else { + else + { tok_backup(tok, c); } tok_backup(tok, '.'); } - else { + else + { tok_backup(tok, c); } p_start = tok->start; @@ -2009,169 +2283,218 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Number */ - if (isdigit(c)) { - if (c == '0') { + if (isdigit(c)) + { + if (c == '0') + { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); - if (c == 'x' || c == 'X') { + if (c == 'x' || c == 'X') + { /* Hex */ c = tok_nextc(tok); - do { - if (c == '_') { + do + { + if (c == '_') + { c = tok_nextc(tok); } - if (!isxdigit(c)) { + if (!isxdigit(c)) + { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } - do { + do + { c = tok_nextc(tok); } while (isxdigit(c)); } while (c == '_'); - if (!verify_end_of_number(tok, c, "hexadecimal")) { + if (!verify_end_of_number(tok, c, "hexadecimal")) + { return MAKE_TOKEN(ERRORTOKEN); } } - else if (c == 'o' || c == 'O') { + else if (c == 'o' || c == 'O') + { /* Octal */ c = tok_nextc(tok); - do { - if (c == '_') { + do + { + if (c == '_') + { c = tok_nextc(tok); } - if (c < '0' || c >= '8') { - if (isdigit(c)) { + if (c < '0' || c >= '8') + { + if (isdigit(c)) + { return MAKE_TOKEN(syntaxerror(tok, - "invalid digit '%c' in octal literal", c)); + "invalid digit '%c' in octal literal", c)); } - else { + else + { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal")); } } - do { + do + { c = tok_nextc(tok); } while ('0' <= c && c < '8'); } while (c == '_'); - if (isdigit(c)) { + if (isdigit(c)) + { return MAKE_TOKEN(syntaxerror(tok, - "invalid digit '%c' in octal literal", c)); + "invalid digit '%c' in octal literal", c)); } - if (!verify_end_of_number(tok, c, "octal")) { + if (!verify_end_of_number(tok, c, "octal")) + { return MAKE_TOKEN(ERRORTOKEN); } } - else if (c == 'b' || c == 'B') { + else if (c == 'b' || c == 'B') + { /* Binary */ c = tok_nextc(tok); - do { - if (c == '_') { + do + { + if (c == '_') + { c = tok_nextc(tok); } - if (c != '0' && c != '1') { - if (isdigit(c)) { + if (c != '0' && c != '1') + { + if (isdigit(c)) + { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } - else { + else + { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal")); } } - do { + do + { c = tok_nextc(tok); } while (c == '0' || c == '1'); } while (c == '_'); - if (isdigit(c)) { + if (isdigit(c)) + { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } - if (!verify_end_of_number(tok, c, "binary")) { + if (!verify_end_of_number(tok, c, "binary")) + { return MAKE_TOKEN(ERRORTOKEN); } } - else { + else + { int nonzero = 0; /* maybe old-style octal; c is first char of it */ /* in any case, allow '0' as a literal */ - while (1) { - if (c == '_') { + while (1) + { + if (c == '_') + { c = tok_nextc(tok); - if (!isdigit(c)) { + if (!isdigit(c)) + { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } - if (c != '0') { + if (c != '0') + { break; } c = tok_nextc(tok); } - char* zeros_end = tok->cur; - if (isdigit(c)) { + char *zeros_end = tok->cur; + if (isdigit(c)) + { nonzero = 1; c = tok_decimal_tail(tok); - if (c == 0) { + if (c == 0) + { return MAKE_TOKEN(ERRORTOKEN); } } - if (c == '.') { + if (c == '.') + { c = tok_nextc(tok); goto fraction; } - else if (c == 'e' || c == 'E') { + else if (c == 'e' || c == 'E') + { goto exponent; } - else if (c == 'j' || c == 'J') { + else if (c == 'j' || c == 'J') + { goto imaginary; } - else if (nonzero) { + else if (nonzero) + { /* Old-style octal: now disallowed. */ tok_backup(tok, c); return MAKE_TOKEN(syntaxerror_known_range( - tok, (int)(tok->start + 1 - tok->line_start), - (int)(zeros_end - tok->line_start), - "leading zeros in decimal integer " - "literals are not permitted; " - "use an 0o prefix for octal integers")); + tok, (int)(tok->start + 1 - tok->line_start), + (int)(zeros_end - tok->line_start), + "leading zeros in decimal integer " + "literals are not permitted; " + "use an 0o prefix for octal integers")); } - if (!verify_end_of_number(tok, c, "decimal")) { + if (!verify_end_of_number(tok, c, "decimal")) + { return MAKE_TOKEN(ERRORTOKEN); } } } - else { + else + { /* Decimal */ c = tok_decimal_tail(tok); - if (c == 0) { + if (c == 0) + { return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ - if (c == '.') { + if (c == '.') + { c = tok_nextc(tok); - fraction: + fraction: /* Fraction */ - if (isdigit(c)) { + if (isdigit(c)) + { c = tok_decimal_tail(tok); - if (c == 0) { + if (c == 0) + { return MAKE_TOKEN(ERRORTOKEN); } } } - if (c == 'e' || c == 'E') { + if (c == 'e' || c == 'E') + { int e; - exponent: + exponent: e = c; /* Exponent part */ c = tok_nextc(tok); - if (c == '+' || c == '-') { + if (c == '+' || c == '-') + { c = tok_nextc(tok); - if (!isdigit(c)) { + if (!isdigit(c)) + { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } - } else if (!isdigit(c)) { + } + else if (!isdigit(c)) + { tok_backup(tok, c); - if (!verify_end_of_number(tok, e, "decimal")) { + if (!verify_end_of_number(tok, e, "decimal")) + { return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); @@ -2180,19 +2503,23 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(NUMBER); } c = tok_decimal_tail(tok); - if (c == 0) { + if (c == 0) + { return MAKE_TOKEN(ERRORTOKEN); } } - if (c == 'j' || c == 'J') { + if (c == 'j' || c == 'J') + { /* Imaginary part */ - imaginary: + imaginary: c = tok_nextc(tok); - if (!verify_end_of_number(tok, c, "imaginary")) { + if (!verify_end_of_number(tok, c, "imaginary")) + { return MAKE_TOKEN(ERRORTOKEN); } } - else if (!verify_end_of_number(tok, c, "decimal")) { + else if (!verify_end_of_number(tok, c, "decimal")) + { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2203,10 +2530,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(NUMBER); } - f_string_quote: - if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) { +f_string_quote: + if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) + { int quote = c; - int quote_size = 1; /* 1 or 3 */ + int quote_size = 1; /* 1 or 3 */ /* Nodes of type STRING, especially multi line strings must be handled differently in order to get both @@ -2217,22 +2545,25 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Find the quote size and start of string */ int after_quote = tok_nextc(tok); - if (after_quote == quote) { + if (after_quote == quote) + { int after_after_quote = tok_nextc(tok); - if (after_after_quote == quote) { + if (after_after_quote == quote) + { quote_size = 3; } - else { + else + { // TODO: Check this tok_backup(tok, after_after_quote); tok_backup(tok, after_quote); } } - if (after_quote != quote) { + if (after_quote != quote) + { tok_backup(tok, after_quote); } - p_start = tok->start; p_end = tok->cur; tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); @@ -2248,17 +2579,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t the_current_tok->last_expr_end = -1; the_current_tok->f_string_debug = 0; - switch (*tok->start) { - case 'F': - case 'f': - the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; - break; - case 'R': - case 'r': - the_current_tok->f_string_raw = 1; - break; - default: - Py_UNREACHABLE(); + switch (*tok->start) + { + case 'F': + case 'f': + the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; + break; + case 'R': + case 'r': + the_current_tok->f_string_raw = 1; + break; + default: + Py_UNREACHABLE(); } the_current_tok->curly_bracket_depth = 0; @@ -2266,11 +2598,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t return MAKE_TOKEN(FSTRING_START); } - letter_quote: +letter_quote: /* String */ - if (c == '\'' || c == '"') { + if (c == '\'' || c == '"') + { int quote = c; - int quote_size = 1; /* 1 or 3 */ + int quote_size = 1; /* 1 or 3 */ int end_quote_size = 0; /* Nodes of type STRING, especially multi line strings @@ -2282,25 +2615,31 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Find the quote size and start of string */ c = tok_nextc(tok); - if (c == quote) { + if (c == quote) + { c = tok_nextc(tok); - if (c == quote) { + if (c == quote) + { quote_size = 3; } - else { - end_quote_size = 1; /* empty string found */ + else + { + end_quote_size = 1; /* empty string found */ } } - if (c != quote) { + if (c != quote) + { tok_backup(tok, c); } /* Get rest of string */ - while (end_quote_size != quote_size) { + while (end_quote_size != quote_size) + { c = tok_nextc(tok); if (tok->done == E_DECODE) break; - if (c == EOF || (quote_size == 1 && c == '\n')) { + if (c == EOF || (quote_size == 1 && c == '\n')) + { assert(tok->multi_line_start != NULL); // shift the tok_state's location into // the start of string, and report the error @@ -2311,7 +2650,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t int start = tok->lineno; tok->lineno = tok->first_lineno; - if (INSIDE_FSTRING(tok)) { + if (INSIDE_FSTRING(tok)) + { /* When we are in an f-string, before raising the * unterminated string literal error, check whether * does the initial quote matches with f-strings quotes @@ -2319,35 +2659,45 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t * so raise the proper error */ tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); if (the_current_tok->f_string_quote == quote && - the_current_tok->f_string_quote_size == quote_size) { + the_current_tok->f_string_quote_size == quote_size) + { return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start)); } } - if (quote_size == 3) { + if (quote_size == 3) + { syntaxerror(tok, "unterminated triple-quoted string literal" - " (detected at line %d)", start); - if (c != '\n') { + " (detected at line %d)", + start); + if (c != '\n') + { tok->done = E_EOFS; } return MAKE_TOKEN(ERRORTOKEN); } - else { + else + { syntaxerror(tok, "unterminated string literal (detected at" - " line %d)", start); - if (c != '\n') { + " line %d)", + start); + if (c != '\n') + { tok->done = E_EOLS; } return MAKE_TOKEN(ERRORTOKEN); } } - if (c == quote) { + if (c == quote) + { end_quote_size += 1; } - else { + else + { end_quote_size = 0; - if (c == '\\') { - tok_nextc(tok); /* skip escaped char */ + if (c == '\\') + { + tok_nextc(tok); /* skip escaped char */ } } } @@ -2358,8 +2708,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Line continuation */ - if (c == '\\') { - if ((c = tok_continuation_line(tok)) == -1) { + if (c == '\\') + { + if ((c = tok_continuation_line(tok)) == -1) + { return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; @@ -2368,19 +2720,23 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Punctuation character */ int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); - if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { + if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) + { /* This code block gets executed before the curly_bracket_depth is incremented * by the `{` case, so for ensuring that we are on the 0th level, we need * to adjust it manually */ int cursor = current_tok->curly_bracket_depth - (c != '{'); - if (cursor == 0 && !update_fstring_expr(tok, c)) { + if (cursor == 0 && !update_fstring_expr(tok, c)) + { return MAKE_TOKEN(ENDMARKER); } - if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) { + if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) + { return MAKE_TOKEN(ERRORTOKEN); } - if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { + if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) + { current_tok->kind = TOK_FSTRING_MODE; p_start = tok->start; p_end = tok->cur; @@ -2392,13 +2748,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t { int c2 = tok_nextc(tok); int current_token = _PyToken_TwoChars(c, c2); - if (current_token != OP) { + if (current_token != OP) + { int c3 = tok_nextc(tok); int current_token3 = _PyToken_ThreeChars(c, c2, c3); - if (current_token3 != OP) { + if (current_token3 != OP) + { current_token = current_token3; } - else { + else + { tok_backup(tok, c3); } p_start = tok->start; @@ -2409,26 +2768,31 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } /* Keep track of parentheses nesting level */ - switch (c) { + switch (c) + { case '(': case '[': case '{': - if (tok->level >= MAXLEVEL || tok->tok_mode_stack_index + 1 >= MAXLEVEL) { + if (tok->level >= MAXLEVEL || tok->tok_mode_stack_index + 1 >= MAXLEVEL) + { return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; tok->parenlinenostack[tok->level] = tok->lineno; tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); tok->level++; - if (INSIDE_FSTRING(tok)) { + if (INSIDE_FSTRING(tok)) + { current_tok->curly_bracket_depth++; } break; case ')': case ']': case '}': - if (!tok->level) { - if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { + if (!tok->level) + { + if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') + { return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed")); } return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c)); @@ -2444,30 +2808,36 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t nested expression, then instead of matching a different syntactical construct with it; we'll throw an unmatched parentheses error. */ - if (INSIDE_FSTRING(tok) && opening == '{') { + if (INSIDE_FSTRING(tok) && opening == '{') + { assert(current_tok->curly_bracket_depth >= 0); int previous_bracket = current_tok->curly_bracket_depth - 1; - if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { + if (previous_bracket == current_tok->curly_bracket_expr_start_depth) + { return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c)); } } - if (tok->parenlinenostack[tok->level] != tok->lineno) { + if (tok->parenlinenostack[tok->level] != tok->lineno) + { return MAKE_TOKEN(syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level])); + "closing parenthesis '%c' does not match " + "opening parenthesis '%c' on line %d", + c, opening, tok->parenlinenostack[tok->level])); } - else { + else + { return MAKE_TOKEN(syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c'", - c, opening)); + "closing parenthesis '%c' does not match " + "opening parenthesis '%c'", + c, opening)); } } - if (INSIDE_FSTRING(tok)) { + if (INSIDE_FSTRING(tok)) + { current_tok->curly_bracket_depth--; - if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { + if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) + { current_tok->curly_bracket_expr_start_depth--; current_tok->kind = TOK_FSTRING_MODE; current_tok->f_string_debug = 0; @@ -2478,13 +2848,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t break; } - if (!Py_UNICODE_ISPRINTABLE(c)) { + if (!Py_UNICODE_ISPRINTABLE(c)) + { char hex[9]; (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex)); } - if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { + if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) + { current_tok->f_string_debug = 1; } @@ -2495,7 +2867,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } static int -tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) +tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct token *token) { const char *p_start = NULL; const char *p_end = NULL; @@ -2509,33 +2881,40 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize // before it. int start_char = tok_nextc(tok); - if (start_char == '{') { + if (start_char == '{') + { int peek1 = tok_nextc(tok); tok_backup(tok, peek1); tok_backup(tok, start_char); - if (peek1 != '{') { + if (peek1 != '{') + { current_tok->curly_bracket_expr_start_depth++; - if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) + { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; return tok_get_normal_mode(tok, current_tok, token); } } - else { + else + { tok_backup(tok, start_char); } // Check if we are at the end of the string - for (int i = 0; i < current_tok->f_string_quote_size; i++) { + for (int i = 0; i < current_tok->f_string_quote_size; i++) + { int quote = tok_nextc(tok); - if (quote != current_tok->f_string_quote) { + if (quote != current_tok->f_string_quote) + { tok_backup(tok, quote); goto f_string_middle; } } - if (current_tok->last_expr_buffer != NULL) { + if (current_tok->last_expr_buffer != NULL) + { PyMem_Free(current_tok->last_expr_buffer); current_tok->last_expr_buffer = NULL; current_tok->last_expr_size = 0; @@ -2549,10 +2928,13 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct f_string_middle: - while (end_quote_size != current_tok->f_string_quote_size) { + while (end_quote_size != current_tok->f_string_quote_size) + { int c = tok_nextc(tok); - if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) { - if (tok->decoding_erred) { + if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) + { + if (tok->decoding_erred) + { return MAKE_TOKEN(ERRORTOKEN); } @@ -2566,49 +2948,61 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct int start = tok->lineno; tok->lineno = tok->first_lineno; - if (current_tok->f_string_quote_size == 3) { + if (current_tok->f_string_quote_size == 3) + { return MAKE_TOKEN(syntaxerror(tok, - "unterminated triple-quoted f-string literal" - " (detected at line %d)", start)); + "unterminated triple-quoted f-string literal" + " (detected at line %d)", + start)); } - else { + else + { return MAKE_TOKEN(syntaxerror(tok, - "unterminated f-string literal (detected at" - " line %d)", start)); + "unterminated f-string literal (detected at" + " line %d)", + start)); } } - if (c == current_tok->f_string_quote) { + if (c == current_tok->f_string_quote) + { end_quote_size += 1; continue; - } else { + } + else + { end_quote_size = 0; } - int in_format_spec = ( - current_tok->last_expr_end != -1 - && - INSIDE_FSTRING_EXPR(current_tok) - ); - if (c == '{') { + int in_format_spec = (current_tok->last_expr_end != -1 && + INSIDE_FSTRING_EXPR(current_tok)); + if (c == '{') + { int peek = tok_nextc(tok); - if (peek != '{' || in_format_spec) { + if (peek != '{' || in_format_spec) + { tok_backup(tok, peek); tok_backup(tok, c); current_tok->curly_bracket_expr_start_depth++; - if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) + { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; p_start = tok->start; p_end = tok->cur; - } else { + } + else + { p_start = tok->start; p_end = tok->cur - 1; } return MAKE_TOKEN(FSTRING_MIDDLE); - } else if (c == '}') { - if (unicode_escape) { + } + else if (c == '}') + { + if (unicode_escape) + { p_start = tok->start; p_end = tok->cur; return MAKE_TOKEN(FSTRING_MIDDLE); @@ -2619,10 +3013,13 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // scanning (indicated by the end of the expression being set) and we are not at the top level // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double // brackets, we can bypass it here. - if (peek == '}' && !in_format_spec) { + if (peek == '}' && !in_format_spec) + { p_start = tok->start; p_end = tok->cur - 1; - } else { + } + else + { tok_backup(tok, peek); tok_backup(tok, c); TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; @@ -2630,14 +3027,19 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct p_end = tok->cur; } return MAKE_TOKEN(FSTRING_MIDDLE); - } else if (c == '\\') { + } + else if (c == '\\') + { int peek = tok_nextc(tok); // Special case when the backslash is right before a curly // brace. We have to restore and return the control back // to the loop for the next iteration. - if (peek == '{' || peek == '}') { - if (!current_tok->f_string_raw) { - if (warn_invalid_escape_sequence(tok, peek)) { + if (peek == '{' || peek == '}') + { + if (!current_tok->f_string_raw) + { + if (warn_invalid_escape_sequence(tok, peek)) + { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2645,13 +3047,18 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct continue; } - if (!current_tok->f_string_raw) { - if (peek == 'N') { + if (!current_tok->f_string_raw) + { + if (peek == 'N') + { /* Handle named unicode escapes (\N{BULLET}) */ peek = tok_nextc(tok); - if (peek == '{') { + if (peek == '{') + { unicode_escape = 1; - } else { + } + else + { tok_backup(tok, peek); } } @@ -2663,7 +3070,8 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct // Backup the f-string quotes to emit a final FSTRING_MIDDLE and // add the quotes to the FSTRING_END in the next tokenizer iteration. - for (int i = 0; i < current_tok->f_string_quote_size; i++) { + for (int i = 0; i < current_tok->f_string_quote_size; i++) + { tok_backup(tok, current_tok->f_string_quote); } p_start = tok->start; @@ -2671,23 +3079,25 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct return MAKE_TOKEN(FSTRING_MIDDLE); } - static int tok_get(struct tok_state *tok, struct token *token) { tokenizer_mode *current_tok = TOK_GET_MODE(tok); - if (current_tok->kind == TOK_REGULAR_MODE) { + if (current_tok->kind == TOK_REGULAR_MODE) + { return tok_get_normal_mode(tok, current_tok, token); - } else { + } + else + { return tok_get_fstring_mode(tok, current_tok, token); } } -int -_PyTokenizer_Get(struct tok_state *tok, struct token *token) +int _PyTokenizer_Get(struct tok_state *tok, struct token *token) { int result = tok_get(tok, token); - if (tok->decoding_erred) { + if (tok->decoding_erred) + { result = ERRORTOKEN; tok->done = E_DECODE; } @@ -2697,7 +3107,8 @@ _PyTokenizer_Get(struct tok_state *tok, struct token *token) #if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3)) // fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's // dup() emulation with open() is slow. -typedef union { +typedef union +{ void *cookie; int fd; } borrowed; @@ -2710,7 +3121,8 @@ borrow_read(void *cookie, char *buf, size_t size) } static FILE * -fdopen_borrow(int fd) { +fdopen_borrow(int fd) +{ // supports only reading. seek fails. close and write are no-ops. cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL}; borrowed b = {.fd = fd}; @@ -2718,9 +3130,11 @@ fdopen_borrow(int fd) { } #else static FILE * -fdopen_borrow(int fd) { +fdopen_borrow(int fd) +{ fd = _Py_dup(fd); - if (fd < 0) { + if (fd < 0) + { return NULL; } return fdopen(fd, "r"); @@ -2745,20 +3159,25 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) char *encoding = NULL; fp = fdopen_borrow(fd); - if (fp == NULL) { + if (fp == NULL) + { return NULL; } tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL); - if (tok == NULL) { + if (tok == NULL) + { fclose(fp); return NULL; } - if (filename != NULL) { + if (filename != NULL) + { tok->filename = Py_NewRef(filename); } - else { + else + { tok->filename = PyUnicode_FromString(""); - if (tok->filename == NULL) { + if (tok->filename == NULL) + { fclose(fp); _PyTokenizer_Free(tok); return encoding; @@ -2768,13 +3187,16 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) // We don't want to report warnings here because it could cause infinite recursion // if fetching the encoding shows a warning. tok->report_warnings = 0; - while (tok->lineno < 2 && tok->done == E_OK) { + while (tok->lineno < 2 && tok->done == E_OK) + { _PyTokenizer_Get(tok, &token); } fclose(fp); - if (tok->encoding) { + if (tok->encoding) + { encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1); - if (encoding) { + if (encoding) + { strcpy(encoding, tok->encoding); } } @@ -2783,11 +3205,10 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) } #ifdef Py_DEBUG -void -tok_dump(int type, char *start, char *end) +void tok_dump(int type, char *start, char *end) { fprintf(stderr, "%s", _PyParser_TokenNames[type]); if (type == NAME || type == NUMBER || type == STRING || type == OP) fprintf(stderr, "(%.*s)", (int)(end - start), start); } -#endif // Py_DEBUG +#endif // Py_DEBUG From d07ddb41fe7aee8a8b8338a52dc905de346e8fe8 Mon Sep 17 00:00:00 2001 From: jx124 <64946984+jx124@users.noreply.github.com> Date: Tue, 2 May 2023 01:06:34 +0800 Subject: [PATCH 5/8] Undo accidental reformat --- Parser/tokenizer.c | 1595 ++++++++++++++++---------------------------- 1 file changed, 587 insertions(+), 1008 deletions(-) diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 6a10bf9c2cad5f..1f0e8362f9e21c 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -3,7 +3,7 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "pycore_call.h" // _PyObject_CallNoArgs() +#include "pycore_call.h" // _PyObject_CallNoArgs() #include #include @@ -14,33 +14,39 @@ /* Alternate tab spacing */ #define ALTTABSIZE 1 -#define is_potential_identifier_start(c) ( \ - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c >= 128)) +#define is_potential_identifier_start(c) (\ + (c >= 'a' && c <= 'z')\ + || (c >= 'A' && c <= 'Z')\ + || c == '_'\ + || (c >= 128)) + +#define is_potential_identifier_char(c) (\ + (c >= 'a' && c <= 'z')\ + || (c >= 'A' && c <= 'Z')\ + || (c >= '0' && c <= '9')\ + || c == '_'\ + || (c >= 128)) -#define is_potential_identifier_char(c) ( \ - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || (c >= 128)) /* Don't ever change this -- it would break the portability of Python code */ #define TABSIZE 8 #define MAKE_TOKEN(token_type) token_setup(tok, token, token_type, p_start, p_end) -#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) ( \ - type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) +#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\ + type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end)) #define ADVANCE_LINENO() \ - tok->lineno++; \ - tok->col_offset = 0; + tok->lineno++; \ + tok->col_offset = 0; #define INSIDE_FSTRING(tok) (tok->tok_mode_stack_index > 0) #define INSIDE_FSTRING_EXPR(tok) (tok->curly_bracket_expr_start_depth >= 0) #ifdef Py_DEBUG -static inline tokenizer_mode *TOK_GET_MODE(struct tok_state *tok) -{ +static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); assert(tok->tok_mode_stack_index < MAXLEVEL); return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); } -static inline tokenizer_mode *TOK_NEXT_MODE(struct tok_state *tok) -{ +static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); assert(tok->tok_mode_stack_index + 1 < MAXLEVEL); return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); @@ -58,7 +64,7 @@ static int syntaxerror(struct tok_state *tok, const char *format, ...); /* Spaces in this constant are treated as "zero or more spaces or tabs" when tokenizing. */ -static const char *type_comment_prefix = "# type: "; +static const char* type_comment_prefix = "# type: "; /* Create and initialize a new tok_state structure */ @@ -66,7 +72,7 @@ static struct tok_state * tok_new(void) { struct tok_state *tok = (struct tok_state *)PyMem_Malloc( - sizeof(struct tok_state)); + sizeof(struct tok_state)); if (tok == NULL) return NULL; tok->buf = tok->cur = tok->inp = NULL; @@ -105,7 +111,7 @@ tok_new(void) tok->interactive_underflow = IUNDERFLOW_NORMAL; tok->str = NULL; tok->report_warnings = 1; - tok->tok_mode_stack[0] = (tokenizer_mode){.kind = TOK_REGULAR_MODE, .f_string_quote = '\0', .f_string_quote_size = 0, .f_string_debug = 0}; + tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .f_string_quote='\0', .f_string_quote_size = 0, .f_string_debug=0}; tok->tok_mode_stack_index = 0; tok->tok_report_warnings = 1; #ifdef Py_DEBUG @@ -117,9 +123,8 @@ tok_new(void) static char * new_string(const char *s, Py_ssize_t len, struct tok_state *tok) { - char *result = (char *)PyMem_Malloc(len + 1); - if (!result) - { + char* result = (char *)PyMem_Malloc(len + 1); + if (!result) { tok->done = E_NOMEM; return NULL; } @@ -138,16 +143,16 @@ error_ret(struct tok_state *tok) /* XXX */ tok->start = NULL; tok->end = NULL; tok->done = E_DECODE; - return NULL; /* as if it were EOF */ + return NULL; /* as if it were EOF */ } + static const char * -get_normal_name(const char *s) /* for utf-8 and latin-1 */ +get_normal_name(const char *s) /* for utf-8 and latin-1 */ { char buf[13]; int i; - for (i = 0; i < 12; i++) - { + for (i = 0; i < 12; i++) { int c = s[i]; if (c == '\0') break; @@ -180,24 +185,20 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t *spec = NULL; /* Coding spec must be in a comment, and that comment must be * the only statement on the source code line. */ - for (i = 0; i < size - 6; i++) - { + for (i = 0; i < size - 6; i++) { if (s[i] == '#') break; if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014') return 1; } - for (; i < size - 6; i++) - { /* XXX inefficient search */ - const char *t = s + i; - if (memcmp(t, "coding", 6) == 0) - { - const char *begin = NULL; + for (; i < size - 6; i++) { /* XXX inefficient search */ + const char* t = s + i; + if (memcmp(t, "coding", 6) == 0) { + const char* begin = NULL; t += 6; if (t[0] != ':' && t[0] != '=') continue; - do - { + do { t++; } while (t[0] == ' ' || t[0] == '\t'); @@ -206,15 +207,13 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t t[0] == '-' || t[0] == '_' || t[0] == '.') t++; - if (begin < t) - { - char *r = new_string(begin, t - begin, tok); - const char *q; + if (begin < t) { + char* r = new_string(begin, t - begin, tok); + const char* q; if (!r) return 0; q = get_normal_name(r); - if (r != q) - { + if (r != q) { PyMem_Free(r); r = new_string(q, strlen(q), tok); if (!r) @@ -234,29 +233,24 @@ get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *t Return 1 on success, 0 on failure. */ static int -check_coding_spec(const char *line, Py_ssize_t size, struct tok_state *tok, +check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok, int set_readline(struct tok_state *, const char *)) { char *cs; - if (tok->cont_line) - { + if (tok->cont_line) { /* It's a continuation line, so it can't be a coding spec. */ tok->decoding_state = STATE_NORMAL; return 1; } - if (!get_coding_spec(line, &cs, size, tok)) - { + if (!get_coding_spec(line, &cs, size, tok)) { return 0; } - if (!cs) - { + if (!cs) { Py_ssize_t i; - for (i = 0; i < size; i++) - { + for (i = 0; i < size; i++) { if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') break; - if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') - { + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { /* Stop checking coding spec after a line containing * anything except a comment. */ tok->decoding_state = STATE_NORMAL; @@ -266,22 +260,17 @@ check_coding_spec(const char *line, Py_ssize_t size, struct tok_state *tok, return 1; } tok->decoding_state = STATE_NORMAL; - if (tok->encoding == NULL) - { + if (tok->encoding == NULL) { assert(tok->decoding_readline == NULL); - if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) - { + if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) { error_ret(tok); PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs); PyMem_Free(cs); return 0; } tok->encoding = cs; - } - else - { /* then, compare cs with BOM */ - if (strcmp(tok->encoding, cs) != 0) - { + } else { /* then, compare cs with BOM */ + if (strcmp(tok->encoding, cs) != 0) { error_ret(tok); PyErr_Format(PyExc_SyntaxError, "encoding problem: %s with BOM", cs); @@ -306,30 +295,23 @@ check_bom(int get_char(struct tok_state *), int ch1, ch2, ch3; ch1 = get_char(tok); tok->decoding_state = STATE_SEEK_CODING; - if (ch1 == EOF) - { + if (ch1 == EOF) { return 1; - } - else if (ch1 == 0xEF) - { + } else if (ch1 == 0xEF) { ch2 = get_char(tok); - if (ch2 != 0xBB) - { + if (ch2 != 0xBB) { unget_char(ch2, tok); unget_char(ch1, tok); return 1; } ch3 = get_char(tok); - if (ch3 != 0xBF) - { + if (ch3 != 0xBF) { unget_char(ch3, tok); unget_char(ch2, tok); unget_char(ch1, tok); return 1; } - } - else - { + } else { unget_char(ch1, tok); return 1; } @@ -343,29 +325,24 @@ check_bom(int get_char(struct tok_state *), } static int -tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) -{ +tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) { assert(tok->fp_interactive); - if (!line) - { + if (!line) { return 0; } Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start; Py_ssize_t line_size = strlen(line); char last_char = line[line_size > 0 ? line_size - 1 : line_size]; - if (last_char != '\n') - { + if (last_char != '\n') { line_size += 1; } - char *new_str = tok->interactive_src_start; + char* new_str = tok->interactive_src_start; new_str = PyMem_Realloc(new_str, current_size + line_size + 1); - if (!new_str) - { - if (tok->interactive_src_start) - { + if (!new_str) { + if (tok->interactive_src_start) { PyMem_Free(tok->interactive_src_start); } tok->interactive_src_start = NULL; @@ -374,8 +351,7 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) return -1; } strcpy(new_str + current_size, line); - if (last_char != '\n') - { + if (last_char != '\n') { /* Last line does not end in \n, fake one */ new_str[current_size + line_size - 1] = '\n'; new_str[current_size + line_size] = '\0'; @@ -393,8 +369,7 @@ remember_fstring_buffers(struct tok_state *tok) int index; tokenizer_mode *mode; - for (index = tok->tok_mode_stack_index; index >= 0; --index) - { + for (index = tok->tok_mode_stack_index; index >= 0; --index) { mode = &(tok->tok_mode_stack[index]); mode->f_string_start_offset = mode->f_string_start - tok->buf; mode->f_string_multi_line_start_offset = mode->f_string_multi_line_start - tok->buf; @@ -408,8 +383,7 @@ restore_fstring_buffers(struct tok_state *tok) int index; tokenizer_mode *mode; - for (index = tok->tok_mode_stack_index; index >= 0; --index) - { + for (index = tok->tok_mode_stack_index; index >= 0; --index) { mode = &(tok->tok_mode_stack[index]); mode->f_string_start = tok->buf + mode->f_string_start_offset; mode->f_string_multi_line_start = tok->buf + mode->f_string_multi_line_start_offset; @@ -417,23 +391,21 @@ restore_fstring_buffers(struct tok_state *tok) } static int -set_fstring_expr(struct tok_state *tok, struct token *token, char c) -{ +set_fstring_expr(struct tok_state* tok, struct token *token, char c) { assert(token != NULL); assert(c == '}' || c == ':' || c == '!'); tokenizer_mode *tok_mode = TOK_GET_MODE(tok); - if (!tok_mode->f_string_debug || token->metadata) - { + if (!tok_mode->f_string_debug || token->metadata) { return 0; } PyObject *res = PyUnicode_DecodeUTF8( tok_mode->last_expr_buffer, tok_mode->last_expr_size - tok_mode->last_expr_end, - NULL); - if (!res) - { + NULL + ); + if (!res) { return -1; } token->metadata = res; @@ -448,49 +420,44 @@ update_fstring_expr(struct tok_state *tok, char cur) Py_ssize_t size = strlen(tok->cur); tokenizer_mode *tok_mode = TOK_GET_MODE(tok); - switch (cur) - { - case 0: - if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) - { - return 1; - } - char *new_buffer = PyMem_Realloc( - tok_mode->last_expr_buffer, - tok_mode->last_expr_size + size); - if (new_buffer == NULL) - { - PyMem_Free(tok_mode->last_expr_buffer); - goto error; - } - tok_mode->last_expr_buffer = new_buffer; - strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); - tok_mode->last_expr_size += size; - break; - case '{': - if (tok_mode->last_expr_buffer != NULL) - { - PyMem_Free(tok_mode->last_expr_buffer); - } - tok_mode->last_expr_buffer = PyMem_Malloc(size); - if (tok_mode->last_expr_buffer == NULL) - { - goto error; - } - tok_mode->last_expr_size = size; - tok_mode->last_expr_end = -1; - strncpy(tok_mode->last_expr_buffer, tok->cur, size); - break; - case '}': - case '!': - case ':': - if (tok_mode->last_expr_end == -1) - { - tok_mode->last_expr_end = strlen(tok->start); - } - break; - default: - Py_UNREACHABLE(); + switch (cur) { + case 0: + if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) { + return 1; + } + char *new_buffer = PyMem_Realloc( + tok_mode->last_expr_buffer, + tok_mode->last_expr_size + size + ); + if (new_buffer == NULL) { + PyMem_Free(tok_mode->last_expr_buffer); + goto error; + } + tok_mode->last_expr_buffer = new_buffer; + strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size); + tok_mode->last_expr_size += size; + break; + case '{': + if (tok_mode->last_expr_buffer != NULL) { + PyMem_Free(tok_mode->last_expr_buffer); + } + tok_mode->last_expr_buffer = PyMem_Malloc(size); + if (tok_mode->last_expr_buffer == NULL) { + goto error; + } + tok_mode->last_expr_size = size; + tok_mode->last_expr_end = -1; + strncpy(tok_mode->last_expr_buffer, tok->cur, size); + break; + case '}': + case '!': + case ':': + if (tok_mode->last_expr_end == -1) { + tok_mode->last_expr_end = strlen(tok->start); + } + break; + default: + Py_UNREACHABLE(); } return 1; error: @@ -504,11 +471,9 @@ free_fstring_expressions(struct tok_state *tok) int index; tokenizer_mode *mode; - for (index = tok->tok_mode_stack_index; index >= 0; --index) - { + for (index = tok->tok_mode_stack_index; index >= 0; --index) { mode = &(tok->tok_mode_stack[index]); - if (mode->last_expr_buffer != NULL) - { + if (mode->last_expr_buffer != NULL) { PyMem_Free(mode->last_expr_buffer); mode->last_expr_buffer = NULL; mode->last_expr_size = 0; @@ -538,16 +503,14 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) Py_ssize_t cur = tok->cur - tok->buf; Py_ssize_t oldsize = tok->inp - tok->buf; Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1); - if (newsize > tok->end - tok->buf) - { + if (newsize > tok->end - tok->buf) { char *newbuf = tok->buf; Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf; Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf; Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf; remember_fstring_buffers(tok); newbuf = (char *)PyMem_Realloc(newbuf, newsize); - if (newbuf == NULL) - { + if (newbuf == NULL) { tok->done = E_NOMEM; return 0; } @@ -564,34 +527,28 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size) } static inline int -contains_null_bytes(const char *str, size_t size) -{ +contains_null_bytes(const char* str, size_t size) { return memchr(str, 0, size) != NULL; } static int -tok_readline_recode(struct tok_state *tok) -{ +tok_readline_recode(struct tok_state *tok) { PyObject *line; - const char *buf; + const char *buf; Py_ssize_t buflen; line = tok->decoding_buffer; - if (line == NULL) - { + if (line == NULL) { line = PyObject_CallNoArgs(tok->decoding_readline); - if (line == NULL) - { + if (line == NULL) { error_ret(tok); goto error; } } - else - { + else { tok->decoding_buffer = NULL; } buf = PyUnicode_AsUTF8AndSize(line, &buflen); - if (buf == NULL) - { + if (buf == NULL) { error_ret(tok); goto error; } @@ -599,16 +556,14 @@ tok_readline_recode(struct tok_state *tok) // an extra newline character that we may need to artificially // add. size_t buffer_size = buflen + 2; - if (!tok_reserve_buf(tok, buffer_size)) - { + if (!tok_reserve_buf(tok, buffer_size)) { goto error; } memcpy(tok->inp, buf, buflen); tok->inp += buflen; *tok->inp = '\0'; if (tok->fp_interactive && - tok_concatenate_interactive_new_line(tok, buf) == -1) - { + tok_concatenate_interactive_new_line(tok, buf) == -1) { goto error; } Py_DECREF(line); @@ -629,7 +584,7 @@ tok_readline_recode(struct tok_state *tok) Return 1 on success, 0 on failure. */ static int -fp_setreadl(struct tok_state *tok, const char *enc) +fp_setreadl(struct tok_state *tok, const char* enc) { PyObject *readline, *open, *stream; int fd; @@ -643,38 +598,32 @@ fp_setreadl(struct tok_state *tok, const char *enc) * the end of line.*/ pos = ftell(tok->fp); if (pos == -1 || - lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) - { + lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) { PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); return 0; } open = _PyImport_GetModuleAttrString("io", "open"); - if (open == NULL) - { + if (open == NULL) { return 0; } stream = PyObject_CallFunction(open, "isisOOO", - fd, "r", -1, enc, Py_None, Py_None, Py_False); + fd, "r", -1, enc, Py_None, Py_None, Py_False); Py_DECREF(open); - if (stream == NULL) - { + if (stream == NULL) { return 0; } readline = PyObject_GetAttr(stream, &_Py_ID(readline)); Py_DECREF(stream); - if (readline == NULL) - { + if (readline == NULL) { return 0; } Py_XSETREF(tok->decoding_readline, readline); - if (pos > 0) - { + if (pos > 0) { PyObject *bufobj = _PyObject_CallNoArgs(readline); - if (bufobj == NULL) - { + if (bufobj == NULL) { return 0; } Py_DECREF(bufobj); @@ -685,15 +634,13 @@ fp_setreadl(struct tok_state *tok, const char *enc) /* Fetch the next byte from TOK. */ -static int fp_getc(struct tok_state *tok) -{ +static int fp_getc(struct tok_state *tok) { return getc(tok->fp); } /* Unfetch the last byte back into TOK. */ -static void fp_ungetc(int c, struct tok_state *tok) -{ +static void fp_ungetc(int c, struct tok_state *tok) { ungetc(c, tok->fp); } @@ -703,20 +650,17 @@ static void fp_ungetc(int c, struct tok_state *tok) those in stringlib/codecs.h:utf8_decode. */ static int -valid_utf8(const unsigned char *s) +valid_utf8(const unsigned char* s) { int expected = 0; int length; - if (*s < 0x80) - { + if (*s < 0x80) { /* single-byte code */ return 1; } - else if (*s < 0xE0) - { + else if (*s < 0xE0) { /* \xC2\x80-\xDF\xBF -- 0080-07FF */ - if (*s < 0xC2) - { + if (*s < 0xC2) { /* invalid sequence \x80-\xBF -- continuation byte \xC0-\xC1 -- fake 0000-007F */ @@ -724,17 +668,14 @@ valid_utf8(const unsigned char *s) } expected = 1; } - else if (*s < 0xF0) - { + else if (*s < 0xF0) { /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */ - if (*s == 0xE0 && *(s + 1) < 0xA0) - { + if (*s == 0xE0 && *(s + 1) < 0xA0) { /* invalid sequence \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */ return 0; } - else if (*s == 0xED && *(s + 1) >= 0xA0) - { + else if (*s == 0xED && *(s + 1) >= 0xA0) { /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF will result in surrogates in range D800-DFFF. Surrogates are not valid UTF-8 so they are rejected. @@ -744,11 +685,9 @@ valid_utf8(const unsigned char *s) } expected = 2; } - else if (*s < 0xF5) - { + else if (*s < 0xF5) { /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */ - if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) - { + if (*(s + 1) < 0x90 ? *s == 0xF0 : *s == 0xF4) { /* invalid sequence -- one of: \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF \xF4\x90\x80\x80- -- 110000- overflow */ @@ -756,8 +695,7 @@ valid_utf8(const unsigned char *s) } expected = 3; } - else - { + else { /* invalid start byte */ return 0; } @@ -774,16 +712,13 @@ ensure_utf8(char *line, struct tok_state *tok) int badchar = 0; unsigned char *c; int length; - for (c = (unsigned char *)line; *c; c += length) - { - if (!(length = valid_utf8(c))) - { + for (c = (unsigned char *)line; *c; c += length) { + if (!(length = valid_utf8(c))) { badchar = *c; break; } } - if (badchar) - { + if (badchar) { PyErr_Format(PyExc_SyntaxError, "Non-UTF-8 code starting with '\\x%.2x' " "in file %U on line %i, " @@ -798,26 +733,23 @@ ensure_utf8(char *line, struct tok_state *tok) /* Fetch a byte from TOK, using the string buffer. */ static int -buf_getc(struct tok_state *tok) -{ +buf_getc(struct tok_state *tok) { return Py_CHARMASK(*tok->str++); } /* Unfetch a byte from TOK, using the string buffer. */ static void -buf_ungetc(int c, struct tok_state *tok) -{ +buf_ungetc(int c, struct tok_state *tok) { tok->str--; - assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ + assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */ } /* Set the readline function for TOK to ENC. For the string-based tokenizer, this means to just record the encoding. */ static int -buf_setreadl(struct tok_state *tok, const char *enc) -{ +buf_setreadl(struct tok_state *tok, const char* enc) { tok->enc = enc; return 1; } @@ -826,10 +758,9 @@ buf_setreadl(struct tok_state *tok, const char *enc) C byte string STR, which is encoded with ENC. */ static PyObject * -translate_into_utf8(const char *str, const char *enc) -{ +translate_into_utf8(const char* str, const char* enc) { PyObject *utf8; - PyObject *buf = PyUnicode_Decode(str, strlen(str), enc, NULL); + PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL); if (buf == NULL) return NULL; utf8 = PyUnicode_AsUTF8String(buf); @@ -837,34 +768,29 @@ translate_into_utf8(const char *str, const char *enc) return utf8; } + static char * -translate_newlines(const char *s, int exec_input, struct tok_state *tok) -{ +translate_newlines(const char *s, int exec_input, struct tok_state *tok) { int skip_next_lf = 0; size_t needed_length = strlen(s) + 2, final_length; char *buf, *current; char c = '\0'; buf = PyMem_Malloc(needed_length); - if (buf == NULL) - { + if (buf == NULL) { tok->done = E_NOMEM; return NULL; } - for (current = buf; *s; s++, current++) - { + for (current = buf; *s; s++, current++) { c = *s; - if (skip_next_lf) - { + if (skip_next_lf) { skip_next_lf = 0; - if (c == '\n') - { + if (c == '\n') { c = *++s; if (!c) break; } } - if (c == '\r') - { + if (c == '\r') { skip_next_lf = 1; c = '\n'; } @@ -872,19 +798,16 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) } /* If this is exec input, add a newline to the end of the string if there isn't one already. */ - if (exec_input && c != '\n') - { + if (exec_input && c != '\n') { *current = '\n'; current++; } *current = '\0'; final_length = current - buf + 1; - if (final_length < needed_length && final_length) - { + if (final_length < needed_length && final_length) { /* should never fail */ - char *result = PyMem_Realloc(buf, final_length); - if (result == NULL) - { + char* result = PyMem_Realloc(buf, final_length); + if (result == NULL) { PyMem_Free(buf); } buf = result; @@ -899,7 +822,7 @@ translate_newlines(const char *s, int exec_input, struct tok_state *tok) static char * decode_str(const char *input, int single, struct tok_state *tok) { - PyObject *utf8 = NULL; + PyObject* utf8 = NULL; char *str; const char *s; const char *newl[2] = {NULL, NULL}; @@ -911,46 +834,37 @@ decode_str(const char *input, int single, struct tok_state *tok) tok->str = str; if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) return error_ret(tok); - str = tok->str; /* string after BOM if any */ + str = tok->str; /* string after BOM if any */ assert(str); - if (tok->enc != NULL) - { + if (tok->enc != NULL) { utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) return error_ret(tok); str = PyBytes_AsString(utf8); } - for (s = str;; s++) - { - if (*s == '\0') - break; - else if (*s == '\n') - { + for (s = str;; s++) { + if (*s == '\0') break; + else if (*s == '\n') { assert(lineno < 2); newl[lineno] = s; lineno++; - if (lineno == 2) - break; + if (lineno == 2) break; } } tok->enc = NULL; /* need to check line 1 and 2 separately since check_coding_spec assumes a single line as input */ - if (newl[0]) - { - if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) - { + if (newl[0]) { + if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) { return NULL; } - if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) - { - if (!check_coding_spec(newl[0] + 1, newl[1] - newl[0], + if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) { + if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) return NULL; } } - if (tok->enc != NULL) - { + if (tok->enc != NULL) { assert(utf8 == NULL); utf8 = translate_into_utf8(str, tok->enc); if (utf8 == NULL) @@ -973,8 +887,7 @@ _PyTokenizer_FromString(const char *str, int exec_input) if (tok == NULL) return NULL; decoded = decode_str(str, exec_input, tok); - if (decoded == NULL) - { + if (decoded == NULL) { _PyTokenizer_Free(tok); return NULL; } @@ -994,8 +907,7 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input) if (tok == NULL) return NULL; tok->input = translated = translate_newlines(str, exec_input, tok); - if (translated == NULL) - { + if (translated == NULL) { _PyTokenizer_Free(tok); return NULL; } @@ -1003,8 +915,7 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input) tok->enc = NULL; tok->str = translated; tok->encoding = new_string("utf-8", 5, tok); - if (!tok->encoding) - { + if (!tok->encoding) { _PyTokenizer_Free(tok); return NULL; } @@ -1017,14 +928,13 @@ _PyTokenizer_FromUTF8(const char *str, int exec_input) /* Set up tokenizer for file */ struct tok_state * -_PyTokenizer_FromFile(FILE *fp, const char *enc, +_PyTokenizer_FromFile(FILE *fp, const char* enc, const char *ps1, const char *ps2) { struct tok_state *tok = tok_new(); if (tok == NULL) return NULL; - if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) - { + if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) { _PyTokenizer_Free(tok); return NULL; } @@ -1033,13 +943,11 @@ _PyTokenizer_FromFile(FILE *fp, const char *enc, tok->fp = fp; tok->prompt = ps1; tok->nextprompt = ps2; - if (enc != NULL) - { + if (enc != NULL) { /* Must copy encoding declaration since it gets copied into the parse tree. */ tok->encoding = new_string(enc, strlen(enc), tok); - if (!tok->encoding) - { + if (!tok->encoding) { _PyTokenizer_Free(tok); return NULL; } @@ -1050,25 +958,22 @@ _PyTokenizer_FromFile(FILE *fp, const char *enc, /* Free a tok_state structure */ -void _PyTokenizer_Free(struct tok_state *tok) +void +_PyTokenizer_Free(struct tok_state *tok) { - if (tok->encoding != NULL) - { + if (tok->encoding != NULL) { PyMem_Free(tok->encoding); } Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_buffer); Py_XDECREF(tok->filename); - if (tok->fp != NULL && tok->buf != NULL) - { + if (tok->fp != NULL && tok->buf != NULL) { PyMem_Free(tok->buf); } - if (tok->input) - { + if (tok->input) { PyMem_Free(tok->input); } - if (tok->interactive_src_start != NULL) - { + if (tok->interactive_src_start != NULL) { PyMem_Free(tok->interactive_src_start); } free_fstring_expressions(tok); @@ -1078,31 +983,25 @@ void _PyTokenizer_Free(struct tok_state *tok) static int tok_readline_raw(struct tok_state *tok) { - do - { - if (!tok_reserve_buf(tok, BUFSIZ)) - { + do { + if (!tok_reserve_buf(tok, BUFSIZ)) { return 0; } int n_chars = (int)(tok->end - tok->inp); size_t line_size = 0; char *line = _Py_UniversalNewlineFgetsWithSize(tok->inp, n_chars, tok->fp, NULL, &line_size); - if (line == NULL) - { + if (line == NULL) { return 1; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) - { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { return 0; } if (tok->fp_interactive && - tok_concatenate_interactive_new_line(tok, line) == -1) - { + tok_concatenate_interactive_new_line(tok, line) == -1) { return 0; } tok->inp += line_size; - if (tok->inp == tok->buf) - { + if (tok->inp == tok->buf) { return 0; } } while (tok->inp[-1] != '\n'); @@ -1110,24 +1009,19 @@ tok_readline_raw(struct tok_state *tok) } static int -tok_underflow_string(struct tok_state *tok) -{ +tok_underflow_string(struct tok_state *tok) { char *end = strchr(tok->inp, '\n'); - if (end != NULL) - { + if (end != NULL) { end++; } - else - { + else { end = strchr(tok->inp, '\0'); - if (end == tok->inp) - { + if (end == tok->inp) { tok->done = E_EOF; return 0; } } - if (tok->start == NULL) - { + if (tok->start == NULL) { tok->buf = tok->cur; } tok->line_start = tok->cur; @@ -1137,41 +1031,34 @@ tok_underflow_string(struct tok_state *tok) } static int -tok_underflow_interactive(struct tok_state *tok) -{ - if (tok->interactive_underflow == IUNDERFLOW_STOP) - { +tok_underflow_interactive(struct tok_state *tok) { + if (tok->interactive_underflow == IUNDERFLOW_STOP) { tok->done = E_INTERACT_STOP; return 1; } char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt); - if (newtok != NULL) - { + if (newtok != NULL) { char *translated = translate_newlines(newtok, 0, tok); PyMem_Free(newtok); - if (translated == NULL) - { + if (translated == NULL) { return 0; } newtok = translated; } - if (tok->encoding && newtok && *newtok) - { + if (tok->encoding && newtok && *newtok) { /* Recode to UTF-8 */ Py_ssize_t buflen; - const char *buf; + const char* buf; PyObject *u = translate_into_utf8(newtok, tok->encoding); PyMem_Free(newtok); - if (u == NULL) - { + if (u == NULL) { tok->done = E_DECODE; return 0; } buflen = PyBytes_GET_SIZE(u); buf = PyBytes_AS_STRING(u); - newtok = PyMem_Malloc(buflen + 1); - if (newtok == NULL) - { + newtok = PyMem_Malloc(buflen+1); + if (newtok == NULL) { Py_DECREF(u); tok->done = E_NOMEM; return 0; @@ -1180,32 +1067,26 @@ tok_underflow_interactive(struct tok_state *tok) Py_DECREF(u); } if (tok->fp_interactive && - tok_concatenate_interactive_new_line(tok, newtok) == -1) - { + tok_concatenate_interactive_new_line(tok, newtok) == -1) { PyMem_Free(newtok); return 0; } - if (tok->nextprompt != NULL) - { + if (tok->nextprompt != NULL) { tok->prompt = tok->nextprompt; } - if (newtok == NULL) - { + if (newtok == NULL) { tok->done = E_INTR; } - else if (*newtok == '\0') - { + else if (*newtok == '\0') { PyMem_Free(newtok); tok->done = E_EOF; } - else if (tok->start != NULL) - { + else if (tok->start != NULL) { Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf; remember_fstring_buffers(tok); size_t size = strlen(newtok); ADVANCE_LINENO(); - if (!tok_reserve_buf(tok, size + 1)) - { + if (!tok_reserve_buf(tok, size + 1)) { PyMem_Free(tok->buf); tok->buf = NULL; PyMem_Free(newtok); @@ -1217,8 +1098,7 @@ tok_underflow_interactive(struct tok_state *tok) tok->multi_line_start = tok->buf + cur_multi_line_start; restore_fstring_buffers(tok); } - else - { + else { remember_fstring_buffers(tok); ADVANCE_LINENO(); PyMem_Free(tok->buf); @@ -1229,65 +1109,52 @@ tok_underflow_interactive(struct tok_state *tok) tok->end = tok->inp + 1; restore_fstring_buffers(tok); } - if (tok->done != E_OK) - { - if (tok->prompt != NULL) - { + if (tok->done != E_OK) { + if (tok->prompt != NULL) { PySys_WriteStderr("\n"); } return 0; } - if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) - { + if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) { return 0; } return 1; } static int -tok_underflow_file(struct tok_state *tok) -{ - if (tok->start == NULL) - { +tok_underflow_file(struct tok_state *tok) { + if (tok->start == NULL) { tok->cur = tok->inp = tok->buf; } - if (tok->decoding_state == STATE_INIT) - { + if (tok->decoding_state == STATE_INIT) { /* We have not yet determined the encoding. If an encoding is found, use the file-pointer reader functions from now on. */ - if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) - { + if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) { error_ret(tok); return 0; } assert(tok->decoding_state != STATE_INIT); } /* Read until '\n' or EOF */ - if (tok->decoding_readline != NULL) - { + if (tok->decoding_readline != NULL) { /* We already have a codec associated with this input. */ - if (!tok_readline_recode(tok)) - { + if (!tok_readline_recode(tok)) { return 0; } } - else - { + else { /* We want a 'raw' read. */ - if (!tok_readline_raw(tok)) - { + if (!tok_readline_raw(tok)) { return 0; } } - if (tok->inp == tok->cur) - { + if (tok->inp == tok->cur) { tok->done = E_EOF; return 0; } - if (tok->inp[-1] != '\n') - { + if (tok->inp[-1] != '\n') { assert(tok->inp + 1 < tok->end); /* Last line does not end in \n, fake one */ *tok->inp++ = '\n'; @@ -1295,10 +1162,8 @@ tok_underflow_file(struct tok_state *tok) } ADVANCE_LINENO(); - if (tok->decoding_state != STATE_NORMAL) - { - if (tok->lineno > 2) - { + if (tok->decoding_state != STATE_NORMAL) { + if (tok->lineno > 2) { tok->decoding_state = STATE_NORMAL; } else if (!check_coding_spec(tok->cur, strlen(tok->cur), @@ -1309,8 +1174,7 @@ tok_underflow_file(struct tok_state *tok) } /* The default encoding is UTF-8, so make sure we don't have any non-UTF-8 sequences in it. */ - if (!tok->encoding && !ensure_utf8(tok->cur, tok)) - { + if (!tok->encoding && !ensure_utf8(tok->cur, tok)) { error_ret(tok); return 0; } @@ -1322,40 +1186,25 @@ tok_underflow_file(struct tok_state *tok) static void print_escape(FILE *f, const char *s, Py_ssize_t size) { - if (s == NULL) - { + if (s == NULL) { fputs("NULL", f); return; } putc('"', f); - while (size-- > 0) - { + while (size-- > 0) { unsigned char c = *s++; - switch (c) - { - case '\n': - fputs("\\n", f); - break; - case '\r': - fputs("\\r", f); - break; - case '\t': - fputs("\\t", f); - break; - case '\f': - fputs("\\f", f); - break; - case '\'': - fputs("\\'", f); - break; - case '"': - fputs("\\\"", f); - break; - default: - if (0x20 <= c && c <= 0x7f) - putc(c, f); - else - fprintf(f, "\\x%02x", c); + switch (c) { + case '\n': fputs("\\n", f); break; + case '\r': fputs("\\r", f); break; + case '\t': fputs("\\t", f); break; + case '\f': fputs("\\f", f); break; + case '\'': fputs("\\'", f); break; + case '"': fputs("\\\"", f); break; + default: + if (0x20 <= c && c <= 0x7f) + putc(c, f); + else + fprintf(f, "\\x%02x", c); } } putc('"', f); @@ -1368,46 +1217,37 @@ static int tok_nextc(struct tok_state *tok) { int rc; - for (;;) - { - if (tok->cur != tok->inp) - { + for (;;) { + if (tok->cur != tok->inp) { tok->col_offset++; return Py_CHARMASK(*tok->cur++); /* Fast path */ } - if (tok->done != E_OK) - { + if (tok->done != E_OK) { return EOF; } - if (tok->fp == NULL) - { + if (tok->fp == NULL) { rc = tok_underflow_string(tok); } - else if (tok->prompt != NULL) - { + else if (tok->prompt != NULL) { rc = tok_underflow_interactive(tok); } - else - { + else { rc = tok_underflow_file(tok); } #if defined(Py_DEBUG) - if (tok->debug) - { + if (tok->debug) { fprintf(stderr, "line[%d] = ", tok->lineno); print_escape(stderr, tok->cur, tok->inp - tok->cur); fprintf(stderr, " tok->done = %d\n", tok->done); } #endif - if (!rc) - { + if (!rc) { tok->cur = tok->inp; return EOF; } tok->line_start = tok->cur; - if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) - { + if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) { syntaxerror(tok, "source code cannot contain null bytes"); tok->cur = tok->inp; return EOF; @@ -1421,14 +1261,11 @@ tok_nextc(struct tok_state *tok) static void tok_backup(struct tok_state *tok, int c) { - if (c != EOF) - { - if (--tok->cur < tok->buf) - { + if (c != EOF) { + if (--tok->cur < tok->buf) { Py_FatalError("tokenizer beginning of buffer"); } - if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) - { + if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) { Py_FatalError("tok_backup: wrong character"); } tok->col_offset--; @@ -1442,43 +1279,36 @@ _syntaxerror_range(struct tok_state *tok, const char *format, { PyObject *errmsg, *errtext, *args; errmsg = PyUnicode_FromFormatV(format, vargs); - if (!errmsg) - { + if (!errmsg) { goto error; } errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start, "replace"); - if (!errtext) - { + if (!errtext) { goto error; } - if (col_offset == -1) - { + if (col_offset == -1) { col_offset = (int)PyUnicode_GET_LENGTH(errtext); } - if (end_col_offset == -1) - { + if (end_col_offset == -1) { end_col_offset = col_offset; } Py_ssize_t line_len = strcspn(tok->line_start, "\n"); - if (line_len != tok->cur - tok->line_start) - { + if (line_len != tok->cur - tok->line_start) { Py_DECREF(errtext); errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len, "replace"); } - if (!errtext) - { + if (!errtext) { goto error; } args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno, col_offset, errtext, tok->lineno, end_col_offset); - if (args) - { + if (args) { PyErr_SetObject(PyExc_SyntaxError, args); Py_DECREF(args); } @@ -1523,8 +1353,7 @@ indenterror(struct tok_state *tok) static int parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) { - if (!tok->report_warnings) - { + if (!tok->report_warnings) { return 0; } @@ -1533,16 +1362,13 @@ parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...) va_start(vargs, format); errmsg = PyUnicode_FromFormatV(format, vargs); va_end(vargs); - if (!errmsg) - { + if (!errmsg) { goto error; } if (PyErr_WarnExplicitObject(category, errmsg, tok->filename, - tok->lineno, NULL, NULL) < 0) - { - if (PyErr_ExceptionMatches(category)) - { + tok->lineno, NULL, NULL) < 0) { + if (PyErr_ExceptionMatches(category)) { /* Replace the DeprecationWarning exception with a SyntaxError to get a more accurate error report */ PyErr_Clear(); @@ -1563,31 +1389,28 @@ static int warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char) { - if (!tok->tok_report_warnings) - { + if (!tok->tok_report_warnings) { return 0; } PyObject *msg = PyUnicode_FromFormat( "invalid escape sequence '\\%c'", - (char)first_invalid_escape_char); + (char) first_invalid_escape_char + ); - if (msg == NULL) - { + if (msg == NULL) { return -1; } if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, tok->filename, - tok->lineno, NULL, NULL) < 0) - { + tok->lineno, NULL, NULL) < 0) { Py_DECREF(msg); - if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) - { + if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) { /* Replace the DeprecationWarning exception with a SyntaxError to get a more accurate error report */ PyErr_Clear(); - return syntaxerror(tok, "invalid escape sequence '\\%c'", (char)first_invalid_escape_char); + return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char); } return -1; @@ -1602,22 +1425,18 @@ lookahead(struct tok_state *tok, const char *test) { const char *s = test; int res = 0; - while (1) - { + while (1) { int c = tok_nextc(tok); - if (*s == 0) - { + if (*s == 0) { res = !is_potential_identifier_char(c); } - else if (c == *s) - { + else if (c == *s) { s++; continue; } tok_backup(tok, c); - while (s != test) - { + while (s != test) { tok_backup(tok, *--s); } return res; @@ -1638,52 +1457,43 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind) * other keyword or identifier. */ int r = 0; - if (c == 'a') - { + if (c == 'a') { r = lookahead(tok, "nd"); } - else if (c == 'e') - { + else if (c == 'e') { r = lookahead(tok, "lse"); } - else if (c == 'f') - { + else if (c == 'f') { r = lookahead(tok, "or"); } - else if (c == 'i') - { + else if (c == 'i') { int c2 = tok_nextc(tok); - if (c2 == 'f' || c2 == 'n' || c2 == 's') - { + if (c2 == 'f' || c2 == 'n' || c2 == 's') { r = 1; } tok_backup(tok, c2); } - else if (c == 'o') - { + else if (c == 'o') { r = lookahead(tok, "r"); } - else if (c == 'n') - { + else if (c == 'n') { r = lookahead(tok, "ot"); } - if (r) - { + if (r) { tok_backup(tok, c); if (parser_warn(tok, PyExc_SyntaxWarning, - "invalid %s literal", kind)) + "invalid %s literal", kind)) { return 0; } tok_nextc(tok); } else /* In future releases, only error will remain. */ - if (is_potential_identifier_char(c)) - { - tok_backup(tok, c); - syntaxerror(tok, "invalid %s literal", kind); - return 0; - } + if (is_potential_identifier_char(c)) { + tok_backup(tok, c); + syntaxerror(tok, "invalid %s literal", kind); + return 0; + } return 1; } @@ -1697,39 +1507,31 @@ verify_identifier(struct tok_state *tok) if (tok->decoding_erred) return 0; s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL); - if (s == NULL) - { - if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) - { + if (s == NULL) { + if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) { tok->done = E_DECODE; } - else - { + else { tok->done = E_ERROR; } return 0; } Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s); - if (invalid < 0) - { + if (invalid < 0) { Py_DECREF(s); tok->done = E_ERROR; return 0; } assert(PyUnicode_GET_LENGTH(s) > 0); - if (invalid < PyUnicode_GET_LENGTH(s)) - { + if (invalid < PyUnicode_GET_LENGTH(s)) { Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid); - if (invalid + 1 < PyUnicode_GET_LENGTH(s)) - { + if (invalid + 1 < PyUnicode_GET_LENGTH(s)) { /* Determine the offset in UTF-8 encoded input */ Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1)); - if (s != NULL) - { + if (s != NULL) { Py_SETREF(s, PyUnicode_AsUTF8String(s)); } - if (s == NULL) - { + if (s == NULL) { tok->done = E_ERROR; return 0; } @@ -1739,12 +1541,10 @@ verify_identifier(struct tok_state *tok) // PyUnicode_FromFormatV() does not support %X char hex[9]; (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch); - if (Py_UNICODE_ISPRINTABLE(ch)) - { + if (Py_UNICODE_ISPRINTABLE(ch)) { syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex); } - else - { + else { syntaxerror(tok, "invalid non-printable character U+%s", hex); } return 0; @@ -1758,19 +1558,15 @@ tok_decimal_tail(struct tok_state *tok) { int c; - while (1) - { - do - { + while (1) { + do { c = tok_nextc(tok); } while (isdigit(c)); - if (c != '_') - { + if (c != '_') { break; } c = tok_nextc(tok); - if (!isdigit(c)) - { + if (!isdigit(c)) { tok_backup(tok, c); syntaxerror(tok, "invalid decimal literal"); return 0; @@ -1779,24 +1575,20 @@ tok_decimal_tail(struct tok_state *tok) return c; } + static inline int -tok_continuation_line(struct tok_state *tok) -{ +tok_continuation_line(struct tok_state *tok) { int c = tok_nextc(tok); - if (c != '\n') - { + if (c != '\n') { tok->done = E_LINECONT; return -1; } c = tok_nextc(tok); - if (c == EOF) - { + if (c == EOF) { tok->done = E_EOF; tok->cur = tok->inp; return -1; - } - else - { + } else { tok_backup(tok, c); } return c; @@ -1820,12 +1612,10 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st { assert((start == NULL && end == NULL) || (start != NULL && end != NULL)); token->level = tok->level; - if (ISSTRINGLIT(type)) - { + if (ISSTRINGLIT(type)) { token->lineno = tok->first_lineno; } - else - { + else { token->lineno = tok->lineno; } token->end_lineno = tok->lineno; @@ -1833,8 +1623,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st token->start = start; token->end = end; - if (start != NULL && end != NULL) - { + if (start != NULL && end != NULL) { token->col_offset = tok->starting_col_offset; token->end_col_offset = tok->col_offset; } @@ -1842,130 +1631,108 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st } static int -tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct token *token) +tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) { int c; int blankline, nonascii; const char *p_start = NULL; const char *p_end = NULL; -nextline: + nextline: tok->start = NULL; tok->starting_col_offset = -1; blankline = 0; /* Get indentation level */ - if (tok->atbol) - { + if (tok->atbol) { int col = 0; int altcol = 0; tok->atbol = 0; int cont_line_col = 0; - for (;;) - { + for (;;) { c = tok_nextc(tok); - if (c == ' ') - { + if (c == ' ') { col++, altcol++; } - else if (c == '\t') - { + else if (c == '\t') { col = (col / tok->tabsize + 1) * tok->tabsize; altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE; } - else if (c == '\014') - { /* Control-L (formfeed) */ + else if (c == '\014') {/* Control-L (formfeed) */ col = altcol = 0; /* For Emacs users */ } - else if (c == '\\') - { + else if (c == '\\') { // Indentation cannot be split over multiple physical lines // using backslashes. This means that if we found a backslash // preceded by whitespace, **the first one we find** determines // the level of indentation of whatever comes next. cont_line_col = cont_line_col ? cont_line_col : col; - if ((c = tok_continuation_line(tok)) == -1) - { + if ((c = tok_continuation_line(tok)) == -1) { return MAKE_TOKEN(ERRORTOKEN); } } - else - { + else { break; } } tok_backup(tok, c); - if (c == '#' || c == '\n') - { + if (c == '#' || c == '\n') { /* Lines with only whitespace and/or comments shouldn't affect the indentation and are not passed to the parser as NEWLINE tokens, except *totally* empty lines in interactive mode, which signal the end of a command group. */ - if (col == 0 && c == '\n' && tok->prompt != NULL) - { + if (col == 0 && c == '\n' && tok->prompt != NULL) { blankline = 0; /* Let it through */ } - else if (tok->prompt != NULL && tok->lineno == 1) - { + else if (tok->prompt != NULL && tok->lineno == 1) { /* In interactive mode, if the first line contains only spaces and/or a comment, let it through. */ blankline = 0; col = altcol = 0; } - else - { + else { blankline = 1; /* Ignore completely */ } /* We can't jump back right here since we still may need to skip to the end of a comment */ } - if (!blankline && tok->level == 0) - { + if (!blankline && tok->level == 0) { col = cont_line_col ? cont_line_col : col; altcol = cont_line_col ? cont_line_col : altcol; - if (col == tok->indstack[tok->indent]) - { + if (col == tok->indstack[tok->indent]) { /* No change */ - if (altcol != tok->altindstack[tok->indent]) - { + if (altcol != tok->altindstack[tok->indent]) { return MAKE_TOKEN(indenterror(tok)); } } - else if (col > tok->indstack[tok->indent]) - { + else if (col > tok->indstack[tok->indent]) { /* Indent -- always one */ - if (tok->indent + 1 >= MAXINDENT) - { + if (tok->indent+1 >= MAXINDENT) { tok->done = E_TOODEEP; tok->cur = tok->inp; return MAKE_TOKEN(ERRORTOKEN); } - if (altcol <= tok->altindstack[tok->indent]) - { + if (altcol <= tok->altindstack[tok->indent]) { return MAKE_TOKEN(indenterror(tok)); } tok->pendin++; tok->indstack[++tok->indent] = col; tok->altindstack[tok->indent] = altcol; } - else /* col < tok->indstack[tok->indent] */ - { + else /* col < tok->indstack[tok->indent] */ { /* Dedent -- any number, must be consistent */ while (tok->indent > 0 && - col < tok->indstack[tok->indent]) - { + col < tok->indstack[tok->indent]) { tok->pendin--; tok->indent--; } - if (col != tok->indstack[tok->indent]) - { + if (col != tok->indstack[tok->indent]) { tok->done = E_DEDENT; tok->cur = tok->inp; return MAKE_TOKEN(ERRORTOKEN); } - if (altcol != tok->altindstack[tok->indent]) - { + if (altcol != tok->altindstack[tok->indent]) { return MAKE_TOKEN(indenterror(tok)); } } @@ -1976,15 +1743,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t tok->starting_col_offset = tok->col_offset; /* Return pending indents/dedents */ - if (tok->pendin != 0) - { - if (tok->pendin < 0) - { + if (tok->pendin != 0) { + if (tok->pendin < 0) { tok->pendin++; return MAKE_TOKEN(DEDENT); } - else - { + else { tok->pendin--; return MAKE_TOKEN(INDENT); } @@ -1994,13 +1758,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t c = tok_nextc(tok); tok_backup(tok, c); /* Check if we are closing an async function */ - if (tok->async_def && !blankline + if (tok->async_def + && !blankline /* Due to some implementation artifacts of type comments, * a TYPE_COMMENT at the start of a function won't set an * indentation level and it will produce a NEWLINE after it. * To avoid spuriously ending an async function due to this, * wait until we have some non-newline char in front of us. */ - && c != '\n' && tok->level == 0 + && c != '\n' + && tok->level == 0 /* There was a NEWLINE after ASYNC DEF, so we're past the signature. */ && tok->async_def_nl @@ -2013,11 +1779,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t tok->async_def_nl = 0; } -again: + again: tok->start = NULL; /* Skip spaces */ - do - { + do { c = tok_nextc(tok); } while (c == ' ' || c == '\t' || c == '\014'); @@ -2026,44 +1791,33 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t tok->starting_col_offset = tok->col_offset - 1; /* Skip comment, unless it's a type comment */ - if (c == '#') - { + if (c == '#') { - if (INSIDE_FSTRING(tok)) - { + if (INSIDE_FSTRING(tok)) { return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'")); } const char *prefix, *p, *type_start; int current_starting_col_offset; - while (c != EOF && c != '\n') - { + while (c != EOF && c != '\n') { c = tok_nextc(tok); } - if (tok->type_comments) - { + if (tok->type_comments) { p = tok->start; current_starting_col_offset = tok->starting_col_offset; prefix = type_comment_prefix; - while (*prefix && p < tok->cur) - { - if (*prefix == ' ') - { - while (*p == ' ' || *p == '\t') - { + while (*prefix && p < tok->cur) { + if (*prefix == ' ') { + while (*p == ' ' || *p == '\t') { p++; current_starting_col_offset++; } - } - else if (*prefix == *p) - { + } else if (*prefix == *p) { p++; current_starting_col_offset++; - } - else - { + } else { break; } @@ -2071,35 +1825,33 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } /* This is a type comment if we matched all of type_comment_prefix. */ - if (!*prefix) - { + if (!*prefix) { int is_type_ignore = 1; // +6 in order to skip the word 'ignore' const char *ignore_end = p + 6; const int ignore_end_col_offset = current_starting_col_offset + 6; - tok_backup(tok, c); /* don't eat the newline or EOF */ + tok_backup(tok, c); /* don't eat the newline or EOF */ type_start = p; /* A TYPE_IGNORE is "type: ignore" followed by the end of the token * or anything ASCII and non-alphanumeric. */ - is_type_ignore = (tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 && !(tok->cur > ignore_end && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); + is_type_ignore = ( + tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0 + && !(tok->cur > ignore_end + && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0])))); - if (is_type_ignore) - { + if (is_type_ignore) { p_start = ignore_end; p_end = tok->cur; /* If this type ignore is the only thing on the line, consume the newline also. */ - if (blankline) - { + if (blankline) { tok_nextc(tok); tok->atbol = 1; } return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset); - } - else - { + } else { p_start = type_start; p_end = tok->cur; return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset); @@ -2108,16 +1860,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } } - if (tok->done == E_INTERACT_STOP) - { + if (tok->done == E_INTERACT_STOP) { return MAKE_TOKEN(ENDMARKER); } /* Check for EOF and errors now */ - if (c == EOF) - { - if (tok->level) - { + if (c == EOF) { + if (tok->level) { return MAKE_TOKEN(ERRORTOKEN); } return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN); @@ -2125,54 +1874,44 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t /* Identifier (most frequent token!) */ nonascii = 0; - if (is_potential_identifier_start(c)) - { + if (is_potential_identifier_start(c)) { /* Process the various legal combinations of b"", r"", u"", and f"". */ int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0; - while (1) - { + while (1) { if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B')) saw_b = 1; /* Since this is a backwards compatibility support literal we don't want to support it in arbitrary order like byte literals. */ - else if (!(saw_b || saw_u || saw_r || saw_f) && (c == 'u' || c == 'U')) - { + else if (!(saw_b || saw_u || saw_r || saw_f) + && (c == 'u'|| c == 'U')) { saw_u = 1; } /* ur"" and ru"" are not supported */ - else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) - { + else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) { saw_r = 1; } - else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) - { + else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) { saw_f = 1; } - else - { + else { break; } c = tok_nextc(tok); - if (c == '"' || c == '\'') - { - if (saw_f) - { + if (c == '"' || c == '\'') { + if (saw_f) { goto f_string_quote; } goto letter_quote; } } - while (is_potential_identifier_char(c)) - { - if (c >= 128) - { + while (is_potential_identifier_char(c)) { + if (c >= 128) { nonascii = 1; } c = tok_nextc(tok); } tok_backup(tok, c); - if (nonascii && !verify_identifier(tok)) - { + if (nonascii && !verify_identifier(tok)) { return MAKE_TOKEN(ERRORTOKEN); } @@ -2180,8 +1919,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t p_end = tok->cur; /* async/await parsing block. */ - if (tok->cur - tok->start == 5 && tok->start[0] == 'a') - { + if (tok->cur - tok->start == 5 && tok->start[0] == 'a') { /* May be an 'async' or 'await' token. For Python 3.7 or later we recognize them unconditionally. For Python 3.5 or 3.6 we recognize 'async' in front of 'def', and @@ -2190,20 +1928,16 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t but there's no *valid* Python 3.4 code that would be rejected, and async functions will be rejected in a later phase.) */ - if (!tok->async_hacks || tok->async_def) - { + if (!tok->async_hacks || tok->async_def) { /* Always recognize the keywords. */ - if (memcmp(tok->start, "async", 5) == 0) - { + if (memcmp(tok->start, "async", 5) == 0) { return MAKE_TOKEN(ASYNC); } - if (memcmp(tok->start, "await", 5) == 0) - { + if (memcmp(tok->start, "await", 5) == 0) { return MAKE_TOKEN(AWAIT); } } - else if (memcmp(tok->start, "async", 5) == 0) - { + else if (memcmp(tok->start, "async", 5) == 0) { /* The current token is 'async'. Look ahead one token to see if that is 'def'. */ @@ -2216,7 +1950,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t current_tok, &ahead_token); - if (ahead_tok_kind == NAME && ahead_tok.cur - ahead_tok.start == 3 && memcmp(ahead_tok.start, "def", 3) == 0) + if (ahead_tok_kind == NAME + && ahead_tok.cur - ahead_tok.start == 3 + && memcmp(ahead_tok.start, "def", 3) == 0) { /* The next token is going to be 'def', so instead of returning a plain NAME token, return ASYNC. */ @@ -2231,18 +1967,15 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } /* Newline */ - if (c == '\n') - { + if (c == '\n') { tok->atbol = 1; - if (blankline || tok->level > 0) - { + if (blankline || tok->level > 0) { goto nextline; } p_start = tok->start; p_end = tok->cur - 1; /* Leave '\n' out of the string */ tok->cont_line = 0; - if (tok->async_def) - { + if (tok->async_def) { /* We're somewhere inside an 'async def' function, and we've encountered a NEWLINE after its signature. */ tok->async_def_nl = 1; @@ -2251,30 +1984,23 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } /* Period or number starting with period? */ - if (c == '.') - { + if (c == '.') { c = tok_nextc(tok); - if (isdigit(c)) - { + if (isdigit(c)) { goto fraction; - } - else if (c == '.') - { + } else if (c == '.') { c = tok_nextc(tok); - if (c == '.') - { + if (c == '.') { p_start = tok->start; p_end = tok->cur; return MAKE_TOKEN(ELLIPSIS); } - else - { + else { tok_backup(tok, c); } tok_backup(tok, '.'); } - else - { + else { tok_backup(tok, c); } p_start = tok->start; @@ -2283,218 +2009,169 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } /* Number */ - if (isdigit(c)) - { - if (c == '0') - { + if (isdigit(c)) { + if (c == '0') { /* Hex, octal or binary -- maybe. */ c = tok_nextc(tok); - if (c == 'x' || c == 'X') - { + if (c == 'x' || c == 'X') { /* Hex */ c = tok_nextc(tok); - do - { - if (c == '_') - { + do { + if (c == '_') { c = tok_nextc(tok); } - if (!isxdigit(c)) - { + if (!isxdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid hexadecimal literal")); } - do - { + do { c = tok_nextc(tok); } while (isxdigit(c)); } while (c == '_'); - if (!verify_end_of_number(tok, c, "hexadecimal")) - { + if (!verify_end_of_number(tok, c, "hexadecimal")) { return MAKE_TOKEN(ERRORTOKEN); } } - else if (c == 'o' || c == 'O') - { + else if (c == 'o' || c == 'O') { /* Octal */ c = tok_nextc(tok); - do - { - if (c == '_') - { + do { + if (c == '_') { c = tok_nextc(tok); } - if (c < '0' || c >= '8') - { - if (isdigit(c)) - { + if (c < '0' || c >= '8') { + if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, - "invalid digit '%c' in octal literal", c)); + "invalid digit '%c' in octal literal", c)); } - else - { + else { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid octal literal")); } } - do - { + do { c = tok_nextc(tok); } while ('0' <= c && c < '8'); } while (c == '_'); - if (isdigit(c)) - { + if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, - "invalid digit '%c' in octal literal", c)); + "invalid digit '%c' in octal literal", c)); } - if (!verify_end_of_number(tok, c, "octal")) - { + if (!verify_end_of_number(tok, c, "octal")) { return MAKE_TOKEN(ERRORTOKEN); } } - else if (c == 'b' || c == 'B') - { + else if (c == 'b' || c == 'B') { /* Binary */ c = tok_nextc(tok); - do - { - if (c == '_') - { + do { + if (c == '_') { c = tok_nextc(tok); } - if (c != '0' && c != '1') - { - if (isdigit(c)) - { + if (c != '0' && c != '1') { + if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } - else - { + else { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid binary literal")); } } - do - { + do { c = tok_nextc(tok); } while (c == '0' || c == '1'); } while (c == '_'); - if (isdigit(c)) - { + if (isdigit(c)) { return MAKE_TOKEN(syntaxerror(tok, "invalid digit '%c' in binary literal", c)); } - if (!verify_end_of_number(tok, c, "binary")) - { + if (!verify_end_of_number(tok, c, "binary")) { return MAKE_TOKEN(ERRORTOKEN); } } - else - { + else { int nonzero = 0; /* maybe old-style octal; c is first char of it */ /* in any case, allow '0' as a literal */ - while (1) - { - if (c == '_') - { + while (1) { + if (c == '_') { c = tok_nextc(tok); - if (!isdigit(c)) - { + if (!isdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } } - if (c != '0') - { + if (c != '0') { break; } c = tok_nextc(tok); } - char *zeros_end = tok->cur; - if (isdigit(c)) - { + char* zeros_end = tok->cur; + if (isdigit(c)) { nonzero = 1; c = tok_decimal_tail(tok); - if (c == 0) - { + if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } } - if (c == '.') - { + if (c == '.') { c = tok_nextc(tok); goto fraction; } - else if (c == 'e' || c == 'E') - { + else if (c == 'e' || c == 'E') { goto exponent; } - else if (c == 'j' || c == 'J') - { + else if (c == 'j' || c == 'J') { goto imaginary; } - else if (nonzero) - { + else if (nonzero) { /* Old-style octal: now disallowed. */ tok_backup(tok, c); return MAKE_TOKEN(syntaxerror_known_range( - tok, (int)(tok->start + 1 - tok->line_start), - (int)(zeros_end - tok->line_start), - "leading zeros in decimal integer " - "literals are not permitted; " - "use an 0o prefix for octal integers")); + tok, (int)(tok->start + 1 - tok->line_start), + (int)(zeros_end - tok->line_start), + "leading zeros in decimal integer " + "literals are not permitted; " + "use an 0o prefix for octal integers")); } - if (!verify_end_of_number(tok, c, "decimal")) - { + if (!verify_end_of_number(tok, c, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } } } - else - { + else { /* Decimal */ c = tok_decimal_tail(tok); - if (c == 0) - { + if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } { /* Accept floating point numbers. */ - if (c == '.') - { + if (c == '.') { c = tok_nextc(tok); - fraction: + fraction: /* Fraction */ - if (isdigit(c)) - { + if (isdigit(c)) { c = tok_decimal_tail(tok); - if (c == 0) - { + if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } } } - if (c == 'e' || c == 'E') - { + if (c == 'e' || c == 'E') { int e; - exponent: + exponent: e = c; /* Exponent part */ c = tok_nextc(tok); - if (c == '+' || c == '-') - { + if (c == '+' || c == '-') { c = tok_nextc(tok); - if (!isdigit(c)) - { + if (!isdigit(c)) { tok_backup(tok, c); return MAKE_TOKEN(syntaxerror(tok, "invalid decimal literal")); } - } - else if (!isdigit(c)) - { + } else if (!isdigit(c)) { tok_backup(tok, c); - if (!verify_end_of_number(tok, e, "decimal")) - { + if (!verify_end_of_number(tok, e, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); @@ -2503,23 +2180,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t return MAKE_TOKEN(NUMBER); } c = tok_decimal_tail(tok); - if (c == 0) - { + if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } } - if (c == 'j' || c == 'J') - { + if (c == 'j' || c == 'J') { /* Imaginary part */ - imaginary: + imaginary: c = tok_nextc(tok); - if (!verify_end_of_number(tok, c, "imaginary")) - { + if (!verify_end_of_number(tok, c, "imaginary")) { return MAKE_TOKEN(ERRORTOKEN); } } - else if (!verify_end_of_number(tok, c, "decimal")) - { + else if (!verify_end_of_number(tok, c, "decimal")) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -2530,11 +2203,10 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t return MAKE_TOKEN(NUMBER); } -f_string_quote: - if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) - { + f_string_quote: + if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) { int quote = c; - int quote_size = 1; /* 1 or 3 */ + int quote_size = 1; /* 1 or 3 */ /* Nodes of type STRING, especially multi line strings must be handled differently in order to get both @@ -2545,25 +2217,22 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t /* Find the quote size and start of string */ int after_quote = tok_nextc(tok); - if (after_quote == quote) - { + if (after_quote == quote) { int after_after_quote = tok_nextc(tok); - if (after_after_quote == quote) - { + if (after_after_quote == quote) { quote_size = 3; } - else - { + else { // TODO: Check this tok_backup(tok, after_after_quote); tok_backup(tok, after_quote); } } - if (after_quote != quote) - { + if (after_quote != quote) { tok_backup(tok, after_quote); } + p_start = tok->start; p_end = tok->cur; tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); @@ -2579,18 +2248,17 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t the_current_tok->last_expr_end = -1; the_current_tok->f_string_debug = 0; - switch (*tok->start) - { - case 'F': - case 'f': - the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; - break; - case 'R': - case 'r': - the_current_tok->f_string_raw = 1; - break; - default: - Py_UNREACHABLE(); + switch (*tok->start) { + case 'F': + case 'f': + the_current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r'; + break; + case 'R': + case 'r': + the_current_tok->f_string_raw = 1; + break; + default: + Py_UNREACHABLE(); } the_current_tok->curly_bracket_depth = 0; @@ -2598,12 +2266,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t return MAKE_TOKEN(FSTRING_START); } -letter_quote: + letter_quote: /* String */ - if (c == '\'' || c == '"') - { + if (c == '\'' || c == '"') { int quote = c; - int quote_size = 1; /* 1 or 3 */ + int quote_size = 1; /* 1 or 3 */ int end_quote_size = 0; /* Nodes of type STRING, especially multi line strings @@ -2615,31 +2282,25 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t /* Find the quote size and start of string */ c = tok_nextc(tok); - if (c == quote) - { + if (c == quote) { c = tok_nextc(tok); - if (c == quote) - { + if (c == quote) { quote_size = 3; } - else - { - end_quote_size = 1; /* empty string found */ + else { + end_quote_size = 1; /* empty string found */ } } - if (c != quote) - { + if (c != quote) { tok_backup(tok, c); } /* Get rest of string */ - while (end_quote_size != quote_size) - { + while (end_quote_size != quote_size) { c = tok_nextc(tok); if (tok->done == E_DECODE) break; - if (c == EOF || (quote_size == 1 && c == '\n')) - { + if (c == EOF || (quote_size == 1 && c == '\n')) { assert(tok->multi_line_start != NULL); // shift the tok_state's location into // the start of string, and report the error @@ -2650,8 +2311,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t int start = tok->lineno; tok->lineno = tok->first_lineno; - if (INSIDE_FSTRING(tok)) - { + if (INSIDE_FSTRING(tok)) { /* When we are in an f-string, before raising the * unterminated string literal error, check whether * does the initial quote matches with f-strings quotes @@ -2659,45 +2319,35 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t * so raise the proper error */ tokenizer_mode *the_current_tok = TOK_GET_MODE(tok); if (the_current_tok->f_string_quote == quote && - the_current_tok->f_string_quote_size == quote_size) - { + the_current_tok->f_string_quote_size == quote_size) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start)); } } - if (quote_size == 3) - { + if (quote_size == 3) { syntaxerror(tok, "unterminated triple-quoted string literal" - " (detected at line %d)", - start); - if (c != '\n') - { + " (detected at line %d)", start); + if (c != '\n') { tok->done = E_EOFS; } return MAKE_TOKEN(ERRORTOKEN); } - else - { + else { syntaxerror(tok, "unterminated string literal (detected at" - " line %d)", - start); - if (c != '\n') - { + " line %d)", start); + if (c != '\n') { tok->done = E_EOLS; } return MAKE_TOKEN(ERRORTOKEN); } } - if (c == quote) - { + if (c == quote) { end_quote_size += 1; } - else - { + else { end_quote_size = 0; - if (c == '\\') - { - tok_nextc(tok); /* skip escaped char */ + if (c == '\\') { + tok_nextc(tok); /* skip escaped char */ } } } @@ -2708,10 +2358,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } /* Line continuation */ - if (c == '\\') - { - if ((c = tok_continuation_line(tok)) == -1) - { + if (c == '\\') { + if ((c = tok_continuation_line(tok)) == -1) { return MAKE_TOKEN(ERRORTOKEN); } tok->cont_line = 1; @@ -2720,23 +2368,19 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t /* Punctuation character */ int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{'); - if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) - { + if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) { /* This code block gets executed before the curly_bracket_depth is incremented * by the `{` case, so for ensuring that we are on the 0th level, we need * to adjust it manually */ int cursor = current_tok->curly_bracket_depth - (c != '{'); - if (cursor == 0 && !update_fstring_expr(tok, c)) - { + if (cursor == 0 && !update_fstring_expr(tok, c)) { return MAKE_TOKEN(ENDMARKER); } - if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) - { + if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) { return MAKE_TOKEN(ERRORTOKEN); } - if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) - { + if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) { current_tok->kind = TOK_FSTRING_MODE; p_start = tok->start; p_end = tok->cur; @@ -2748,16 +2392,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t { int c2 = tok_nextc(tok); int current_token = _PyToken_TwoChars(c, c2); - if (current_token != OP) - { + if (current_token != OP) { int c3 = tok_nextc(tok); int current_token3 = _PyToken_ThreeChars(c, c2, c3); - if (current_token3 != OP) - { + if (current_token3 != OP) { current_token = current_token3; } - else - { + else { tok_backup(tok, c3); } p_start = tok->start; @@ -2768,31 +2409,26 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } /* Keep track of parentheses nesting level */ - switch (c) - { + switch (c) { case '(': case '[': case '{': - if (tok->level >= MAXLEVEL || tok->tok_mode_stack_index + 1 >= MAXLEVEL) - { + if (tok->level >= MAXLEVEL || tok->tok_mode_stack_index + 1 >= MAXLEVEL) { return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; tok->parenlinenostack[tok->level] = tok->lineno; tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start); tok->level++; - if (INSIDE_FSTRING(tok)) - { + if (INSIDE_FSTRING(tok)) { current_tok->curly_bracket_depth++; } break; case ')': case ']': case '}': - if (!tok->level) - { - if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') - { + if (!tok->level) { + if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') { return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed")); } return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c)); @@ -2808,36 +2444,30 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t nested expression, then instead of matching a different syntactical construct with it; we'll throw an unmatched parentheses error. */ - if (INSIDE_FSTRING(tok) && opening == '{') - { + if (INSIDE_FSTRING(tok) && opening == '{') { assert(current_tok->curly_bracket_depth >= 0); int previous_bracket = current_tok->curly_bracket_depth - 1; - if (previous_bracket == current_tok->curly_bracket_expr_start_depth) - { + if (previous_bracket == current_tok->curly_bracket_expr_start_depth) { return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c)); } } - if (tok->parenlinenostack[tok->level] != tok->lineno) - { + if (tok->parenlinenostack[tok->level] != tok->lineno) { return MAKE_TOKEN(syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c' on line %d", - c, opening, tok->parenlinenostack[tok->level])); + "closing parenthesis '%c' does not match " + "opening parenthesis '%c' on line %d", + c, opening, tok->parenlinenostack[tok->level])); } - else - { + else { return MAKE_TOKEN(syntaxerror(tok, - "closing parenthesis '%c' does not match " - "opening parenthesis '%c'", - c, opening)); + "closing parenthesis '%c' does not match " + "opening parenthesis '%c'", + c, opening)); } } - if (INSIDE_FSTRING(tok)) - { + if (INSIDE_FSTRING(tok)) { current_tok->curly_bracket_depth--; - if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) - { + if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) { current_tok->curly_bracket_expr_start_depth--; current_tok->kind = TOK_FSTRING_MODE; current_tok->f_string_debug = 0; @@ -2848,15 +2478,13 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t break; } - if (!Py_UNICODE_ISPRINTABLE(c)) - { + if (!Py_UNICODE_ISPRINTABLE(c)) { char hex[9]; (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c); return MAKE_TOKEN(syntaxerror(tok, "invalid non-printable character U+%s", hex)); } - if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) - { + if( c == '=' && INSIDE_FSTRING_EXPR(current_tok)) { current_tok->f_string_debug = 1; } @@ -2867,7 +2495,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct t } static int -tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct token *token) +tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token) { const char *p_start = NULL; const char *p_end = NULL; @@ -2881,40 +2509,33 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize // before it. int start_char = tok_nextc(tok); - if (start_char == '{') - { + if (start_char == '{') { int peek1 = tok_nextc(tok); tok_backup(tok, peek1); tok_backup(tok, start_char); - if (peek1 != '{') - { + if (peek1 != '{') { current_tok->curly_bracket_expr_start_depth++; - if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) - { + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; return tok_get_normal_mode(tok, current_tok, token); } } - else - { + else { tok_backup(tok, start_char); } // Check if we are at the end of the string - for (int i = 0; i < current_tok->f_string_quote_size; i++) - { + for (int i = 0; i < current_tok->f_string_quote_size; i++) { int quote = tok_nextc(tok); - if (quote != current_tok->f_string_quote) - { + if (quote != current_tok->f_string_quote) { tok_backup(tok, quote); goto f_string_middle; } } - if (current_tok->last_expr_buffer != NULL) - { + if (current_tok->last_expr_buffer != NULL) { PyMem_Free(current_tok->last_expr_buffer); current_tok->last_expr_buffer = NULL; current_tok->last_expr_size = 0; @@ -2928,13 +2549,10 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct f_string_middle: - while (end_quote_size != current_tok->f_string_quote_size) - { + while (end_quote_size != current_tok->f_string_quote_size) { int c = tok_nextc(tok); - if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) - { - if (tok->decoding_erred) - { + if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) { + if (tok->decoding_erred) { return MAKE_TOKEN(ERRORTOKEN); } @@ -2948,61 +2566,49 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct int start = tok->lineno; tok->lineno = tok->first_lineno; - if (current_tok->f_string_quote_size == 3) - { + if (current_tok->f_string_quote_size == 3) { return MAKE_TOKEN(syntaxerror(tok, - "unterminated triple-quoted f-string literal" - " (detected at line %d)", - start)); + "unterminated triple-quoted f-string literal" + " (detected at line %d)", start)); } - else - { + else { return MAKE_TOKEN(syntaxerror(tok, - "unterminated f-string literal (detected at" - " line %d)", - start)); + "unterminated f-string literal (detected at" + " line %d)", start)); } } - if (c == current_tok->f_string_quote) - { + if (c == current_tok->f_string_quote) { end_quote_size += 1; continue; - } - else - { + } else { end_quote_size = 0; } - int in_format_spec = (current_tok->last_expr_end != -1 && - INSIDE_FSTRING_EXPR(current_tok)); - if (c == '{') - { + int in_format_spec = ( + current_tok->last_expr_end != -1 + && + INSIDE_FSTRING_EXPR(current_tok) + ); + if (c == '{') { int peek = tok_nextc(tok); - if (peek != '{' || in_format_spec) - { + if (peek != '{' || in_format_spec) { tok_backup(tok, peek); tok_backup(tok, c); current_tok->curly_bracket_expr_start_depth++; - if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) - { + if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) { return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply")); } TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; p_start = tok->start; p_end = tok->cur; - } - else - { + } else { p_start = tok->start; p_end = tok->cur - 1; } return MAKE_TOKEN(FSTRING_MIDDLE); - } - else if (c == '}') - { - if (unicode_escape) - { + } else if (c == '}') { + if (unicode_escape) { p_start = tok->start; p_end = tok->cur; return MAKE_TOKEN(FSTRING_MIDDLE); @@ -3013,13 +2619,10 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct // scanning (indicated by the end of the expression being set) and we are not at the top level // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double // brackets, we can bypass it here. - if (peek == '}' && !in_format_spec) - { + if (peek == '}' && !in_format_spec) { p_start = tok->start; p_end = tok->cur - 1; - } - else - { + } else { tok_backup(tok, peek); tok_backup(tok, c); TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE; @@ -3027,19 +2630,14 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct p_end = tok->cur; } return MAKE_TOKEN(FSTRING_MIDDLE); - } - else if (c == '\\') - { + } else if (c == '\\') { int peek = tok_nextc(tok); // Special case when the backslash is right before a curly // brace. We have to restore and return the control back // to the loop for the next iteration. - if (peek == '{' || peek == '}') - { - if (!current_tok->f_string_raw) - { - if (warn_invalid_escape_sequence(tok, peek)) - { + if (peek == '{' || peek == '}') { + if (!current_tok->f_string_raw) { + if (warn_invalid_escape_sequence(tok, peek)) { return MAKE_TOKEN(ERRORTOKEN); } } @@ -3047,18 +2645,13 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct continue; } - if (!current_tok->f_string_raw) - { - if (peek == 'N') - { + if (!current_tok->f_string_raw) { + if (peek == 'N') { /* Handle named unicode escapes (\N{BULLET}) */ peek = tok_nextc(tok); - if (peek == '{') - { + if (peek == '{') { unicode_escape = 1; - } - else - { + } else { tok_backup(tok, peek); } } @@ -3070,8 +2663,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct // Backup the f-string quotes to emit a final FSTRING_MIDDLE and // add the quotes to the FSTRING_END in the next tokenizer iteration. - for (int i = 0; i < current_tok->f_string_quote_size; i++) - { + for (int i = 0; i < current_tok->f_string_quote_size; i++) { tok_backup(tok, current_tok->f_string_quote); } p_start = tok->start; @@ -3079,25 +2671,23 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode *current_tok, struct return MAKE_TOKEN(FSTRING_MIDDLE); } + static int tok_get(struct tok_state *tok, struct token *token) { tokenizer_mode *current_tok = TOK_GET_MODE(tok); - if (current_tok->kind == TOK_REGULAR_MODE) - { + if (current_tok->kind == TOK_REGULAR_MODE) { return tok_get_normal_mode(tok, current_tok, token); - } - else - { + } else { return tok_get_fstring_mode(tok, current_tok, token); } } -int _PyTokenizer_Get(struct tok_state *tok, struct token *token) +int +_PyTokenizer_Get(struct tok_state *tok, struct token *token) { int result = tok_get(tok, token); - if (tok->decoding_erred) - { + if (tok->decoding_erred) { result = ERRORTOKEN; tok->done = E_DECODE; } @@ -3107,8 +2697,7 @@ int _PyTokenizer_Get(struct tok_state *tok, struct token *token) #if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3)) // fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's // dup() emulation with open() is slow. -typedef union -{ +typedef union { void *cookie; int fd; } borrowed; @@ -3121,8 +2710,7 @@ borrow_read(void *cookie, char *buf, size_t size) } static FILE * -fdopen_borrow(int fd) -{ +fdopen_borrow(int fd) { // supports only reading. seek fails. close and write are no-ops. cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL}; borrowed b = {.fd = fd}; @@ -3130,11 +2718,9 @@ fdopen_borrow(int fd) } #else static FILE * -fdopen_borrow(int fd) -{ +fdopen_borrow(int fd) { fd = _Py_dup(fd); - if (fd < 0) - { + if (fd < 0) { return NULL; } return fdopen(fd, "r"); @@ -3159,25 +2745,20 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) char *encoding = NULL; fp = fdopen_borrow(fd); - if (fp == NULL) - { + if (fp == NULL) { return NULL; } tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL); - if (tok == NULL) - { + if (tok == NULL) { fclose(fp); return NULL; } - if (filename != NULL) - { + if (filename != NULL) { tok->filename = Py_NewRef(filename); } - else - { + else { tok->filename = PyUnicode_FromString(""); - if (tok->filename == NULL) - { + if (tok->filename == NULL) { fclose(fp); _PyTokenizer_Free(tok); return encoding; @@ -3187,16 +2768,13 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) // We don't want to report warnings here because it could cause infinite recursion // if fetching the encoding shows a warning. tok->report_warnings = 0; - while (tok->lineno < 2 && tok->done == E_OK) - { + while (tok->lineno < 2 && tok->done == E_OK) { _PyTokenizer_Get(tok, &token); } fclose(fp); - if (tok->encoding) - { + if (tok->encoding) { encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1); - if (encoding) - { + if (encoding) { strcpy(encoding, tok->encoding); } } @@ -3205,10 +2783,11 @@ _PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) } #ifdef Py_DEBUG -void tok_dump(int type, char *start, char *end) +void +tok_dump(int type, char *start, char *end) { fprintf(stderr, "%s", _PyParser_TokenNames[type]); if (type == NAME || type == NUMBER || type == STRING || type == OP) fprintf(stderr, "(%.*s)", (int)(end - start), start); } -#endif // Py_DEBUG +#endif // Py_DEBUG From 2c48eaec680ca2fbcd4d68f0ffe0fb1e9b269fe0 Mon Sep 17 00:00:00 2001 From: jx124 <64946984+jx124@users.noreply.github.com> Date: Tue, 2 May 2023 02:11:45 +0800 Subject: [PATCH 6/8] Removed whitespace. --- Lib/test/test_fstring.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 81a64ce1becf4f..abd242ba40ac16 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1548,7 +1548,7 @@ def test_syntax_error_after_debug(self): "f'{1=}{1;'", "f'{1=}{1;}'", ]) - + def test_nested_fstring_max_stack_level(self): with self.assertRaises(SyntaxError): compile('f"{1 1:' + ('{f"1:' * 199), "?", "exec") From de9a495bddd4766f38a00b27b4c13ff3bf357ee6 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 1 May 2023 19:48:38 +0100 Subject: [PATCH 7/8] Use two separate limits and update tests --- Lib/test/test_fstring.py | 21 ++++++++++++++++----- Parser/tokenizer.c | 9 ++++++--- Parser/tokenizer.h | 7 ++++--- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index abd242ba40ac16..44d12b6ee48b8f 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -565,7 +565,23 @@ def test_fstring_nested_too_deeply(self): self.assertAllRaise(SyntaxError, "f-string: expressions nested too deeply", ['f"{1+2:{1+2:{1+1:{1}}}}"']) + + def create_nested_fstring(n): + if n == 0: + return "1+1" + prev = create_nested_fstring(n-1) + return f'f"{{{prev}}}"' + self.assertAllRaise(SyntaxError, + "too many nested f-strings", + [create_nested_fstring(160)]) + + def test_syntax_error_in_nested_fstring(self): + # See gh-104016 for more information on this crash + self.assertAllRaise(SyntaxError, + "invalid syntax", + ['f"{1 1:' + ('{f"1:' * 199)]) + def test_double_braces(self): self.assertEqual(f'{{', '{') self.assertEqual(f'a{{', 'a{') @@ -1355,7 +1371,6 @@ def test_filename_in_syntaxerror(self): # see issue 38964 with temp_cwd() as cwd: file_path = os.path.join(cwd, 't.py') - with open(file_path, 'w', encoding="utf-8") as f: f.write('f"{a b}"') # This generates a SyntaxError _, _, stderr = assert_python_failure(file_path, PYTHONIOENCODING='ascii') @@ -1549,9 +1564,5 @@ def test_syntax_error_after_debug(self): "f'{1=}{1;}'", ]) - def test_nested_fstring_max_stack_level(self): - with self.assertRaises(SyntaxError): - compile('f"{1 1:' + ('{f"1:' * 199), "?", "exec") - if __name__ == '__main__': unittest.main() diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 1f0e8362f9e21c..d2f9fee110ebf5 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -43,12 +43,12 @@ #ifdef Py_DEBUG static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); - assert(tok->tok_mode_stack_index < MAXLEVEL); + assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL); return &(tok->tok_mode_stack[tok->tok_mode_stack_index]); } static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) { assert(tok->tok_mode_stack_index >= 0); - assert(tok->tok_mode_stack_index + 1 < MAXLEVEL); + assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL); return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]); } #else @@ -2235,6 +2235,9 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t p_start = tok->start; p_end = tok->cur; + if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) { + return MAKE_TOKEN(syntaxerror(tok, "too many nested f-strings")); + } tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok); the_current_tok->kind = TOK_FSTRING_MODE; the_current_tok->f_string_quote = quote; @@ -2413,7 +2416,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t case '(': case '[': case '{': - if (tok->level >= MAXLEVEL || tok->tok_mode_stack_index + 1 >= MAXLEVEL) { + if (tok->level >= MAXLEVEL) { return MAKE_TOKEN(syntaxerror(tok, "too many nested parentheses")); } tok->parenstack[tok->level] = c; diff --git a/Parser/tokenizer.h b/Parser/tokenizer.h index 8b4213c4ce3b5a..5e2171885ac75b 100644 --- a/Parser/tokenizer.h +++ b/Parser/tokenizer.h @@ -10,8 +10,9 @@ extern "C" { #include "pycore_token.h" /* For token types */ -#define MAXINDENT 100 /* Max indentation level */ -#define MAXLEVEL 200 /* Max parentheses level */ +#define MAXINDENT 100 /* Max indentation level */ +#define MAXLEVEL 200 /* Max parentheses level */ +#define MAXFSTRINGLEVEL 150 /* Max f-string nesting level */ enum decoding_state { STATE_INIT, @@ -123,7 +124,7 @@ struct tok_state { enum interactive_underflow_t interactive_underflow; int report_warnings; // TODO: Factor this into its own thing - tokenizer_mode tok_mode_stack[MAXLEVEL]; + tokenizer_mode tok_mode_stack[MAXFSTRINGLEVEL]; int tok_mode_stack_index; int tok_report_warnings; #ifdef Py_DEBUG From d3a2fcab4ec4e69506837d1cf57d1a26d646d280 Mon Sep 17 00:00:00 2001 From: Pablo Galindo Date: Mon, 1 May 2023 19:49:06 +0100 Subject: [PATCH 8/8] delete news entry --- Lib/test/test_fstring.py | 1 + .../2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst diff --git a/Lib/test/test_fstring.py b/Lib/test/test_fstring.py index 44d12b6ee48b8f..5c5176dc54a6d9 100644 --- a/Lib/test/test_fstring.py +++ b/Lib/test/test_fstring.py @@ -1371,6 +1371,7 @@ def test_filename_in_syntaxerror(self): # see issue 38964 with temp_cwd() as cwd: file_path = os.path.join(cwd, 't.py') + with open(file_path, 'w', encoding="utf-8") as f: f.write('f"{a b}"') # This generates a SyntaxError _, _, stderr = assert_python_failure(file_path, PYTHONIOENCODING='ascii') diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst b/Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst deleted file mode 100644 index 5e721564f0cb42..00000000000000 --- a/Misc/NEWS.d/next/Core and Builtins/2023-05-01-16-35-02.gh-issue-104016.CjjY_3.rst +++ /dev/null @@ -1 +0,0 @@ -Fixed off-by-1 error in f-string tokenizer.