Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-121130: Fix f-string format specifiers with debug expressions #121150

Merged
merged 10 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Doc/library/ast.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,7 @@ Literals
args=[
Name(id='a', ctx=Load())]),
conversion=-1,
format_spec=JoinedStr(
values=[
Constant(value='.3')]))]))
format_spec=Constant(value='.3'))]))


.. class:: List(elts, ctx)
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -3269,7 +3269,7 @@ def main():
('Expression', ('Subscript', (1, 0, 1, 10), ('List', (1, 0, 1, 3), [('Constant', (1, 1, 1, 2), 5, None)], ('Load',)), ('Slice', (1, 4, 1, 9), ('Constant', (1, 4, 1, 5), 1, None), ('Constant', (1, 6, 1, 7), 1, None), ('Constant', (1, 8, 1, 9), 1, None)), ('Load',))),
('Expression', ('IfExp', (1, 0, 1, 21), ('Name', (1, 9, 1, 10), 'x', ('Load',)), ('Call', (1, 0, 1, 5), ('Name', (1, 0, 1, 3), 'foo', ('Load',)), [], []), ('Call', (1, 16, 1, 21), ('Name', (1, 16, 1, 19), 'bar', ('Load',)), [], []))),
('Expression', ('JoinedStr', (1, 0, 1, 6), [('FormattedValue', (1, 2, 1, 5), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, None)])),
('Expression', ('JoinedStr', (1, 0, 1, 10), [('FormattedValue', (1, 2, 1, 9), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, ('JoinedStr', (1, 4, 1, 8), [('Constant', (1, 5, 1, 8), '.2f', None)]))])),
('Expression', ('JoinedStr', (1, 0, 1, 10), [('FormattedValue', (1, 2, 1, 9), ('Name', (1, 3, 1, 4), 'a', ('Load',)), -1, ('Constant', (1, 5, 1, 8), '.2f', None))])),
pablogsal marked this conversation as resolved.
Show resolved Hide resolved
('Expression', ('JoinedStr', (1, 0, 1, 8), [('FormattedValue', (1, 2, 1, 7), ('Name', (1, 3, 1, 4), 'a', ('Load',)), 114, None)])),
('Expression', ('JoinedStr', (1, 0, 1, 11), [('Constant', (1, 2, 1, 6), 'foo(', None), ('FormattedValue', (1, 6, 1, 9), ('Name', (1, 7, 1, 8), 'a', ('Load',)), -1, None), ('Constant', (1, 9, 1, 10), ')', None)])),
]
Expand Down
9 changes: 9 additions & 0 deletions Lib/test/test_fstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# Unicode identifiers in tests is allowed by PEP 3131.

import ast
import datetime
import dis
import os
import re
Expand Down Expand Up @@ -1601,6 +1602,12 @@ def f(a):
self.assertEqual(f'{f(a=4)}', '3=')
self.assertEqual(x, 4)

# Check debug expressions in format spec
y = 20
self.assertEqual(f"{2:{y=}}", "yyyyyyyyyyyyyyyyyyy2")
self.assertEqual(f"{datetime.datetime.now():h1{y=}h2{y=}h3{y=}}",
'h1y=20h2y=20h3y=20')

# Make sure __format__ is being called.
class C:
def __format__(self, s):
Expand All @@ -1614,9 +1621,11 @@ def __repr__(self):
self.assertEqual(f'{C()=: }', 'C()=FORMAT- ')
self.assertEqual(f'{C()=:x}', 'C()=FORMAT-x')
self.assertEqual(f'{C()=!r:*^20}', 'C()=********REPR********')
self.assertEqual(f"{C():{20=}}", 'FORMAT-20=20')

self.assertRaises(SyntaxError, eval, "f'{C=]'")


# Make sure leading and following text works.
x = 'foo'
self.assertEqual(f'X{x=}Y', 'Xx='+repr(x)+'Y')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix f-strings with debug expressions in format specifiers. Patch by Pablo
Galindo
63 changes: 41 additions & 22 deletions Parser/action_helpers.c
Original file line number Diff line number Diff line change
Expand Up @@ -967,6 +967,8 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
return result_token_with_metadata(p, conv, conv_token->metadata);
}

static asdl_expr_seq *
unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions);
ResultTokenWithMetadata *
_PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, int lineno, int col_offset,
int end_lineno, int end_col_offset, PyArena *arena)
Expand Down Expand Up @@ -1005,8 +1007,15 @@ _PyPegen_setup_full_format_spec(Parser *p, Token *colon, asdl_expr_seq *spec, in
assert(j == non_empty_count);
spec = resized_spec;
}
expr_ty res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
end_col_offset, p->arena);
expr_ty res;
if (asdl_seq_LEN(spec) == 0) {
res = _PyAST_JoinedStr(spec, lineno, col_offset, end_lineno,
end_col_offset, p->arena);
} else {
res = _PyPegen_concatenate_strings(p, spec,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is so we merge and concatenate the Constant and JoinedStr nodes. See how the tree originally looks like here: #121150 (comment)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why the tree in #121150 is false, but also running that same example with the latest status of your branch gives me the same exact one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the confusion. That tree is the correct one, but without this call it basically has a bunch of Constants all together and the Joined strings are nested (comment the code and check it out to see what I mean).

lineno, col_offset, end_lineno,
end_col_offset, arena);
}
if (!res) {
return NULL;
}
Expand Down Expand Up @@ -1306,6 +1315,7 @@ unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions)

expr_ty
_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) {

asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions);
Py_ssize_t n_items = asdl_seq_LEN(expr);

Expand Down Expand Up @@ -1470,7 +1480,6 @@ expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, Re
debug_end_offset = end_col_offset;
debug_metadata = closing_brace->metadata;
}

expr_ty debug_text = _PyAST_Constant(debug_metadata, NULL, lineno, col_offset + 1, debug_end_line,
debug_end_offset - 1, p->arena);
if (!debug_text) {
Expand Down Expand Up @@ -1503,16 +1512,23 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
Py_ssize_t n_flattened_elements = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
if (elem->kind == Constant_kind) {
if (PyBytes_CheckExact(elem->v.Constant.value)) {
bytes_found = 1;
} else {
unicode_string_found = 1;
}
n_flattened_elements++;
} else {
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
f_string_found = 1;
switch(elem->kind) {
case Constant_kind:
if (PyBytes_CheckExact(elem->v.Constant.value)) {
bytes_found = 1;
} else {
unicode_string_found = 1;
}
n_flattened_elements++;
break;
case JoinedStr_kind:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is to accommodate the case when we call this function with other things than JoinedStr and constants

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would this happen? It probably shouldn't, but I may be missing something.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I am flattening collections that contain new nodes in https://github.com/python/cpython/pull/121150/files#r1662410566 (it's a new call over a new array that was not happening before)

n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
f_string_found = 1;
break;
default:
n_flattened_elements++;
f_string_found = 1;
break;
}
}

Expand Down Expand Up @@ -1554,16 +1570,19 @@ _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
Py_ssize_t j = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
if (elem->kind == Constant_kind) {
asdl_seq_SET(flattened, current_pos++, elem);
} else {
for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
if (subvalue == NULL) {
return NULL;
switch(elem->kind) {
case JoinedStr_kind:
for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
if (subvalue == NULL) {
return NULL;
}
asdl_seq_SET(flattened, current_pos++, subvalue);
}
asdl_seq_SET(flattened, current_pos++, subvalue);
}
break;
default:
asdl_seq_SET(flattened, current_pos++, elem);
break;
}
}

Expand Down
24 changes: 19 additions & 5 deletions Parser/lexer/lexer.c
Original file line number Diff line number Diff line change
Expand Up @@ -989,6 +989,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
the_current_tok->last_expr_buffer = NULL;
the_current_tok->last_expr_size = 0;
the_current_tok->last_expr_end = -1;
the_current_tok->in_format_spec = 0;
the_current_tok->f_string_debug = 0;

switch (*tok->start) {
Expand Down Expand Up @@ -1137,15 +1138,20 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
* by the `{` case, so for ensuring that we are on the 0th level, we need
* to adjust it manually */
int cursor = current_tok->curly_bracket_depth - (c != '{');
if (cursor == 0 && !_PyLexer_update_fstring_expr(tok, c)) {
int in_format_spec = current_tok->in_format_spec;
int cursor_in_format_with_debug =
cursor == 1 && (current_tok->f_string_debug || in_format_spec);
int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
if ((cursor_valid) && !_PyLexer_update_fstring_expr(tok, c)) {
return MAKE_TOKEN(ENDMARKER);
}
if (cursor == 0 && c != '{' && set_fstring_expr(tok, token, c)) {
if ((cursor_valid) && c != '{' && set_fstring_expr(tok, token, c)) {
return MAKE_TOKEN(ERRORTOKEN);
}

if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
current_tok->kind = TOK_FSTRING_MODE;
current_tok->in_format_spec = 1;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(_PyToken_OneChar(c));
Expand Down Expand Up @@ -1235,6 +1241,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
current_tok->curly_bracket_expr_start_depth--;
current_tok->kind = TOK_FSTRING_MODE;
current_tok->in_format_spec = 0;
current_tok->f_string_debug = 0;
}
}
Expand Down Expand Up @@ -1317,11 +1324,11 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
tok->multi_line_start = tok->line_start;
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (tok->done == E_ERROR) {
if (tok->done == E_ERROR || tok->done == E_DECODE) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a bug, as if we enter E_DECODE state we were not returning soon enough

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Recently came across this in another branch I'm working on. If we check for this here, we can probably remove the if (tok->decoding_erred) about 10 lines below, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, will do

return MAKE_TOKEN(ERRORTOKEN);
}
int in_format_spec = (
current_tok->last_expr_end != -1
current_tok->in_format_spec
&&
INSIDE_FSTRING_EXPR(current_tok)
);
Expand All @@ -1337,6 +1344,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
if (in_format_spec && c == '\n') {
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(FSTRING_MIDDLE);
Expand Down Expand Up @@ -1378,6 +1386,9 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
}

if (c == '{') {
if (!_PyLexer_update_fstring_expr(tok, c)) {
return MAKE_TOKEN(ENDMARKER);
}
int peek = tok_nextc(tok);
if (peek != '{' || in_format_spec) {
tok_backup(tok, peek);
Expand All @@ -1387,6 +1398,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
}
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
current_tok->in_format_spec = 0;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't this always be 0 here, if we've correctly reset it when the format spec is ending?

p_start = tok->start;
p_end = tok->cur;
} else {
Expand All @@ -1406,13 +1418,15 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
// scanning (indicated by the end of the expression being set) and we are not at the top level
// of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
// brackets, we can bypass it here.
if (peek == '}' && !in_format_spec) {
int cursor = current_tok->curly_bracket_depth;
if (peek == '}' && !in_format_spec && cursor == 0) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need the cursor check here?

p_start = tok->start;
p_end = tok->cur - 1;
} else {
tok_backup(tok, peek);
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
current_tok->in_format_spec = 0;
p_start = tok->start;
p_end = tok->cur;
}
Expand Down
1 change: 1 addition & 0 deletions Parser/lexer/state.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ free_fstring_expressions(struct tok_state *tok)
mode->last_expr_buffer = NULL;
mode->last_expr_size = 0;
mode->last_expr_end = -1;
mode->in_format_spec = 0;
}
}
}
Expand Down
1 change: 1 addition & 0 deletions Parser/lexer/state.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ typedef struct _tokenizer_mode {
Py_ssize_t last_expr_end;
char* last_expr_buffer;
int f_string_debug;
int in_format_spec;
} tokenizer_mode;

/* Tokenizer state */
Expand Down
Loading