Skip to content

Commit

Permalink
pythongh-102856: Tokenize performance improvement (python#104731)
Browse files Browse the repository at this point in the history
  • Loading branch information
mgmacias95 authored May 22, 2023
1 parent 4b107d8 commit 8817886
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 13 deletions.
13 changes: 1 addition & 12 deletions Lib/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,16 +449,6 @@ def _tokenize(rl_gen, encoding):
source = b"".join(rl_gen).decode(encoding)
token = None
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
# TODO: Marta -> clean this up
if 6 < token.type <= 54:
token = token._replace(type=OP)
if token.type in {ASYNC, AWAIT}:
token = token._replace(type=NAME)
if token.type == NEWLINE:
l_start, c_start = token.start
l_end, c_end = token.end
token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

yield token
if token is not None:
last_line, _ = token.start
Expand Down Expand Up @@ -550,8 +540,7 @@ def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
import _tokenize as c_tokenizer
for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
tok, type, lineno, end_lineno, col_off, end_col_off, line = info
yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
yield TokenInfo._make(info)


if __name__ == "__main__":
Expand Down
17 changes: 16 additions & 1 deletion Python/Python-tokenize.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,22 @@ tokenizeriter_next(tokenizeriterobject *it)
end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
}

result = Py_BuildValue("(NinnnnN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
if (it->tok->tok_extra_tokens) {
// Necessary adjustments to match the original Python tokenize
// implementation
if (type > DEDENT && type < OP) {
type = OP;
}
else if (type == ASYNC || type == AWAIT) {
type = NAME;
}
else if (type == NEWLINE) {
str = PyUnicode_FromString("\n");
end_col_offset++;
}
}

result = Py_BuildValue("(iN(nn)(nn)N)", type, str, lineno, col_offset, end_lineno, end_col_offset, line);
exit:
_PyToken_Free(&token);
return result;
Expand Down

0 comments on commit 8817886

Please sign in to comment.