diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..59c989e69 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,11 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] \ No newline at end of file diff --git a/.editorconfig b/.editorconfig index 0824f6693..c4f3c65d8 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,6 +1,6 @@ root = true -[*.{py,pyi,toml,md}] +[*.{py,pyi,rs,toml,md}] charset = "utf-8" end_of_line = lf indent_size = 4 @@ -8,3 +8,7 @@ indent_style = space insert_final_newline = true trim_trailing_whitespace = true max_line_length = 88 + +[*.rs] +# https://github.com/rust-dev-tools/fmt-rfcs/blob/master/guide/guide.md +max_line_length = 100 diff --git a/.flake8 b/.flake8 index e681a88ca..cf63afc5c 100644 --- a/.flake8 +++ b/.flake8 @@ -106,6 +106,7 @@ exclude = .pyre, __pycache__, .tox, + native, max-complexity = 12 diff --git a/.github/workflows/.pyre_configuration.template b/.github/workflows/.pyre_configuration.template index dc25a33fb..4b5b20077 100644 --- a/.github/workflows/.pyre_configuration.template +++ b/.github/workflows/.pyre_configuration.template @@ -1,4 +1,7 @@ { + "exclude": [ + ".*\/native\/.*" + ], "source_directories": [ "." ], diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4ecbea3cf..ccdb91714 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,6 +35,7 @@ jobs: matrix: os: [ubuntu-latest, macos-latest, windows-latest] python-version: [3.6, 3.7, 3.8, 3.9] + parser: [pure, native] steps: - uses: actions/checkout@v1 - uses: actions/setup-python@v2 @@ -48,8 +49,14 @@ jobs: - name: Validate Dependencies if: steps.cache.outputs.cache-hit != 'true' run: exit 1 + - if: ${{ matrix.parser == 'native' }} + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + - run: >- + echo LIBCST_PARSER_TYPE=${{ matrix.parser }} >> $GITHUB_ENV - name: Run Tests - run: python -m unittest + run: python setup.py test # Run linters lint: @@ -166,10 +173,37 @@ jobs: # Build python package build: + name: Build wheels on ${{ matrix.os }}/${{ matrix.vers }} needs: setup - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - vers: i686 + os: ubuntu-20.04 + # aarch64 seems to be stuck + # - vers: aarch64 + # os: ubuntu-20.04 + - vers: auto64 + os: ubuntu-20.04 + - vers: arm64 + os: macos-10.15 + - vers: auto64 + os: macos-10.15 + - vers: auto64 + os: windows-2019 + env: + SCCACHE_VERSION: 0.2.13 + CIBW_BEFORE_ALL_LINUX: "curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y" + CIBW_BEFORE_ALL_MACOS: "rustup target add aarch64-apple-darwin x86_64-apple-darwin" + CIBW_BEFORE_ALL_WINDOWS: "rustup target add x86_64-pc-windows-msvc i686-pc-windows-msvc" + CIBW_ENVIRONMENT: 'PATH="$PATH:$HOME/.cargo/bin"' + CIBW_SKIP: "cp27-* cp34-* cp35-* pp* *-win32 *-win_arm64 *-musllinux_*" + CIBW_ARCHS: ${{ matrix.vers }} + CIBW_BUILD_VERBOSITY: 1 steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 - uses: actions/setup-python@v2 with: python-version: 3.9 @@ -185,13 +219,17 @@ jobs: if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} run: >- echo LIBCST_NO_LOCAL_SCHEME=1 >> $GITHUB_ENV - - name: Build a binary wheel and a source tarball + - name: Build wheels + uses: pypa/cibuildwheel@v2.3.1 + - uses: 
actions/upload-artifact@v2 + with: + path: wheelhouse/*.whl + - name: Build a source tarball run: >- python -m build --sdist - --wheel - --outdir dist/ + --outdir wheelhouse/ - if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} name: Publish distribution 📦 to Test PyPI uses: pypa/gh-action-pypi-publish@release/v1 @@ -199,3 +237,48 @@ jobs: user: __token__ password: ${{ secrets.TEST_PYPI_API_TOKEN }} repository_url: https://test.pypi.org/legacy/ + packages_dir: wheelhouse/ + +# Test rust parts + native: + name: Rust unit tests + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + toolchain: stable + components: rustfmt, clippy + - uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: test + uses: actions-rs/cargo@v1 + with: + command: test + args: --manifest-path=native/Cargo.toml + - name: clippy + uses: actions-rs/clippy-check@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + args: --manifest-path=native/Cargo.toml --all-features + + rustfmt: + name: Rustfmt + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - run: rustup component add rustfmt + - uses: actions-rs/cargo@v1 + with: + command: fmt + args: --all --manifest-path=native/Cargo.toml -- --check diff --git a/.gitignore b/.gitignore index 4a2bbd692..dbe480d7c 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ libcst/_version.py .hypothesis/ .pyre_configuration .python-version +target/ diff --git a/LICENSE b/LICENSE index 0c823502d..13df011c2 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ All contributions towards LibCST are MIT licensed. -Some Python files have been taken from the standard library and are therefore +Some Python files have been derived from the standard library and are therefore PSF licensed. Modifications on these files are dual licensed (both MIT and PSF). These files are: @@ -8,11 +8,13 @@ PSF). These files are: - libcst/_parser/parso/utils.py - libcst/_parser/parso/pgen2/generator.py - libcst/_parser/parso/pgen2/grammar_parser.py -- libcst/_parser/parso/python/token.py +- libcst/_parser/parso/python/py_token.py - libcst/_parser/parso/python/tokenize.py - libcst/_parser/parso/tests/test_fstring.py - libcst/_parser/parso/tests/test_tokenize.py - libcst/_parser/parso/tests/test_utils.py +- libcst_native/src/tokenize/core/mod.rs +- libcst_native/src/tokenize/core/string_types.rs Some Python files have been taken from dataclasses and are therefore Apache licensed. Modifications on these files are licensed under Apache 2.0 license. 
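The build matrix above runs the full Python test suite once per parser; the only switch is the LIBCST_PARSER_TYPE environment variable, which the entrypoints.py hunk later in this diff checks on every parse call via is_native(). A minimal sketch of opting in from user code, assuming a wheel that ships the native extension (without it, the first parse raises ImportError):

    import os

    # Read at parse time by is_native(), so it just needs to be set before
    # the first parse_* call in this process.
    os.environ["LIBCST_PARSER_TYPE"] = "native"

    import libcst

    code = "async def f():\n    await g()\n"
    # LibCST round-trips source exactly, native or pure parser alike.
    assert libcst.parse_module(code).code == code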
diff --git a/MANIFEST.in b/MANIFEST.in index 4402255d7..0f3912c6a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1 +1,4 @@ include README.rst LICENSE CODE_OF_CONDUCT.md CONTRIBUTING.md requirements.txt requirements-dev.txt docs/source/*.rst libcst/py.typed + +include native/Cargo.toml +recursive-include native * diff --git a/libcst/_nodes/tests/test_atom.py b/libcst/_nodes/tests/test_atom.py index da3ea9ef7..d6544e2f7 100644 --- a/libcst/_nodes/tests/test_atom.py +++ b/libcst/_nodes/tests/test_atom.py @@ -9,6 +9,7 @@ import libcst as cst from libcst import parse_expression from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange from libcst.testing.utils import data_provider @@ -1120,6 +1121,8 @@ def test_invalid(self, **kwargs: Any) -> None: ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_dict.py b/libcst/_nodes/tests/test_dict.py index 425adb790..430be5885 100644 --- a/libcst/_nodes/tests/test_dict.py +++ b/libcst/_nodes/tests/test_dict.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_expression from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange from libcst.testing.utils import data_provider @@ -187,4 +188,6 @@ def test_invalid(self, **kwargs: Any) -> None: ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_funcdef.py b/libcst/_nodes/tests/test_funcdef.py index a5d0d080d..4675f9181 100644 --- a/libcst/_nodes/tests/test_funcdef.py +++ b/libcst/_nodes/tests/test_funcdef.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_statement from libcst._nodes.tests.base import CSTNodeTest, DummyIndentedBlock, parse_statement_as +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange from libcst.testing.utils import data_provider @@ -2041,4 +2042,6 @@ def test_valid_38(self, node: cst.CSTNode, code: str) -> None: ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_import.py b/libcst/_nodes/tests/test_import.py index f911029c2..0da7c38c5 100644 --- a/libcst/_nodes/tests/test_import.py +++ b/libcst/_nodes/tests/test_import.py @@ -617,8 +617,10 @@ class ImportFromParseTest(CSTNodeTest): ), cst.ImportAlias(cst.Name("baz"), comma=cst.Comma()), ), + lpar=cst.LeftParen(), + rpar=cst.RightParen(), ), - "code": "from foo import bar, baz,", + "code": "from foo import (bar, baz,)", }, # Star import statement { diff --git a/libcst/_nodes/tests/test_list.py b/libcst/_nodes/tests/test_list.py index a4a08b952..e2f8bd182 100644 --- a/libcst/_nodes/tests/test_list.py +++ b/libcst/_nodes/tests/test_list.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_expression, parse_statement from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange from libcst.testing.utils import 
data_provider @@ -126,4 +127,6 @@ def test_invalid( ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_matrix_multiply.py b/libcst/_nodes/tests/test_matrix_multiply.py index 9f50dd28a..b91147e69 100644 --- a/libcst/_nodes/tests/test_matrix_multiply.py +++ b/libcst/_nodes/tests/test_matrix_multiply.py @@ -11,6 +11,7 @@ parse_expression_as, parse_statement_as, ) +from libcst._parser.entrypoints import is_native from libcst.testing.utils import data_provider @@ -69,4 +70,6 @@ def test_valid(self, **kwargs: Any) -> None: ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_module.py b/libcst/_nodes/tests/test_module.py index 671a23a61..57a8fd431 100644 --- a/libcst/_nodes/tests/test_module.py +++ b/libcst/_nodes/tests/test_module.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_module, parse_statement from libcst._nodes.tests.base import CSTNodeTest +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange, MetadataWrapper, PositionProvider from libcst.testing.utils import data_provider @@ -83,6 +84,7 @@ def test_code_for_node( "empty_program_with_newline": { "code": "\n", "expected": cst.Module([], has_trailing_newline=True), + "enabled_for_native": False, }, "empty_program_with_comments": { "code": "# some comment\n", @@ -112,7 +114,11 @@ def test_code_for_node( }, } ) - def test_parser(self, *, code: str, expected: cst.Module) -> None: + def test_parser( + self, *, code: str, expected: cst.Module, enabled_for_native: bool = True + ) -> None: + if is_native() and not enabled_for_native: + self.skipTest("Disabled for native parser") self.assertEqual(parse_module(code), expected) @data_provider( diff --git a/libcst/_nodes/tests/test_set.py b/libcst/_nodes/tests/test_set.py index 434bf0ab0..3c55268f8 100644 --- a/libcst/_nodes/tests/test_set.py +++ b/libcst/_nodes/tests/test_set.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_expression from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as +from libcst._parser.entrypoints import is_native from libcst.testing.utils import data_provider @@ -133,4 +134,6 @@ def test_invalid( ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_tuple.py b/libcst/_nodes/tests/test_tuple.py index f3a49bed8..6f3b9806d 100644 --- a/libcst/_nodes/tests/test_tuple.py +++ b/libcst/_nodes/tests/test_tuple.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_expression, parse_statement from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange from libcst.testing.utils import data_provider @@ -90,41 +91,6 @@ class TupleTest(CSTNodeTest): "parser": parse_expression, "expected_position": CodeRange((1, 1), (1, 11)), }, - # custom parenthesis on StarredElement - { - "node": cst.Tuple( - [ - cst.StarredElement( - cst.Name("abc"), - lpar=[cst.LeftParen()], - rpar=[cst.RightParen()], - comma=cst.Comma(), - ) - ] - ), - "code": 
"((*abc),)", - "parser": parse_expression, - "expected_position": CodeRange((1, 1), (1, 8)), - }, - # custom whitespace on StarredElement - { - "node": cst.Tuple( - [ - cst.Element(cst.Name("one"), comma=cst.Comma()), - cst.StarredElement( - cst.Name("two"), - whitespace_before_value=cst.SimpleWhitespace(" "), - lpar=[cst.LeftParen()], - rpar=[cst.RightParen()], - ), - ], - lpar=[], - rpar=[], # rpar can't own the trailing whitespace if it's not there - ), - "code": "one,(* two)", - "parser": parse_expression, - "expected_position": CodeRange((1, 0), (1, 12)), - }, # missing spaces around tuple, okay with parenthesis { "node": cst.For( @@ -279,4 +245,6 @@ def test_invalid( ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_with.py b/libcst/_nodes/tests/test_with.py index b74487c78..7a53c2bea 100644 --- a/libcst/_nodes/tests/test_with.py +++ b/libcst/_nodes/tests/test_with.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import PartialParserConfig, parse_statement from libcst._nodes.tests.base import CSTNodeTest, DummyIndentedBlock, parse_statement_as +from libcst._parser.entrypoints import is_native from libcst.metadata import CodeRange from libcst.testing.utils import data_provider @@ -230,4 +231,6 @@ def test_invalid(self, **kwargs: Any) -> None: ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_nodes/tests/test_yield.py b/libcst/_nodes/tests/test_yield.py index 83263beba..7194c04cc 100644 --- a/libcst/_nodes/tests/test_yield.py +++ b/libcst/_nodes/tests/test_yield.py @@ -8,6 +8,7 @@ import libcst as cst from libcst import parse_statement from libcst._nodes.tests.base import CSTNodeTest, parse_statement_as +from libcst._parser.entrypoints import is_native from libcst.helpers import ensure_type from libcst.metadata import CodeRange from libcst.testing.utils import data_provider @@ -240,4 +241,6 @@ def test_valid( ) ) def test_versions(self, **kwargs: Any) -> None: + if is_native() and not kwargs.get("expect_success", True): + self.skipTest("parse errors are disabled for native parser") self.assert_parses(**kwargs) diff --git a/libcst/_parser/conversions/expression.py b/libcst/_parser/conversions/expression.py index 59f2defa1..7d68a1686 100644 --- a/libcst/_parser/conversions/expression.py +++ b/libcst/_parser/conversions/expression.py @@ -581,8 +581,7 @@ def convert_atom_expr( return child -@with_production("atom_expr_await", "'await' atom_expr_trailer", version=">=3.7") -@with_production("atom_expr_await", "AWAIT atom_expr_trailer", version="<=3.6") +@with_production("atom_expr_await", "AWAIT atom_expr_trailer") def convert_atom_expr_await( config: ParserConfig, children: typing.Sequence[typing.Any] ) -> typing.Any: @@ -1509,8 +1508,7 @@ def convert_sync_comp_for( ) -@with_production("comp_for", "['async'] sync_comp_for", version=">=3.7") -@with_production("comp_for", "[ASYNC] sync_comp_for", version="==3.6") +@with_production("comp_for", "[ASYNC] sync_comp_for", version=">=3.6") @with_production("comp_for", "sync_comp_for", version="<=3.5") def convert_comp_for( config: ParserConfig, children: typing.Sequence[typing.Any] diff --git a/libcst/_parser/conversions/statement.py b/libcst/_parser/conversions/statement.py index 
ae0b1d17b..f6ac7fb66 100644 --- a/libcst/_parser/conversions/statement.py +++ b/libcst/_parser/conversions/statement.py @@ -1062,8 +1062,7 @@ def _extract_async( return (parse_empty_lines(config, whitespace_before), asyncnode, stmt.value) -@with_production("asyncable_funcdef", "['async'] funcdef", version=">=3.7") -@with_production("asyncable_funcdef", "[ASYNC] funcdef", version=">=3.5,<3.7") +@with_production("asyncable_funcdef", "[ASYNC] funcdef", version=">=3.5") @with_production("asyncable_funcdef", "funcdef", version="<3.5") def convert_asyncable_funcdef(config: ParserConfig, children: Sequence[Any]) -> Any: leading_lines, asyncnode, funcdef = _extract_async(config, children) @@ -1310,10 +1309,7 @@ def convert_decorated(config: ParserConfig, children: Sequence[Any]) -> Any: @with_production( - "asyncable_stmt", "['async'] (funcdef | with_stmt | for_stmt)", version=">=3.7" -) -@with_production( - "asyncable_stmt", "[ASYNC] (funcdef | with_stmt | for_stmt)", version=">=3.5,<3.7" + "asyncable_stmt", "[ASYNC] (funcdef | with_stmt | for_stmt)", version=">=3.5" ) @with_production("asyncable_stmt", "funcdef | with_stmt | for_stmt", version="<3.5") def convert_asyncable_stmt(config: ParserConfig, children: Sequence[Any]) -> Any: diff --git a/libcst/_parser/detect_config.py b/libcst/_parser/detect_config.py index ca13e7c22..ae3222c15 100644 --- a/libcst/_parser/detect_config.py +++ b/libcst/_parser/detect_config.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from io import BytesIO from tokenize import detect_encoding as py_tokenize_detect_encoding -from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Union +from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Tuple, Union from libcst._nodes.whitespace import NEWLINE_RE from libcst._parser.parso.python.token import PythonTokenTypes, TokenType @@ -114,6 +114,23 @@ def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]: return frozenset(future_imports) +def convert_to_utf8( + source: Union[str, bytes], *, partial: PartialParserConfig +) -> Tuple[str, str]: + """ + Returns an (original encoding, converted source) tuple. 
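+
+    The encoding comes from partial.encoding when one was supplied; otherwise
+    it is detected from the raw bytes by _detect_encoding (BOM / PEP 263
+    coding-cookie detection via the tokenize.detect_encoding import above).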
+ """ + partial_encoding = partial.encoding + encoding = ( + _detect_encoding(source) + if isinstance(partial_encoding, AutoConfig) + else partial_encoding + ) + + source_str = source if isinstance(source, str) else source.decode(encoding) + return (encoding, source_str) + + def detect_config( source: Union[str, bytes], *, @@ -128,14 +145,7 @@ def detect_config( python_version = partial.parsed_python_version - partial_encoding = partial.encoding - encoding = ( - _detect_encoding(source) - if isinstance(partial_encoding, AutoConfig) - else partial_encoding - ) - - source_str = source if isinstance(source, str) else source.decode(encoding) + encoding, source_str = convert_to_utf8(source, partial=partial) partial_default_newline = partial.default_newline default_newline = ( @@ -162,7 +172,7 @@ def detect_config( lines = split_lines(source_str, keepends=True) - tokens = tokenize_lines(lines, python_version) + tokens = tokenize_lines(source_str, lines, python_version) partial_default_indent = partial.default_indent if isinstance(partial_default_indent, AutoConfig): diff --git a/libcst/_parser/entrypoints.py b/libcst/_parser/entrypoints.py index f9e78ad89..28966cd1d 100644 --- a/libcst/_parser/entrypoints.py +++ b/libcst/_parser/entrypoints.py @@ -9,13 +9,15 @@ information """ +import os +from functools import partial from typing import Union from libcst._nodes.base import CSTNode from libcst._nodes.expression import BaseExpression from libcst._nodes.module import Module from libcst._nodes.statement import BaseCompoundStatement, SimpleStatementLine -from libcst._parser.detect_config import detect_config +from libcst._parser.detect_config import convert_to_utf8, detect_config from libcst._parser.grammar import get_grammar, validate_grammar from libcst._parser.python_parser import PythonCSTParser from libcst._parser.types.config import PartialParserConfig @@ -23,6 +25,11 @@ _DEFAULT_PARTIAL_PARSER_CONFIG: PartialParserConfig = PartialParserConfig() +def is_native() -> bool: + typ = os.environ.get("LIBCST_PARSER_TYPE", None) + return typ == "native" + + def _parse( entrypoint: str, source: Union[str, bytes], @@ -30,6 +37,38 @@ def _parse( *, detect_trailing_newline: bool, detect_default_newline: bool, +) -> CSTNode: + if is_native(): + from libcst.native import parse_module, parse_expression, parse_statement + + encoding, source_str = convert_to_utf8(source, partial=config) + + if entrypoint == "file_input": + parse = partial(parse_module, encoding=encoding) + elif entrypoint == "stmt_input": + parse = parse_statement + elif entrypoint == "expression_input": + parse = parse_expression + else: + raise ValueError(f"Unknown parser entry point: {entrypoint}") + + return parse(source_str) + return _pure_python_parse( + entrypoint, + source, + config, + detect_trailing_newline=detect_trailing_newline, + detect_default_newline=detect_default_newline, + ) + + +def _pure_python_parse( + entrypoint: str, + source: Union[str, bytes], + config: PartialParserConfig, + *, + detect_trailing_newline: bool, + detect_default_newline: bool, ) -> CSTNode: detection_result = detect_config( source, diff --git a/libcst/_parser/parso/python/py_token.py b/libcst/_parser/parso/python/py_token.py new file mode 100644 index 000000000..204ce94d9 --- /dev/null +++ b/libcst/_parser/parso/python/py_token.py @@ -0,0 +1,48 @@ +# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. +# Licensed to PSF under a Contributor Agreement. 
+# +# Modifications: +# Copyright David Halter and Contributors +# Modifications are dual-licensed: MIT and PSF. +# 99% of the code is different from pgen2, now. +# +# A fork of `parso.python.token`. +# https://github.com/davidhalter/parso/blob/master/parso/python/token.py +# +# The following changes were made: +# - Explicit TokenType references instead of dynamic creation. +# - Use dataclasses instead of raw classes. +# pyre-unsafe + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class TokenType: + name: str + contains_syntax: bool = False + + def __repr__(self) -> str: + return "%s(%s)" % (self.__class__.__name__, self.name) + + +class PythonTokenTypes: + """ + Basically an enum, but Python 2 doesn't have enums in the standard library. + """ + + STRING: TokenType = TokenType("STRING") + NUMBER: TokenType = TokenType("NUMBER") + NAME: TokenType = TokenType("NAME", contains_syntax=True) + ERRORTOKEN: TokenType = TokenType("ERRORTOKEN") + NEWLINE: TokenType = TokenType("NEWLINE") + INDENT: TokenType = TokenType("INDENT") + DEDENT: TokenType = TokenType("DEDENT") + ERROR_DEDENT: TokenType = TokenType("ERROR_DEDENT") + ASYNC: TokenType = TokenType("ASYNC") + AWAIT: TokenType = TokenType("AWAIT") + FSTRING_STRING: TokenType = TokenType("FSTRING_STRING") + FSTRING_START: TokenType = TokenType("FSTRING_START") + FSTRING_END: TokenType = TokenType("FSTRING_END") + OP: TokenType = TokenType("OP", contains_syntax=True) + ENDMARKER: TokenType = TokenType("ENDMARKER") diff --git a/libcst/_parser/parso/python/token.py b/libcst/_parser/parso/python/token.py index 204ce94d9..e4798f10e 100644 --- a/libcst/_parser/parso/python/token.py +++ b/libcst/_parser/parso/python/token.py @@ -1,48 +1,34 @@ -# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. -# Licensed to PSF under a Contributor Agreement. +# Copyright (c) Facebook, Inc. and its affiliates. # -# Modifications: -# Copyright David Halter and Contributors -# Modifications are dual-licensed: MIT and PSF. -# 99% of the code is different from pgen2, now. -# -# A fork of `parso.python.token`. -# https://github.com/davidhalter/parso/blob/master/parso/python/token.py -# -# The following changes were made: -# - Explicit TokenType references instead of dynamic creation. -# - Use dataclasses instead of raw classes. -# pyre-unsafe - -from dataclasses import dataclass - +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
-@dataclass(frozen=True) -class TokenType: - name: str - contains_syntax: bool = False +try: + from libcst_native import token_type as native_token_type - def __repr__(self) -> str: - return "%s(%s)" % (self.__class__.__name__, self.name) + TokenType = native_token_type.TokenType + class PythonTokenTypes: + STRING: TokenType = native_token_type.STRING + NUMBER: TokenType = native_token_type.NUMBER + NAME: TokenType = native_token_type.NAME + NEWLINE: TokenType = native_token_type.NEWLINE + INDENT: TokenType = native_token_type.INDENT + DEDENT: TokenType = native_token_type.DEDENT + ASYNC: TokenType = native_token_type.ASYNC + AWAIT: TokenType = native_token_type.AWAIT + FSTRING_STRING: TokenType = native_token_type.FSTRING_STRING + FSTRING_START: TokenType = native_token_type.FSTRING_START + FSTRING_END: TokenType = native_token_type.FSTRING_END + OP: TokenType = native_token_type.OP + ENDMARKER: TokenType = native_token_type.ENDMARKER + # unused dummy tokens for backwards compat with the parso tokenizer + ERRORTOKEN: TokenType = native_token_type.ERRORTOKEN + ERROR_DEDENT: TokenType = native_token_type.ERROR_DEDENT -class PythonTokenTypes: - """ - Basically an enum, but Python 2 doesn't have enums in the standard library. - """ - STRING: TokenType = TokenType("STRING") - NUMBER: TokenType = TokenType("NUMBER") - NAME: TokenType = TokenType("NAME", contains_syntax=True) - ERRORTOKEN: TokenType = TokenType("ERRORTOKEN") - NEWLINE: TokenType = TokenType("NEWLINE") - INDENT: TokenType = TokenType("INDENT") - DEDENT: TokenType = TokenType("DEDENT") - ERROR_DEDENT: TokenType = TokenType("ERROR_DEDENT") - ASYNC: TokenType = TokenType("ASYNC") - AWAIT: TokenType = TokenType("AWAIT") - FSTRING_STRING: TokenType = TokenType("FSTRING_STRING") - FSTRING_START: TokenType = TokenType("FSTRING_START") - FSTRING_END: TokenType = TokenType("FSTRING_END") - OP: TokenType = TokenType("OP", contains_syntax=True) - ENDMARKER: TokenType = TokenType("ENDMARKER") +except ImportError: + from libcst._parser.parso.python.py_token import ( # noqa F401 + PythonTokenTypes, + TokenType, + ) diff --git a/libcst/_parser/parso/python/tokenize.py b/libcst/_parser/parso/python/tokenize.py index e816cd62d..380246f00 100644 --- a/libcst/_parser/parso/python/tokenize.py +++ b/libcst/_parser/parso/python/tokenize.py @@ -995,7 +995,14 @@ def dedent_if_necessary(start): indents.append(indent) break if str.isidentifier(token): - yield PythonToken(NAME, token, spos, prefix) + # py37 doesn't need special tokens for async/await, and we could + # emit NAME, but then we'd need different grammar for py36 and py37. + if token == "async": + yield PythonToken(ASYNC, token, spos, prefix) + elif token == "await": + yield PythonToken(AWAIT, token, spos, prefix) + else: + yield PythonToken(NAME, token, spos, prefix) else: for t in _split_illegal_unicode_name(token, spos, prefix): yield t # yield from Python 2 diff --git a/libcst/_parser/py_whitespace_parser.py b/libcst/_parser/py_whitespace_parser.py new file mode 100644 index 000000000..6eabc8eaf --- /dev/null +++ b/libcst/_parser/py_whitespace_parser.py @@ -0,0 +1,260 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
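+#
+# This is the pure-Python whitespace parser implementation, moved out of
+# libcst/_parser/whitespace_parser.py: as the hunk for that file later in
+# this diff shows, the old module now re-exports the libcst_native
+# implementation when it is importable and falls back to this module
+# otherwise.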
+ +from typing import List, Optional, Sequence, Tuple, Union + +from libcst._nodes.whitespace import ( + COMMENT_RE, + NEWLINE_RE, + SIMPLE_WHITESPACE_RE, + Comment, + EmptyLine, + Newline, + ParenthesizedWhitespace, + SimpleWhitespace, + TrailingWhitespace, +) +from libcst._parser.types.config import BaseWhitespaceParserConfig +from libcst._parser.types.whitespace_state import WhitespaceState as State + +# BEGIN PARSER ENTRYPOINTS + + +def parse_simple_whitespace( + config: BaseWhitespaceParserConfig, state: State +) -> SimpleWhitespace: + # The match never fails because the pattern can match an empty string + lines = config.lines + # pyre-fixme[16]: Optional type has no attribute `group`. + ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group(0) + ws_line_list = [ws_line] + while "\\" in ws_line: + # continuation character + state.line += 1 + state.column = 0 + ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group( + 0 + ) + ws_line_list.append(ws_line) + + # TODO: we could special-case the common case where there's no continuation + # character to avoid list construction and joining. + + # once we've finished collecting continuation characters + state.column += len(ws_line) + return SimpleWhitespace("".join(ws_line_list)) + + +def parse_empty_lines( + config: BaseWhitespaceParserConfig, + state: State, + *, + override_absolute_indent: Optional[str] = None, +) -> Sequence[EmptyLine]: + # If override_absolute_indent is true, then we need to parse all lines up + # to and including the last line that is indented at our level. These all + # belong to the footer and not to the next line's leading_lines. All lines + # that have indent=False and come after the last line where indent=True + # do not belong to this node. + state_for_line = State( + state.line, state.column, state.absolute_indent, state.is_parenthesized + ) + lines: List[Tuple[State, EmptyLine]] = [] + while True: + el = _parse_empty_line( + config, state_for_line, override_absolute_indent=override_absolute_indent + ) + if el is None: + break + + # Store the updated state with the element we parsed. Then make a new state + # clone for the next element. + lines.append((state_for_line, el)) + state_for_line = State( + state_for_line.line, + state_for_line.column, + state.absolute_indent, + state.is_parenthesized, + ) + + if override_absolute_indent is not None: + # We need to find the last element that is indented, and then split the list + # at that point. + for i in range(len(lines) - 1, -1, -1): + if lines[i][1].indent: + lines = lines[: (i + 1)] + break + else: + # We didn't find any lines, throw them all away + lines = [] + + if lines: + # Update the state line and column to match the last line actually parsed. + final_state: State = lines[-1][0] + state.line = final_state.line + state.column = final_state.column + return [r[1] for r in lines] + + +def parse_trailing_whitespace( + config: BaseWhitespaceParserConfig, state: State +) -> TrailingWhitespace: + trailing_whitespace = _parse_trailing_whitespace(config, state) + if trailing_whitespace is None: + raise Exception( + "Internal Error: Failed to parse TrailingWhitespace. This should never " + + "happen because a TrailingWhitespace is never optional in the grammar, " + + "so this error should've been caught by parso first." 
+ ) + return trailing_whitespace + + +def parse_parenthesizable_whitespace( + config: BaseWhitespaceParserConfig, state: State +) -> Union[SimpleWhitespace, ParenthesizedWhitespace]: + if state.is_parenthesized: + # First, try parenthesized (don't need speculation because it either + # parses or doesn't modify state). + parenthesized_whitespace = _parse_parenthesized_whitespace(config, state) + if parenthesized_whitespace is not None: + return parenthesized_whitespace + # Now, just parse and return a simple whitespace + return parse_simple_whitespace(config, state) + + +# END PARSER ENTRYPOINTS +# BEGIN PARSER INTERNAL PRODUCTIONS + + +def _parse_empty_line( + config: BaseWhitespaceParserConfig, + state: State, + *, + override_absolute_indent: Optional[str] = None, +) -> Optional[EmptyLine]: + # begin speculative parsing + speculative_state = State( + state.line, state.column, state.absolute_indent, state.is_parenthesized + ) + try: + indent = _parse_indent( + config, speculative_state, override_absolute_indent=override_absolute_indent + ) + except Exception: + # We aren't on a new line, speculative parsing failed + return None + whitespace = parse_simple_whitespace(config, speculative_state) + comment = _parse_comment(config, speculative_state) + newline = _parse_newline(config, speculative_state) + if newline is None: + # speculative parsing failed + return None + # speculative parsing succeeded + state.line = speculative_state.line + state.column = speculative_state.column + # don't need to copy absolute_indent/is_parenthesized because they don't change. + return EmptyLine(indent, whitespace, comment, newline) + + +def _parse_indent( + config: BaseWhitespaceParserConfig, + state: State, + *, + override_absolute_indent: Optional[str] = None, +) -> bool: + """ + Returns True if indentation was found, otherwise False. + """ + absolute_indent = ( + override_absolute_indent + if override_absolute_indent is not None + else state.absolute_indent + ) + line_str = config.lines[state.line - 1] + if state.column != 0: + if state.column == len(line_str) and state.line == len(config.lines): + # We're at EOF, treat this as a failed speculative parse + return False + raise Exception("Internal Error: Column should be 0 when parsing an indent.") + if line_str.startswith(absolute_indent, state.column): + state.column += len(absolute_indent) + return True + return False + + +def _parse_comment( + config: BaseWhitespaceParserConfig, state: State +) -> Optional[Comment]: + comment_match = COMMENT_RE.match(config.lines[state.line - 1], state.column) + if comment_match is None: + return None + comment = comment_match.group(0) + state.column += len(comment) + return Comment(comment) + + +def _parse_newline( + config: BaseWhitespaceParserConfig, state: State +) -> Optional[Newline]: + # begin speculative parsing + line_str = config.lines[state.line - 1] + newline_match = NEWLINE_RE.match(line_str, state.column) + if newline_match is not None: + # speculative parsing succeeded + newline_str = newline_match.group(0) + state.column += len(newline_str) + if state.column != len(line_str): + raise Exception("Internal Error: Found a newline, but it wasn't the EOL.") + if state.line < len(config.lines): + # this newline was the end of a line, and there's another line, + # therefore we should move to the next line + state.line += 1 + state.column = 0 + if newline_str == config.default_newline: + # Just inherit it from the Module instead of explicitly setting it. 
+ return Newline() + else: + return Newline(newline_str) + else: # no newline was found, speculative parsing failed + return None + + +def _parse_trailing_whitespace( + config: BaseWhitespaceParserConfig, state: State +) -> Optional[TrailingWhitespace]: + # Begin speculative parsing + speculative_state = State( + state.line, state.column, state.absolute_indent, state.is_parenthesized + ) + whitespace = parse_simple_whitespace(config, speculative_state) + comment = _parse_comment(config, speculative_state) + newline = _parse_newline(config, speculative_state) + if newline is None: + # Speculative parsing failed + return None + # Speculative parsing succeeded + state.line = speculative_state.line + state.column = speculative_state.column + # don't need to copy absolute_indent/is_parenthesized because they don't change. + return TrailingWhitespace(whitespace, comment, newline) + + +def _parse_parenthesized_whitespace( + config: BaseWhitespaceParserConfig, state: State +) -> Optional[ParenthesizedWhitespace]: + first_line = _parse_trailing_whitespace(config, state) + if first_line is None: + # Speculative parsing failed + return None + empty_lines = () + while True: + empty_line = _parse_empty_line(config, state) + if empty_line is None: + # This isn't an empty line, so parse it below + break + empty_lines = empty_lines + (empty_line,) + indent = _parse_indent(config, state) + last_line = parse_simple_whitespace(config, state) + return ParenthesizedWhitespace(first_line, empty_lines, indent, last_line) diff --git a/libcst/_parser/tests/test_detect_config.py b/libcst/_parser/tests/test_detect_config.py index b17c9fe58..fdda965b8 100644 --- a/libcst/_parser/tests/test_detect_config.py +++ b/libcst/_parser/tests/test_detect_config.py @@ -3,12 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
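+# Note: parser_config_asdict replaces dataclasses.asdict below because it has
+# to work for both ParserConfig implementations; the pure-Python one is a
+# dataclass, but the native one presumably is not.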
-import dataclasses from typing import Union from libcst._parser.detect_config import detect_config from libcst._parser.parso.utils import PythonVersionInfo -from libcst._parser.types.config import ParserConfig, PartialParserConfig +from libcst._parser.types.config import ( + ParserConfig, + PartialParserConfig, + parser_config_asdict, +) from libcst.testing.utils import UnitTest, data_provider @@ -316,7 +319,7 @@ def test_detect_module_config( expected_config: ParserConfig, ) -> None: self.assertEqual( - dataclasses.asdict( + parser_config_asdict( detect_config( source, partial=partial, @@ -324,5 +327,5 @@ def test_detect_module_config( detect_default_newline=detect_default_newline, ).config ), - dataclasses.asdict(expected_config), + parser_config_asdict(expected_config), ) diff --git a/libcst/_parser/tests/test_footer_behavior.py b/libcst/_parser/tests/test_footer_behavior.py index 23ff4e256..f3df77f84 100644 --- a/libcst/_parser/tests/test_footer_behavior.py +++ b/libcst/_parser/tests/test_footer_behavior.py @@ -15,7 +15,10 @@ class FooterBehaviorTest(UnitTest): @data_provider( { # Literally the most basic example - "simple_module": {"code": "\n", "expected_module": cst.Module(body=())}, + "simple_module": { + "code": "", + "expected_module": cst.Module(body=(), has_trailing_newline=False), + }, # A module with a header comment "header_only_module": { "code": "# This is a header comment\n", diff --git a/libcst/_parser/tests/test_parse_errors.py b/libcst/_parser/tests/test_parse_errors.py index 6d651f3b3..17bf3581b 100644 --- a/libcst/_parser/tests/test_parse_errors.py +++ b/libcst/_parser/tests/test_parse_errors.py @@ -8,6 +8,7 @@ from typing import Callable import libcst as cst +from libcst._parser.entrypoints import is_native from libcst.testing.utils import UnitTest, data_provider @@ -169,4 +170,5 @@ def test_parser_syntax_error_str( ) -> None: with self.assertRaises(cst.ParserSyntaxError) as cm: parse_fn() - self.assertEqual(str(cm.exception), expected) + if not is_native(): + self.assertEqual(str(cm.exception), expected) diff --git a/libcst/_parser/tests/test_whitespace_parser.py b/libcst/_parser/tests/test_whitespace_parser.py index 17996b472..22be2412e 100644 --- a/libcst/_parser/tests/test_whitespace_parser.py +++ b/libcst/_parser/tests/test_whitespace_parser.py @@ -3,12 +3,11 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
-from dataclasses import dataclass -from typing import Callable, Sequence, TypeVar +from typing import Callable, TypeVar import libcst as cst from libcst._nodes.deep_equals import deep_equals -from libcst._parser.types.config import BaseWhitespaceParserConfig +from libcst._parser.types.config import MockWhitespaceParserConfig as Config from libcst._parser.types.whitespace_state import WhitespaceState as State from libcst._parser.whitespace_parser import ( parse_empty_lines, @@ -20,12 +19,6 @@ _T = TypeVar("_T") -@dataclass(frozen=True) -class Config(BaseWhitespaceParserConfig): - lines: Sequence[str] - default_newline: str - - class WhitespaceParserTest(UnitTest): @data_provider( { diff --git a/libcst/_parser/tests/test_wrapped_tokenize.py b/libcst/_parser/tests/test_wrapped_tokenize.py index e131f6208..ead004324 100644 --- a/libcst/_parser/tests/test_wrapped_tokenize.py +++ b/libcst/_parser/tests/test_wrapped_tokenize.py @@ -1214,7 +1214,7 @@ class WrappedTokenizeTest(UnitTest): _PY37, ( Token( - type=PythonTokenTypes.NAME, + type=PythonTokenTypes.ASYNC, string="async", start_pos=(1, 0), end_pos=(1, 5), @@ -1364,7 +1364,7 @@ class WrappedTokenizeTest(UnitTest): relative_indent=None, ), Token( - type=PythonTokenTypes.NAME, + type=PythonTokenTypes.AWAIT, string="await", start_pos=(2, 11), end_pos=(2, 16), @@ -1649,7 +1649,7 @@ class WrappedTokenizeTest(UnitTest): _PY38, ( Token( - type=PythonTokenTypes.NAME, + type=PythonTokenTypes.ASYNC, string="async", start_pos=(1, 0), end_pos=(1, 5), @@ -1799,7 +1799,7 @@ class WrappedTokenizeTest(UnitTest): relative_indent=None, ), Token( - type=PythonTokenTypes.NAME, + type=PythonTokenTypes.AWAIT, string="await", start_pos=(2, 11), end_pos=(2, 16), diff --git a/libcst/_parser/types/config.py b/libcst/_parser/types/config.py index 13778b2ae..bf244f1cb 100644 --- a/libcst/_parser/types/config.py +++ b/libcst/_parser/types/config.py @@ -3,14 +3,12 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. - -import abc import codecs import re import sys from dataclasses import dataclass, field, fields from enum import Enum -from typing import FrozenSet, List, Optional, Pattern, Sequence, Union +from typing import Any, Callable, FrozenSet, List, Mapping, Optional, Pattern, Union from libcst._add_slots import add_slots from libcst._nodes.whitespace import NEWLINE_RE @@ -18,33 +16,21 @@ _INDENT_RE: Pattern[str] = re.compile(r"[ \t]+") +try: + from libcst_native import parser_config as config_mod -class BaseWhitespaceParserConfig(abc.ABC): - """ - Represents the subset of ParserConfig that the whitespace parser requires. This - makes calling the whitespace parser in tests with a mocked configuration easier. - """ - - lines: Sequence[str] - default_newline: str - + MockWhitespaceParserConfig = config_mod.BaseWhitespaceParserConfig +except ImportError: + from libcst._parser.types import py_config as config_mod -@add_slots # We'll access these properties frequently, so use slots -@dataclass(frozen=True) -class ParserConfig(BaseWhitespaceParserConfig): - """ - An internal configuration object that the python parser passes around. These values - are global to the parsed code and should not change during the lifetime of the - parser object. 
- """ + # pyre-fixme[9]: This is a small implementation difference between native and python + MockWhitespaceParserConfig = config_mod.MockWhitespaceParserConfig - lines: Sequence[str] - encoding: str - default_indent: str - default_newline: str - has_trailing_newline: bool - version: PythonVersionInfo - future_imports: FrozenSet[str] +BaseWhitespaceParserConfig = config_mod.BaseWhitespaceParserConfig +ParserConfig = config_mod.ParserConfig +parser_config_asdict: Callable[ + [ParserConfig], Mapping[str, Any] +] = config_mod.parser_config_asdict class AutoConfig(Enum): diff --git a/libcst/_parser/types/py_config.py b/libcst/_parser/types/py_config.py new file mode 100644 index 000000000..6722a9eae --- /dev/null +++ b/libcst/_parser/types/py_config.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import abc +from dataclasses import asdict, dataclass +from typing import Any, FrozenSet, Mapping, Sequence + +from libcst._parser.parso.utils import PythonVersionInfo + + +class BaseWhitespaceParserConfig(abc.ABC): + """ + Represents the subset of ParserConfig that the whitespace parser requires. This + makes calling the whitespace parser in tests with a mocked configuration easier. + """ + + lines: Sequence[str] + default_newline: str + + +@dataclass(frozen=True) +class MockWhitespaceParserConfig(BaseWhitespaceParserConfig): + """ + An internal type used by unit tests. + """ + + lines: Sequence[str] + default_newline: str + + +@dataclass(frozen=True) +class ParserConfig(BaseWhitespaceParserConfig): + """ + An internal configuration object that the python parser passes around. These + values are global to the parsed code and should not change during the lifetime + of the parser object. + """ + + lines: Sequence[str] + encoding: str + default_indent: str + default_newline: str + has_trailing_newline: bool + version: PythonVersionInfo + future_imports: FrozenSet[str] + + +def parser_config_asdict(config: ParserConfig) -> Mapping[str, Any]: + """ + An internal helper function used by unit tests to compare configs. + """ + return asdict(config) diff --git a/libcst/_parser/types/py_token.py b/libcst/_parser/types/py_token.py new file mode 100644 index 000000000..60ddb2a2e --- /dev/null +++ b/libcst/_parser/types/py_token.py @@ -0,0 +1,27 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from dataclasses import dataclass +from typing import Optional, Tuple + +from libcst._add_slots import add_slots +from libcst._parser.parso.python.token import TokenType +from libcst._parser.types.whitespace_state import WhitespaceState + + +@add_slots +@dataclass(frozen=True) +class Token: + type: TokenType + string: str + # The start of where `string` is in the source, not including leading whitespace. + start_pos: Tuple[int, int] + # The end of where `string` is in the source, not including trailing whitespace. + end_pos: Tuple[int, int] + whitespace_before: WhitespaceState + whitespace_after: WhitespaceState + # The relative indent this token adds. 
+ relative_indent: Optional[str] diff --git a/libcst/_parser/types/py_whitespace_state.py b/libcst/_parser/types/py_whitespace_state.py new file mode 100644 index 000000000..41244b98a --- /dev/null +++ b/libcst/_parser/types/py_whitespace_state.py @@ -0,0 +1,36 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass + +from libcst._add_slots import add_slots + + +@add_slots +@dataclass(frozen=False) +class WhitespaceState: + """ + A frequently mutated store of the whitespace parser's current state. This object + must be cloned prior to speculative parsing. + + This is in contrast to the `config` object each whitespace parser function takes, + which is frozen and never mutated. + + Whitespace parsing works by mutating this state object. By encapsulating saving, and + re-using state objects inside the top-level python parser, the whitespace parser is + able to be reentrant. One 'convert' function can consume part of the whitespace, and + another 'convert' function can consume the rest, depending on who owns what + whitespace. + + This is similar to the approach you might take to parse nested languages (e.g. + JavaScript inside of HTML). We're treating whitespace as a separate language and + grammar from the rest of Python's grammar. + """ + + line: int # one-indexed (to match parso's behavior) + column: int # zero-indexed (to match parso's behavior) + # What to look for when executing `_parse_indent`. + absolute_indent: str + is_parenthesized: bool diff --git a/libcst/_parser/types/token.py b/libcst/_parser/types/token.py index 60ddb2a2e..88d50b25f 100644 --- a/libcst/_parser/types/token.py +++ b/libcst/_parser/types/token.py @@ -4,24 +4,9 @@ # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass -from typing import Optional, Tuple +try: + from libcst_native import tokenize -from libcst._add_slots import add_slots -from libcst._parser.parso.python.token import TokenType -from libcst._parser.types.whitespace_state import WhitespaceState - - -@add_slots -@dataclass(frozen=True) -class Token: - type: TokenType - string: str - # The start of where `string` is in the source, not including leading whitespace. - start_pos: Tuple[int, int] - # The end of where `string` is in the source, not including trailing whitespace. - end_pos: Tuple[int, int] - whitespace_before: WhitespaceState - whitespace_after: WhitespaceState - # The relative indent this token adds. - relative_indent: Optional[str] + Token = tokenize.Token +except ImportError: + from libcst._parser.types.py_token import Token # noqa F401 diff --git a/libcst/_parser/types/whitespace_state.py b/libcst/_parser/types/whitespace_state.py index b5554a2bc..a9798054c 100644 --- a/libcst/_parser/types/whitespace_state.py +++ b/libcst/_parser/types/whitespace_state.py @@ -7,34 +7,9 @@ Defines the state object used by the whitespace parser. """ -from dataclasses import dataclass +try: + from libcst_native import whitespace_state as mod +except ImportError: + from libcst._parser.types import py_whitespace_state as mod -from libcst._add_slots import add_slots - - -@add_slots -@dataclass(frozen=False) -class WhitespaceState: - """ - A frequently mutated store of the whitespace parser's current state. This object - must be cloned prior to speculative parsing. 
- - This is in contrast to the `config` object each whitespace parser function takes, - which is frozen and never mutated. - - Whitespace parsing works by mutating this state object. By encapsulating saving, and - re-using state objects inside the top-level python parser, the whitespace parser is - able to be reentrant. One 'convert' function can consume part of the whitespace, and - another 'convert' function can consume the rest, depending on who owns what - whitespace. - - This is similar to the approach you might take to parse nested languages (e.g. - JavaScript inside of HTML). We're treating whitespace as a separate language and - grammar from the rest of Python's grammar. - """ - - line: int # one-indexed (to match parso's behavior) - column: int # zero-indexed (to match parso's behavior) - # What to look for when executing `_parse_indent`. - absolute_indent: str - is_parenthesized: bool +WhitespaceState = mod.WhitespaceState diff --git a/libcst/_parser/whitespace_parser.py b/libcst/_parser/whitespace_parser.py index 27892a271..9ffb6a7dc 100644 --- a/libcst/_parser/whitespace_parser.py +++ b/libcst/_parser/whitespace_parser.py @@ -5,7 +5,7 @@ """ Parso doesn't attempt to parse (or even emit tokens for) whitespace or comments that -isn't syntatically important. Instead, we're just given the whitespace as a "prefix" of +aren't syntactically important. Instead, we're just given the whitespace as a "prefix" of the token. However, in our CST, whitespace is gathered into far more detailed objects than a simple @@ -15,258 +15,19 @@ hand-rolled recursive descent parser. """ -from typing import List, Optional, Sequence, Tuple, Union - -from libcst._nodes.whitespace import ( - COMMENT_RE, - NEWLINE_RE, - SIMPLE_WHITESPACE_RE, - Comment, - EmptyLine, - Newline, - ParenthesizedWhitespace, - SimpleWhitespace, - TrailingWhitespace, -) -from libcst._parser.types.config import BaseWhitespaceParserConfig - -from libcst._parser.types.whitespace_state import WhitespaceState as State - -# BEGIN PARSER ENTRYPOINTS - - -def parse_simple_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> SimpleWhitespace: - # The match never fails because the pattern can match an empty string - lines = config.lines - # pyre-fixme[16]: Optional type has no attribute `group`. - ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group(0) - ws_line_list = [ws_line] - while "\\" in ws_line: - # continuation character - state.line += 1 - state.column = 0 - ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group( - 0 - ) - ws_line_list.append(ws_line) - - # TODO: we could special-case the common case where there's no continuation - # character to avoid list construction and joining. - - # once we've finished collecting continuation characters - state.column += len(ws_line) - return SimpleWhitespace("".join(ws_line_list)) - - -def parse_empty_lines( - config: BaseWhitespaceParserConfig, - state: State, - *, - override_absolute_indent: Optional[str] = None, -) -> Sequence[EmptyLine]: - # If override_absolute_indent is true, then we need to parse all lines up - # to and including the last line that is indented at our level. These all - # belong to the footer and not to the next line's leading_lines. All lines - # that have indent=False and come after the last line where indent=True - # do not belong to this node. 
- state_for_line = State( - state.line, state.column, state.absolute_indent, state.is_parenthesized - ) - lines: List[Tuple[State, EmptyLine]] = [] - while True: - el = _parse_empty_line( - config, state_for_line, override_absolute_indent=override_absolute_indent - ) - if el is None: - break - - # Store the updated state with the element we parsed. Then make a new state - # clone for the next element. - lines.append((state_for_line, el)) - state_for_line = State( - state_for_line.line, - state_for_line.column, - state.absolute_indent, - state.is_parenthesized, - ) - - if override_absolute_indent is not None: - # We need to find the last element that is indented, and then split the list - # at that point. - for i in range(len(lines) - 1, -1, -1): - if lines[i][1].indent: - lines = lines[: (i + 1)] - break - else: - # We didn't find any lines, throw them all away - lines = [] - - if lines: - # Update the state line and column to match the last line actually parsed. - final_state: State = lines[-1][0] - state.line = final_state.line - state.column = final_state.column - return [r[1] for r in lines] - - -def parse_trailing_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> TrailingWhitespace: - trailing_whitespace = _parse_trailing_whitespace(config, state) - if trailing_whitespace is None: - raise Exception( - "Internal Error: Failed to parse TrailingWhitespace. This should never " - + "happen because a TrailingWhitespace is never optional in the grammar, " - + "so this error should've been caught by parso first." - ) - return trailing_whitespace - - -def parse_parenthesizable_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> Union[SimpleWhitespace, ParenthesizedWhitespace]: - if state.is_parenthesized: - # First, try parenthesized (don't need speculation because it either - # parses or doesn't modify state). - parenthesized_whitespace = _parse_parenthesized_whitespace(config, state) - if parenthesized_whitespace is not None: - return parenthesized_whitespace - # Now, just parse and return a simple whitespace - return parse_simple_whitespace(config, state) - - -# END PARSER ENTRYPOINTS -# BEGIN PARSER INTERNAL PRODUCTIONS - - -def _parse_empty_line( - config: BaseWhitespaceParserConfig, - state: State, - *, - override_absolute_indent: Optional[str] = None, -) -> Optional[EmptyLine]: - # begin speculative parsing - speculative_state = State( - state.line, state.column, state.absolute_indent, state.is_parenthesized - ) - try: - indent = _parse_indent( - config, speculative_state, override_absolute_indent=override_absolute_indent - ) - except Exception: - # We aren't on a new line, speculative parsing failed - return None - whitespace = parse_simple_whitespace(config, speculative_state) - comment = _parse_comment(config, speculative_state) - newline = _parse_newline(config, speculative_state) - if newline is None: - # speculative parsing failed - return None - # speculative parsing succeeded - state.line = speculative_state.line - state.column = speculative_state.column - # don't need to copy absolute_indent/is_parenthesized because they don't change. - return EmptyLine(indent, whitespace, comment, newline) - - -def _parse_indent( - config: BaseWhitespaceParserConfig, - state: State, - *, - override_absolute_indent: Optional[str] = None, -) -> bool: - """ - Returns True if indentation was found, otherwise False. 
- """ - absolute_indent = ( - override_absolute_indent - if override_absolute_indent is not None - else state.absolute_indent - ) - line_str = config.lines[state.line - 1] - if state.column != 0: - if state.column == len(line_str) and state.line == len(config.lines): - # We're at EOF, treat this as a failed speculative parse - return False - raise Exception("Internal Error: Column should be 0 when parsing an indent.") - if line_str.startswith(absolute_indent, state.column): - state.column += len(absolute_indent) - return True - return False - - -def _parse_comment( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[Comment]: - comment_match = COMMENT_RE.match(config.lines[state.line - 1], state.column) - if comment_match is None: - return None - comment = comment_match.group(0) - state.column += len(comment) - return Comment(comment) - - -def _parse_newline( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[Newline]: - # begin speculative parsing - line_str = config.lines[state.line - 1] - newline_match = NEWLINE_RE.match(line_str, state.column) - if newline_match is not None: - # speculative parsing succeeded - newline_str = newline_match.group(0) - state.column += len(newline_str) - if state.column != len(line_str): - raise Exception("Internal Error: Found a newline, but it wasn't the EOL.") - if state.line < len(config.lines): - # this newline was the end of a line, and there's another line, - # therefore we should move to the next line - state.line += 1 - state.column = 0 - if newline_str == config.default_newline: - # Just inherit it from the Module instead of explicitly setting it. - return Newline() - else: - return Newline(newline_str) - else: # no newline was found, speculative parsing failed - return None - - -def _parse_trailing_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[TrailingWhitespace]: - # Begin speculative parsing - speculative_state = State( - state.line, state.column, state.absolute_indent, state.is_parenthesized - ) - whitespace = parse_simple_whitespace(config, speculative_state) - comment = _parse_comment(config, speculative_state) - newline = _parse_newline(config, speculative_state) - if newline is None: - # Speculative parsing failed - return None - # Speculative parsing succeeded - state.line = speculative_state.line - state.column = speculative_state.column - # don't need to copy absolute_indent/is_parenthesized because they don't change. 
- return TrailingWhitespace(whitespace, comment, newline) - - -def _parse_parenthesized_whitespace( - config: BaseWhitespaceParserConfig, state: State -) -> Optional[ParenthesizedWhitespace]: - first_line = _parse_trailing_whitespace(config, state) - if first_line is None: - # Speculative parsing failed - return None - empty_lines = () - while True: - empty_line = _parse_empty_line(config, state) - if empty_line is None: - # This isn't an empty line, so parse it below - break - empty_lines = empty_lines + (empty_line,) - indent = _parse_indent(config, state) - last_line = parse_simple_whitespace(config, state) - return ParenthesizedWhitespace(first_line, empty_lines, indent, last_line) +try: + # It'd be better to do `from libcst_native.whitespace_parser import *`, but we're + # blocked on https://github.com/PyO3/pyo3/issues/759 + # (which ultimately seems to be a limitation of how importlib works) + from libcst_native import whitespace_parser as mod +except ImportError: + from libcst._parser import py_whitespace_parser as mod + +# pyre-fixme[5]: There's no sane way to type these re-exports +parse_simple_whitespace = mod.parse_simple_whitespace +# pyre-fixme[5]: There's no sane way to type these re-exports +parse_empty_lines = mod.parse_empty_lines +# pyre-fixme[5]: There's no sane way to type these re-exports +parse_trailing_whitespace = mod.parse_trailing_whitespace +# pyre-fixme[5]: There's no sane way to type these re-exports +parse_parenthesizable_whitespace = mod.parse_parenthesizable_whitespace diff --git a/libcst/_parser/wrapped_tokenize.py b/libcst/_parser/wrapped_tokenize.py index d77ed68c7..ae86c9105 100644 --- a/libcst/_parser/wrapped_tokenize.py +++ b/libcst/_parser/wrapped_tokenize.py @@ -22,7 +22,7 @@ from dataclasses import dataclass, field from enum import Enum -from typing import Generator, List, Optional, Sequence +from typing import Generator, Iterator, List, Optional, Sequence from libcst._add_slots import add_slots from libcst._exceptions import ParserSyntaxError @@ -76,15 +76,30 @@ class _TokenizeState: ) -def tokenize( - code: str, version_info: PythonVersionInfo -) -> Generator[Token, None, None]: - lines = split_lines(code, keepends=True) - return tokenize_lines(lines, version_info) +def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]: + try: + from libcst_native import tokenize as native_tokenize + + return native_tokenize.tokenize(code) + except ImportError: + lines = split_lines(code, keepends=True) + return tokenize_lines(code, lines, version_info) def tokenize_lines( - lines: Sequence[str], version_info: PythonVersionInfo + code: str, lines: Sequence[str], version_info: PythonVersionInfo +) -> Iterator[Token]: + try: + from libcst_native import tokenize as native_tokenize + + # TODO: pass through version_info + return native_tokenize.tokenize(code) + except ImportError: + return tokenize_lines_py(code, lines, version_info) + + +def tokenize_lines_py( + code: str, lines: Sequence[str], version_info: PythonVersionInfo ) -> Generator[Token, None, None]: state = _TokenizeState(lines) orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info)) diff --git a/libcst/codegen/tests/test_codegen_clean.py b/libcst/codegen/tests/test_codegen_clean.py index dad5166e3..c2f18df2a 100644 --- a/libcst/codegen/tests/test_codegen_clean.py +++ b/libcst/codegen/tests/test_codegen_clean.py @@ -22,7 +22,7 @@ def test_codegen_clean_visitor_functions(self) -> None: """ new_code = clean_generated_code("\n".join(visitor_codegen.generated_code)) new_file = 
os.path.join( - os.path.dirname(os.path.abspath(__file__)), "visitor_codegen.py.deleteme" + os.path.dirname(os.path.abspath(__file__)), "visitor_codegen.deleteme.py" ) with open(new_file, "w") as fp: fp.write(new_code) @@ -56,7 +56,7 @@ def test_codegen_clean_matcher_classes(self) -> None: """ new_code = clean_generated_code("\n".join(matcher_codegen.generated_code)) new_file = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "matcher_codegen.py.deleteme" + os.path.dirname(os.path.abspath(__file__)), "matcher_codegen.deleteme.py" ) with open(new_file, "w") as fp: fp.write(new_code) @@ -90,7 +90,7 @@ def test_codegen_clean_return_types(self) -> None: """ new_code = clean_generated_code("\n".join(type_codegen.generated_code)) new_file = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "type_codegen.py.deleteme" + os.path.dirname(os.path.abspath(__file__)), "type_codegen.deleteme.py" ) with open(new_file, "w") as fp: fp.write(new_code) diff --git a/libcst/codemod/tests/test_codemod_cli.py b/libcst/codemod/tests/test_codemod_cli.py index 7daa70564..f586ea19e 100644 --- a/libcst/codemod/tests/test_codemod_cli.py +++ b/libcst/codemod/tests/test_codemod_cli.py @@ -9,6 +9,7 @@ import sys from pathlib import Path +from libcst._parser.entrypoints import is_native from libcst.testing.utils import UnitTest @@ -27,7 +28,7 @@ def test_codemod_formatter_error_input(self) -> None: stderr=subprocess.PIPE, ) version = sys.version_info - if version[0] == 3 and version[1] == 6: + if version[0] == 3 and version[1] == 6 and not is_native(): self.assertIn( "ParserSyntaxError: Syntax Error @ 14:11.", rlt.stderr.decode("utf-8"), diff --git a/libcst/matchers/tests/test_extract.py b/libcst/matchers/tests/test_extract.py index 77c134a8a..298f3fec9 100644 --- a/libcst/matchers/tests/test_extract.py +++ b/libcst/matchers/tests/test_extract.py @@ -408,9 +408,11 @@ def test_extract_sequence_element(self) -> None: ] ), ) - extracted_seq = cst.ensure_type( - cst.ensure_type(expression, cst.Tuple).elements[1].value, cst.Call - ).args + extracted_seq = tuple( + cst.ensure_type( + cst.ensure_type(expression, cst.Tuple).elements[1].value, cst.Call + ).args + ) self.assertEqual(nodes, {"args": extracted_seq}) # Verify false behavior diff --git a/libcst/tests/test_e2e.py b/libcst/tests/test_e2e.py index 1e68ed6c3..1b836eab7 100644 --- a/libcst/tests/test_e2e.py +++ b/libcst/tests/test_e2e.py @@ -59,6 +59,8 @@ def test_leaky_codemod(self) -> None: hide_progress=True, ) + print(result) + # Check results self.assertEqual(2, result.successes) self.assertEqual(0, result.skips) diff --git a/native/Cargo.lock b/native/Cargo.lock new file mode 100644 index 000000000..92b17afe8 --- /dev/null +++ b/native/Cargo.lock @@ -0,0 +1,884 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "annotate-snippets" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7021ce4924a3f25f802b2cccd1af585e39ea1a363a1aa2e72afe54b67a3a7a7" + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + +[[package]] +name = "bstr" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chic" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5b5db619f3556839cb2223ae86ff3f9a09da2c5013be42bc9af08c9589bf70c" +dependencies = [ + "annotate-snippets", +] + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + +[[package]] +name = "criterion" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab327ed7354547cc2ef43cbe20ef68b988e70b4b593cbd66a2a61733123a3d23" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools 0.10.1", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-cycles-per-byte" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d34485a578330c7a91ccf064674f3739a7aebbf3b9d7fd498a6d3e8f7473c96" +dependencies = [ + "criterion", +] + +[[package]] +name = "criterion-plot" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e022feadec601fba1649cfa83586381a4ad31c6bf3a9ab7d408118b05dd9889d" +dependencies = [ + "cast", + "itertools 0.9.0", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if", + "lazy_static", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "difference" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "half" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "indoc" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47741a8bc60fb26eb8d6e0238bbb26d8575ff623fdc97b1a2c00c050b9684ed8" +dependencies = [ + "indoc-impl", + "proc-macro-hack", +] + +[[package]] +name = "indoc-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce046d161f000fffde5f432a0d034d0341dc152643b2598ed5bfce44c4f3a8f0" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", + "unindent", +] + +[[package]] +name = "instant" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee0328b1209d157ef001c94dd85b4f8f64139adb0eac2659f4b08382b2f474d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" + +[[package]] +name = "js-sys" +version = "0.3.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" + +[[package]] +name = "libcst" +version = "0.1.0" +dependencies = [ + "chic", + "criterion", + "criterion-cycles-per-byte", + "difference", + "itertools 0.10.1", + "libcst_derive", + "once_cell", + "paste 1.0.5", + "peg", + "pyo3", + "regex", + "thiserror", +] + +[[package]] +name = "libcst_derive" +version = "0.1.0" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "lock_api" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "memchr" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "parking_lot" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" +dependencies = [ + 
"cfg-if", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "paste" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45ca20c77d80be666aef2b45486da86238fabe33e38306bd3118fe4af33fa880" +dependencies = [ + "paste-impl", + "proc-macro-hack", +] + +[[package]] +name = "paste" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf547ad0c65e31259204bd90935776d1c693cec2f4ff7abb7a1bbbd40dfe58" + +[[package]] +name = "paste-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d95a7db200b97ef370c8e6de0088252f7e0dfff7d047a28528e47456c0fc98b6" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "peg" +version = "0.7.0" +source = "git+https://github.com/kevinmehall/rust-peg#4b146b4b78a80c07e43d7ace2d97f65bfde279a8" +dependencies = [ + "peg-macros", + "peg-runtime", +] + +[[package]] +name = "peg-macros" +version = "0.7.0" +source = "git+https://github.com/kevinmehall/rust-peg#4b146b4b78a80c07e43d7ace2d97f65bfde279a8" +dependencies = [ + "peg-runtime", + "proc-macro2", + "quote", +] + +[[package]] +name = "peg-runtime" +version = "0.7.0" +source = "git+https://github.com/kevinmehall/rust-peg#4b146b4b78a80c07e43d7ace2d97f65bfde279a8" + +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "pyo3" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35100f9347670a566a67aa623369293703322bb9db77d99d7df7313b575ae0c8" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "parking_lot", + "paste 0.1.18", + "pyo3-build-config", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d12961738cacbd7f91b7c43bc25cfeeaa2698ad07a04b3be0aa88b950865738f" +dependencies = [ + "once_cell", +] + +[[package]] +name = "pyo3-macros" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0bc5215d704824dfddddc03f93cb572e1155c68b6761c37005e1c288808ea8" +dependencies = [ + "pyo3-macros-backend", + "quote", + "syn", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"71623fc593224afaab918aa3afcaf86ed2f43d34f6afde7f3922608f253240df" +dependencies = [ + "proc-macro2", + "pyo3-build-config", + "quote", + "syn", +] + +[[package]] +name = "quote" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "semver" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f3aac57ee7f3272d8395c6e4f502f434f0e289fcd62876f70daa008c20dcabe" + +[[package]] +name = "serde" +version = "1.0.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" + +[[package]] +name = "serde_cbor" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + +[[package]] +name = "syn" +version = "1.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thiserror" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "unicode-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "unindent" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f14ee04d9415b52b3aeab06258a3f07093182b88ba0f9b8d203f211a7a7d41c7" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", 
+] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" + +[[package]] +name = "web-sys" +version = "0.3.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/native/Cargo.toml b/native/Cargo.toml new file mode 100644 index 000000000..3a0b79f7e --- /dev/null +++ b/native/Cargo.toml @@ -0,0 +1,6 @@ +[workspace] + +members = [ + "libcst", + "libcst_derive", +] diff --git a/native/libcst/Cargo.toml b/native/libcst/Cargo.toml new file mode 100644 index 000000000..f009e9189 --- /dev/null +++ b/native/libcst/Cargo.toml @@ -0,0 +1,47 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +[package] +name = "libcst" +version = "0.1.0" +authors = ["LibCST Developers"] +edition = "2018" + +[lib] +name = "libcst_native" +crate-type = ["cdylib", "rlib"] + +[[bin]] +name = "parse" +path = "src/bin.rs" + +[features] +# This is a bit of a hack, since `cargo test` doesn't work with `extension-module`. +# To run tests, use `cargo test --no-default-features`. +# +# Once https://github.com/PyO3/pyo3/pull/1123 lands, it may be better to use +# `-Zextra-link-arg` for this instead. 
+default = ["pyo3/extension-module"] +trace = ["peg/trace"] + +[dependencies] +paste = "1.0.4" +pyo3 = "0.14.4" +thiserror = "1.0.23" +peg = { git = "https://github.com/kevinmehall/rust-peg" } +chic = "1.2.2" +itertools = "0.10.0" +once_cell = "1.5.2" +regex = "1.5.4" +libcst_derive = { path = "../libcst_derive" } + +[dev-dependencies] +criterion = { version = "0.3.4", features = ["html_reports"] } +criterion-cycles-per-byte = "0.1" +difference = "2.0.0" + +[[bench]] +name = "parser_benchmark" +harness = false diff --git a/native/libcst/Grammar b/native/libcst/Grammar new file mode 100644 index 000000000..274db7132 --- /dev/null +++ b/native/libcst/Grammar @@ -0,0 +1,707 @@ +# PEG grammar for Python 3.9 + +@trailer ''' +void * +_PyPegen_parse(Parser *p) +{ + // Initialize keywords + p->keywords = reserved_keywords; + p->n_keyword_lists = n_keyword_lists; + + // Run parser + void *result = NULL; + if (p->start_rule == Py_file_input) { + result = file_rule(p); + } else if (p->start_rule == Py_single_input) { + result = interactive_rule(p); + } else if (p->start_rule == Py_eval_input) { + result = eval_rule(p); + } else if (p->start_rule == Py_func_type_input) { + result = func_type_rule(p); + } else if (p->start_rule == Py_fstring_input) { + result = fstring_rule(p); + } + + return result; +} + +// The end +''' +file[mod_ty]: a=[statements] ENDMARKER { _PyPegen_make_module(p, a) } +interactive[mod_ty]: a=statement_newline { Interactive(a, p->arena) } +eval[mod_ty]: a=expressions NEWLINE* ENDMARKER { Expression(a, p->arena) } +func_type[mod_ty]: '(' a=[type_expressions] ')' '->' b=expression NEWLINE* ENDMARKER { FunctionType(a, b, p->arena) } +fstring[expr_ty]: star_expressions + +# type_expressions allow */** but ignore them +type_expressions[asdl_seq*]: + | a=','.expression+ ',' '*' b=expression ',' '**' c=expression { + _PyPegen_seq_append_to_end(p, CHECK(_PyPegen_seq_append_to_end(p, a, b)), c) } + | a=','.expression+ ',' '*' b=expression { _PyPegen_seq_append_to_end(p, a, b) } + | a=','.expression+ ',' '**' b=expression { _PyPegen_seq_append_to_end(p, a, b) } + | '*' a=expression ',' '**' b=expression { + _PyPegen_seq_append_to_end(p, CHECK(_PyPegen_singleton_seq(p, a)), b) } + | '*' a=expression { _PyPegen_singleton_seq(p, a) } + | '**' a=expression { _PyPegen_singleton_seq(p, a) } + | ','.expression+ + +statements[asdl_seq*]: a=statement+ { _PyPegen_seq_flatten(p, a) } +statement[asdl_seq*]: a=compound_stmt { _PyPegen_singleton_seq(p, a) } | simple_stmt +statement_newline[asdl_seq*]: + | a=compound_stmt NEWLINE { _PyPegen_singleton_seq(p, a) } + | simple_stmt + | NEWLINE { _PyPegen_singleton_seq(p, CHECK(_Py_Pass(EXTRA))) } + | ENDMARKER { _PyPegen_interactive_exit(p) } +simple_stmt[asdl_seq*]: + | a=small_stmt !';' NEWLINE { _PyPegen_singleton_seq(p, a) } # Not needed, there for speedup + | a=';'.small_stmt+ [';'] NEWLINE { a } +# NOTE: assignment MUST precede expression, else parsing a simple assignment +# will throw a SyntaxError. 
+small_stmt[stmt_ty] (memo): + | assignment + | e=star_expressions { _Py_Expr(e, EXTRA) } + | &'return' return_stmt + | &('import' | 'from') import_stmt + | &'raise' raise_stmt + | 'pass' { _Py_Pass(EXTRA) } + | &'del' del_stmt + | &'yield' yield_stmt + | &'assert' assert_stmt + | 'break' { _Py_Break(EXTRA) } + | 'continue' { _Py_Continue(EXTRA) } + | &'global' global_stmt + | &'nonlocal' nonlocal_stmt +compound_stmt[stmt_ty]: + | &('def' | '@' | ASYNC) function_def + | &'if' if_stmt + | &('class' | '@') class_def + | &('with' | ASYNC) with_stmt + | &('for' | ASYNC) for_stmt + | &'try' try_stmt + | &'while' while_stmt + +# NOTE: annotated_rhs may start with 'yield'; yield_expr must start with 'yield' +assignment[stmt_ty]: + | a=NAME ':' b=expression c=['=' d=annotated_rhs { d }] { + CHECK_VERSION( + 6, + "Variable annotation syntax is", + _Py_AnnAssign(CHECK(_PyPegen_set_expr_context(p, a, Store)), b, c, 1, EXTRA) + ) } + | a=('(' b=single_target ')' { b } + | single_subscript_attribute_target) ':' b=expression c=['=' d=annotated_rhs { d }] { + CHECK_VERSION(6, "Variable annotations syntax is", _Py_AnnAssign(a, b, c, 0, EXTRA)) } + | a=(z=star_targets '=' { z })+ b=(yield_expr | star_expressions) !'=' tc=[TYPE_COMMENT] { + _Py_Assign(a, b, NEW_TYPE_COMMENT(p, tc), EXTRA) } + | a=single_target b=augassign ~ c=(yield_expr | star_expressions) { + _Py_AugAssign(a, b->kind, c, EXTRA) } + | invalid_assignment + +augassign[AugOperator*]: + | '+=' { _PyPegen_augoperator(p, Add) } + | '-=' { _PyPegen_augoperator(p, Sub) } + | '*=' { _PyPegen_augoperator(p, Mult) } + | '@=' { CHECK_VERSION(5, "The '@' operator is", _PyPegen_augoperator(p, MatMult)) } + | '/=' { _PyPegen_augoperator(p, Div) } + | '%=' { _PyPegen_augoperator(p, Mod) } + | '&=' { _PyPegen_augoperator(p, BitAnd) } + | '|=' { _PyPegen_augoperator(p, BitOr) } + | '^=' { _PyPegen_augoperator(p, BitXor) } + | '<<=' { _PyPegen_augoperator(p, LShift) } + | '>>=' { _PyPegen_augoperator(p, RShift) } + | '**=' { _PyPegen_augoperator(p, Pow) } + | '//=' { _PyPegen_augoperator(p, FloorDiv) } + +global_stmt[stmt_ty]: 'global' a=','.NAME+ { + _Py_Global(CHECK(_PyPegen_map_names_to_ids(p, a)), EXTRA) } +nonlocal_stmt[stmt_ty]: 'nonlocal' a=','.NAME+ { + _Py_Nonlocal(CHECK(_PyPegen_map_names_to_ids(p, a)), EXTRA) } + +yield_stmt[stmt_ty]: y=yield_expr { _Py_Expr(y, EXTRA) } + +assert_stmt[stmt_ty]: 'assert' a=expression b=[',' z=expression { z }] { _Py_Assert(a, b, EXTRA) } + +del_stmt[stmt_ty]: + | 'del' a=del_targets &(';' | NEWLINE) { _Py_Delete(a, EXTRA) } + | invalid_del_stmt + +import_stmt[stmt_ty]: import_name | import_from +import_name[stmt_ty]: 'import' a=dotted_as_names { _Py_Import(a, EXTRA) } +# note below: the ('.' | '...') is necessary because '...' is tokenized as ELLIPSIS +import_from[stmt_ty]: + | 'from' a=('.' | '...')* b=dotted_name 'import' c=import_from_targets { + _Py_ImportFrom(b->v.Name.id, c, _PyPegen_seq_count_dots(a), EXTRA) } + | 'from' a=('.' | '...')+ 'import' b=import_from_targets { + _Py_ImportFrom(NULL, b, _PyPegen_seq_count_dots(a), EXTRA) } +import_from_targets[asdl_seq*]: + | '(' a=import_from_as_names [','] ')' { a } + | import_from_as_names !',' + | '*' { _PyPegen_singleton_seq(p, CHECK(_PyPegen_alias_for_star(p))) } + | invalid_import_from_targets +import_from_as_names[asdl_seq*]: + | a=','.import_from_as_name+ { a } +import_from_as_name[alias_ty]: + | a=NAME b=['as' z=NAME { z }] { _Py_alias(a->v.Name.id, + (b) ? 
((expr_ty) b)->v.Name.id : NULL, + p->arena) } +dotted_as_names[asdl_seq*]: + | a=','.dotted_as_name+ { a } +dotted_as_name[alias_ty]: + | a=dotted_name b=['as' z=NAME { z }] { _Py_alias(a->v.Name.id, + (b) ? ((expr_ty) b)->v.Name.id : NULL, + p->arena) } +dotted_name[expr_ty]: + | a=dotted_name '.' b=NAME { _PyPegen_join_names_with_dot(p, a, b) } + | NAME + +if_stmt[stmt_ty]: + | 'if' a=named_expression ':' b=block c=elif_stmt { _Py_If(a, b, CHECK(_PyPegen_singleton_seq(p, c)), EXTRA) } + | 'if' a=named_expression ':' b=block c=[else_block] { _Py_If(a, b, c, EXTRA) } +elif_stmt[stmt_ty]: + | 'elif' a=named_expression ':' b=block c=elif_stmt { _Py_If(a, b, CHECK(_PyPegen_singleton_seq(p, c)), EXTRA) } + | 'elif' a=named_expression ':' b=block c=[else_block] { _Py_If(a, b, c, EXTRA) } +else_block[asdl_seq*]: 'else' ':' b=block { b } + +while_stmt[stmt_ty]: + | 'while' a=named_expression ':' b=block c=[else_block] { _Py_While(a, b, c, EXTRA) } + +for_stmt[stmt_ty]: + | 'for' t=star_targets 'in' ~ ex=star_expressions ':' tc=[TYPE_COMMENT] b=block el=[else_block] { + _Py_For(t, ex, b, el, NEW_TYPE_COMMENT(p, tc), EXTRA) } + | ASYNC 'for' t=star_targets 'in' ~ ex=star_expressions ':' tc=[TYPE_COMMENT] b=block el=[else_block] { + CHECK_VERSION(5, "Async for loops are", _Py_AsyncFor(t, ex, b, el, NEW_TYPE_COMMENT(p, tc), EXTRA)) } + | invalid_for_target + +with_stmt[stmt_ty]: + | 'with' '(' a=','.with_item+ ','? ')' ':' b=block { + _Py_With(a, b, NULL, EXTRA) } + | 'with' a=','.with_item+ ':' tc=[TYPE_COMMENT] b=block { + _Py_With(a, b, NEW_TYPE_COMMENT(p, tc), EXTRA) } + | ASYNC 'with' '(' a=','.with_item+ ','? ')' ':' b=block { + CHECK_VERSION(5, "Async with statements are", _Py_AsyncWith(a, b, NULL, EXTRA)) } + | ASYNC 'with' a=','.with_item+ ':' tc=[TYPE_COMMENT] b=block { + CHECK_VERSION(5, "Async with statements are", _Py_AsyncWith(a, b, NEW_TYPE_COMMENT(p, tc), EXTRA)) } +with_item[withitem_ty]: + | e=expression 'as' t=star_target &(',' | ')' | ':') { _Py_withitem(e, t, p->arena) } + | invalid_with_item + | e=expression { _Py_withitem(e, NULL, p->arena) } + +try_stmt[stmt_ty]: + | 'try' ':' b=block f=finally_block { _Py_Try(b, NULL, NULL, f, EXTRA) } + | 'try' ':' b=block ex=except_block+ el=[else_block] f=[finally_block] { _Py_Try(b, ex, el, f, EXTRA) } +except_block[excepthandler_ty]: + | 'except' e=expression t=['as' z=NAME { z }] ':' b=block { + _Py_ExceptHandler(e, (t) ? ((expr_ty) t)->v.Name.id : NULL, b, EXTRA) } + | 'except' ':' b=block { _Py_ExceptHandler(NULL, NULL, b, EXTRA) } +finally_block[asdl_seq*]: 'finally' ':' a=block { a } + +return_stmt[stmt_ty]: + | 'return' a=[star_expressions] { _Py_Return(a, EXTRA) } + +raise_stmt[stmt_ty]: + | 'raise' a=expression b=['from' z=expression { z }] { _Py_Raise(a, b, EXTRA) } + | 'raise' { _Py_Raise(NULL, NULL, EXTRA) } + +function_def[stmt_ty]: + | d=decorators f=function_def_raw { _PyPegen_function_def_decorators(p, d, f) } + | function_def_raw + +function_def_raw[stmt_ty]: + | 'def' n=NAME '(' params=[params] ')' a=['->' z=expression { z }] ':' tc=[func_type_comment] b=block { + _Py_FunctionDef(n->v.Name.id, + (params) ? params : CHECK(_PyPegen_empty_arguments(p)), + b, NULL, a, NEW_TYPE_COMMENT(p, tc), EXTRA) } + | ASYNC 'def' n=NAME '(' params=[params] ')' a=['->' z=expression { z }] ':' tc=[func_type_comment] b=block { + CHECK_VERSION( + 5, + "Async functions are", + _Py_AsyncFunctionDef(n->v.Name.id, + (params) ? 
params : CHECK(_PyPegen_empty_arguments(p)), + b, NULL, a, NEW_TYPE_COMMENT(p, tc), EXTRA) + ) } +func_type_comment[Token*]: + | NEWLINE t=TYPE_COMMENT &(NEWLINE INDENT) { t } # Must be followed by indented block + | invalid_double_type_comments + | TYPE_COMMENT + +params[arguments_ty]: + | invalid_parameters + | parameters + +parameters[arguments_ty]: + | a=slash_no_default b=param_no_default* c=param_with_default* d=[star_etc] { + _PyPegen_make_arguments(p, a, NULL, b, c, d) } + | a=slash_with_default b=param_with_default* c=[star_etc] { + _PyPegen_make_arguments(p, NULL, a, NULL, b, c) } + | a=param_no_default+ b=param_with_default* c=[star_etc] { + _PyPegen_make_arguments(p, NULL, NULL, a, b, c) } + | a=param_with_default+ b=[star_etc] { _PyPegen_make_arguments(p, NULL, NULL, NULL, a, b)} + | a=star_etc { _PyPegen_make_arguments(p, NULL, NULL, NULL, NULL, a) } + +# Some duplication here because we can't write (',' | &')'), +# which is because we don't support empty alternatives (yet). +# +slash_no_default[asdl_seq*]: + | a=param_no_default+ '/' ',' { a } + | a=param_no_default+ '/' &')' { a } +slash_with_default[SlashWithDefault*]: + | a=param_no_default* b=param_with_default+ '/' ',' { _PyPegen_slash_with_default(p, a, b) } + | a=param_no_default* b=param_with_default+ '/' &')' { _PyPegen_slash_with_default(p, a, b) } + +star_etc[StarEtc*]: + | '*' a=param_no_default b=param_maybe_default* c=[kwds] { + _PyPegen_star_etc(p, a, b, c) } + | '*' ',' b=param_maybe_default+ c=[kwds] { + _PyPegen_star_etc(p, NULL, b, c) } + | a=kwds { _PyPegen_star_etc(p, NULL, NULL, a) } + | invalid_star_etc + +kwds[arg_ty]: '**' a=param_no_default { a } + +# One parameter. This *includes* a following comma and type comment. +# +# There are three styles: +# - No default +# - With default +# - Maybe with default +# +# There are two alternative forms of each, to deal with type comments: +# - Ends in a comma followed by an optional type comment +# - No comma, optional type comment, must be followed by close paren +# The latter form is for a final parameter without trailing comma. +# +param_no_default[arg_ty]: + | a=param ',' tc=TYPE_COMMENT? { _PyPegen_add_type_comment_to_arg(p, a, tc) } + | a=param tc=TYPE_COMMENT? &')' { _PyPegen_add_type_comment_to_arg(p, a, tc) } +param_with_default[NameDefaultPair*]: + | a=param c=default ',' tc=TYPE_COMMENT? { _PyPegen_name_default_pair(p, a, c, tc) } + | a=param c=default tc=TYPE_COMMENT? &')' { _PyPegen_name_default_pair(p, a, c, tc) } +param_maybe_default[NameDefaultPair*]: + | a=param c=default? ',' tc=TYPE_COMMENT? { _PyPegen_name_default_pair(p, a, c, tc) } + | a=param c=default? tc=TYPE_COMMENT? &')' { _PyPegen_name_default_pair(p, a, c, tc) } +param[arg_ty]: a=NAME b=annotation? { _Py_arg(a->v.Name.id, b, NULL, EXTRA) } + +annotation[expr_ty]: ':' a=expression { a } +default[expr_ty]: '=' a=expression { a } + +decorators[asdl_seq*]: a=('@' f=named_expression NEWLINE { f })+ { a } + +class_def[stmt_ty]: + | a=decorators b=class_def_raw { _PyPegen_class_def_decorators(p, a, b) } + | class_def_raw +class_def_raw[stmt_ty]: + | 'class' a=NAME b=['(' z=[arguments] ')' { z }] ':' c=block { + _Py_ClassDef(a->v.Name.id, + (b) ? ((expr_ty) b)->v.Call.args : NULL, + (b) ? 
((expr_ty) b)->v.Call.keywords : NULL, + c, NULL, EXTRA) } + +block[asdl_seq*] (memo): + | NEWLINE INDENT a=statements DEDENT { a } + | simple_stmt + | invalid_block + +star_expressions[expr_ty]: + | a=star_expression b=(',' c=star_expression { c })+ [','] { + _Py_Tuple(CHECK(_PyPegen_seq_insert_in_front(p, a, b)), Load, EXTRA) } + | a=star_expression ',' { _Py_Tuple(CHECK(_PyPegen_singleton_seq(p, a)), Load, EXTRA) } + | star_expression +star_expression[expr_ty] (memo): + | '*' a=bitwise_or { _Py_Starred(a, Load, EXTRA) } + | expression + +star_named_expressions[asdl_seq*]: a=','.star_named_expression+ [','] { a } +star_named_expression[expr_ty]: + | '*' a=bitwise_or { _Py_Starred(a, Load, EXTRA) } + | named_expression +named_expression[expr_ty]: + | a=NAME ':=' ~ b=expression { _Py_NamedExpr(CHECK(_PyPegen_set_expr_context(p, a, Store)), b, EXTRA) } + | expression !':=' + | invalid_named_expression + +annotated_rhs[expr_ty]: yield_expr | star_expressions + +expressions[expr_ty]: + | a=expression b=(',' c=expression { c })+ [','] { + _Py_Tuple(CHECK(_PyPegen_seq_insert_in_front(p, a, b)), Load, EXTRA) } + | a=expression ',' { _Py_Tuple(CHECK(_PyPegen_singleton_seq(p, a)), Load, EXTRA) } + | expression +expression[expr_ty] (memo): + | a=disjunction 'if' b=disjunction 'else' c=expression { _Py_IfExp(b, a, c, EXTRA) } + | disjunction + | lambdef + +lambdef[expr_ty]: + | 'lambda' a=[lambda_params] ':' b=expression { _Py_Lambda((a) ? a : CHECK(_PyPegen_empty_arguments(p)), b, EXTRA) } + +lambda_params[arguments_ty]: + | invalid_lambda_parameters + | lambda_parameters + +# lambda_parameters etc. duplicates parameters but without annotations +# or type comments, and if there's no comma after a parameter, we expect +# a colon, not a close parenthesis. (For more, see parameters above.) 
+# +lambda_parameters[arguments_ty]: + | a=lambda_slash_no_default b=lambda_param_no_default* c=lambda_param_with_default* d=[lambda_star_etc] { + _PyPegen_make_arguments(p, a, NULL, b, c, d) } + | a=lambda_slash_with_default b=lambda_param_with_default* c=[lambda_star_etc] { + _PyPegen_make_arguments(p, NULL, a, NULL, b, c) } + | a=lambda_param_no_default+ b=lambda_param_with_default* c=[lambda_star_etc] { + _PyPegen_make_arguments(p, NULL, NULL, a, b, c) } + | a=lambda_param_with_default+ b=[lambda_star_etc] { _PyPegen_make_arguments(p, NULL, NULL, NULL, a, b)} + | a=lambda_star_etc { _PyPegen_make_arguments(p, NULL, NULL, NULL, NULL, a) } + +lambda_slash_no_default[asdl_seq*]: + | a=lambda_param_no_default+ '/' ',' { a } + | a=lambda_param_no_default+ '/' &':' { a } +lambda_slash_with_default[SlashWithDefault*]: + | a=lambda_param_no_default* b=lambda_param_with_default+ '/' ',' { _PyPegen_slash_with_default(p, a, b) } + | a=lambda_param_no_default* b=lambda_param_with_default+ '/' &':' { _PyPegen_slash_with_default(p, a, b) } + +lambda_star_etc[StarEtc*]: + | '*' a=lambda_param_no_default b=lambda_param_maybe_default* c=[lambda_kwds] { + _PyPegen_star_etc(p, a, b, c) } + | '*' ',' b=lambda_param_maybe_default+ c=[lambda_kwds] { + _PyPegen_star_etc(p, NULL, b, c) } + | a=lambda_kwds { _PyPegen_star_etc(p, NULL, NULL, a) } + | invalid_lambda_star_etc + +lambda_kwds[arg_ty]: '**' a=lambda_param_no_default { a } + +lambda_param_no_default[arg_ty]: + | a=lambda_param ',' { a } + | a=lambda_param &':' { a } +lambda_param_with_default[NameDefaultPair*]: + | a=lambda_param c=default ',' { _PyPegen_name_default_pair(p, a, c, NULL) } + | a=lambda_param c=default &':' { _PyPegen_name_default_pair(p, a, c, NULL) } +lambda_param_maybe_default[NameDefaultPair*]: + | a=lambda_param c=default? ',' { _PyPegen_name_default_pair(p, a, c, NULL) } + | a=lambda_param c=default? &':' { _PyPegen_name_default_pair(p, a, c, NULL) } +lambda_param[arg_ty]: a=NAME { _Py_arg(a->v.Name.id, NULL, NULL, EXTRA) } + +disjunction[expr_ty] (memo): + | a=conjunction b=('or' c=conjunction { c })+ { _Py_BoolOp( + Or, + CHECK(_PyPegen_seq_insert_in_front(p, a, b)), + EXTRA) } + | conjunction +conjunction[expr_ty] (memo): + | a=inversion b=('and' c=inversion { c })+ { _Py_BoolOp( + And, + CHECK(_PyPegen_seq_insert_in_front(p, a, b)), + EXTRA) } + | inversion +inversion[expr_ty] (memo): + | 'not' a=inversion { _Py_UnaryOp(Not, a, EXTRA) } + | comparison +comparison[expr_ty]: + | a=bitwise_or b=compare_op_bitwise_or_pair+ { + _Py_Compare(a, CHECK(_PyPegen_get_cmpops(p, b)), CHECK(_PyPegen_get_exprs(p, b)), EXTRA) } + | bitwise_or +compare_op_bitwise_or_pair[CmpopExprPair*]: + | eq_bitwise_or + | noteq_bitwise_or + | lte_bitwise_or + | lt_bitwise_or + | gte_bitwise_or + | gt_bitwise_or + | notin_bitwise_or + | in_bitwise_or + | isnot_bitwise_or + | is_bitwise_or +eq_bitwise_or[CmpopExprPair*]: '==' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, Eq, a) } +noteq_bitwise_or[CmpopExprPair*]: + | (tok='!=' { _PyPegen_check_barry_as_flufl(p, tok) ? 
NULL : tok}) a=bitwise_or {_PyPegen_cmpop_expr_pair(p, NotEq, a) } +lte_bitwise_or[CmpopExprPair*]: '<=' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, LtE, a) } +lt_bitwise_or[CmpopExprPair*]: '<' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, Lt, a) } +gte_bitwise_or[CmpopExprPair*]: '>=' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, GtE, a) } +gt_bitwise_or[CmpopExprPair*]: '>' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, Gt, a) } +notin_bitwise_or[CmpopExprPair*]: 'not' 'in' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, NotIn, a) } +in_bitwise_or[CmpopExprPair*]: 'in' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, In, a) } +isnot_bitwise_or[CmpopExprPair*]: 'is' 'not' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, IsNot, a) } +is_bitwise_or[CmpopExprPair*]: 'is' a=bitwise_or { _PyPegen_cmpop_expr_pair(p, Is, a) } + +bitwise_or[expr_ty]: + | a=bitwise_or '|' b=bitwise_xor { _Py_BinOp(a, BitOr, b, EXTRA) } + | bitwise_xor +bitwise_xor[expr_ty]: + | a=bitwise_xor '^' b=bitwise_and { _Py_BinOp(a, BitXor, b, EXTRA) } + | bitwise_and +bitwise_and[expr_ty]: + | a=bitwise_and '&' b=shift_expr { _Py_BinOp(a, BitAnd, b, EXTRA) } + | shift_expr +shift_expr[expr_ty]: + | a=shift_expr '<<' b=sum { _Py_BinOp(a, LShift, b, EXTRA) } + | a=shift_expr '>>' b=sum { _Py_BinOp(a, RShift, b, EXTRA) } + | sum + +sum[expr_ty]: + | a=sum '+' b=term { _Py_BinOp(a, Add, b, EXTRA) } + | a=sum '-' b=term { _Py_BinOp(a, Sub, b, EXTRA) } + | term +term[expr_ty]: + | a=term '*' b=factor { _Py_BinOp(a, Mult, b, EXTRA) } + | a=term '/' b=factor { _Py_BinOp(a, Div, b, EXTRA) } + | a=term '//' b=factor { _Py_BinOp(a, FloorDiv, b, EXTRA) } + | a=term '%' b=factor { _Py_BinOp(a, Mod, b, EXTRA) } + | a=term '@' b=factor { CHECK_VERSION(5, "The '@' operator is", _Py_BinOp(a, MatMult, b, EXTRA)) } + | factor +factor[expr_ty] (memo): + | '+' a=factor { _Py_UnaryOp(UAdd, a, EXTRA) } + | '-' a=factor { _Py_UnaryOp(USub, a, EXTRA) } + | '~' a=factor { _Py_UnaryOp(Invert, a, EXTRA) } + | power +power[expr_ty]: + | a=await_primary '**' b=factor { _Py_BinOp(a, Pow, b, EXTRA) } + | await_primary +await_primary[expr_ty] (memo): + | AWAIT a=primary { CHECK_VERSION(5, "Await expressions are", _Py_Await(a, EXTRA)) } + | primary +primary[expr_ty]: + | invalid_primary # must be before 'primay genexp' because of invalid_genexp + | a=primary '.' b=NAME { _Py_Attribute(a, b->v.Name.id, Load, EXTRA) } + | a=primary b=genexp { _Py_Call(a, CHECK(_PyPegen_singleton_seq(p, b)), NULL, EXTRA) } + | a=primary '(' b=[arguments] ')' { + _Py_Call(a, + (b) ? ((expr_ty) b)->v.Call.args : NULL, + (b) ? ((expr_ty) b)->v.Call.keywords : NULL, + EXTRA) } + | a=primary '[' b=slices ']' { _Py_Subscript(a, b, Load, EXTRA) } + | atom + +slices[expr_ty]: + | a=slice !',' { a } + | a=','.slice+ [','] { _Py_Tuple(a, Load, EXTRA) } +slice[expr_ty]: + | a=[expression] ':' b=[expression] c=[':' d=[expression] { d }] { _Py_Slice(a, b, c, EXTRA) } + | a=expression { a } +atom[expr_ty]: + | NAME + | 'True' { _Py_Constant(Py_True, NULL, EXTRA) } + | 'False' { _Py_Constant(Py_False, NULL, EXTRA) } + | 'None' { _Py_Constant(Py_None, NULL, EXTRA) } + | '__peg_parser__' { RAISE_SYNTAX_ERROR("You found it!") } + | &STRING strings + | NUMBER + | &'(' (tuple | group | genexp) + | &'[' (list | listcomp) + | &'{' (dict | set | dictcomp | setcomp) + | '...' 
{ _Py_Constant(Py_Ellipsis, NULL, EXTRA) } + +strings[expr_ty] (memo): a=STRING+ { _PyPegen_concatenate_strings(p, a) } +list[expr_ty]: + | '[' a=[star_named_expressions] ']' { _Py_List(a, Load, EXTRA) } +listcomp[expr_ty]: + | '[' a=named_expression ~ b=for_if_clauses ']' { _Py_ListComp(a, b, EXTRA) } + | invalid_comprehension +tuple[expr_ty]: + | '(' a=[y=star_named_expression ',' z=[star_named_expressions] { _PyPegen_seq_insert_in_front(p, y, z) } ] ')' { + _Py_Tuple(a, Load, EXTRA) } +group[expr_ty]: + | '(' a=(yield_expr | named_expression) ')' { a } + | invalid_group +genexp[expr_ty]: + | '(' a=named_expression ~ b=for_if_clauses ')' { _Py_GeneratorExp(a, b, EXTRA) } + | invalid_comprehension +set[expr_ty]: '{' a=star_named_expressions '}' { _Py_Set(a, EXTRA) } +setcomp[expr_ty]: + | '{' a=named_expression ~ b=for_if_clauses '}' { _Py_SetComp(a, b, EXTRA) } + | invalid_comprehension +dict[expr_ty]: + | '{' a=[double_starred_kvpairs] '}' { + _Py_Dict(CHECK(_PyPegen_get_keys(p, a)), CHECK(_PyPegen_get_values(p, a)), EXTRA) } +dictcomp[expr_ty]: + | '{' a=kvpair b=for_if_clauses '}' { _Py_DictComp(a->key, a->value, b, EXTRA) } + | invalid_dict_comprehension +double_starred_kvpairs[asdl_seq*]: a=','.double_starred_kvpair+ [','] { a } +double_starred_kvpair[KeyValuePair*]: + | '**' a=bitwise_or { _PyPegen_key_value_pair(p, NULL, a) } + | kvpair +kvpair[KeyValuePair*]: a=expression ':' b=expression { _PyPegen_key_value_pair(p, a, b) } +for_if_clauses[asdl_seq*]: + | for_if_clause+ +for_if_clause[comprehension_ty]: + | ASYNC 'for' a=star_targets 'in' ~ b=disjunction c=('if' z=disjunction { z })* { + CHECK_VERSION(6, "Async comprehensions are", _Py_comprehension(a, b, c, 1, p->arena)) } + | 'for' a=star_targets 'in' ~ b=disjunction c=('if' z=disjunction { z })* { + _Py_comprehension(a, b, c, 0, p->arena) } + | invalid_for_target + +yield_expr[expr_ty]: + | 'yield' 'from' a=expression { _Py_YieldFrom(a, EXTRA) } + | 'yield' a=[star_expressions] { _Py_Yield(a, EXTRA) } + +arguments[expr_ty] (memo): + | a=args [','] &')' { a } + | invalid_arguments +args[expr_ty]: + | a=','.(starred_expression | named_expression !'=')+ b=[',' k=kwargs {k}] { _PyPegen_collect_call_seqs(p, a, b, EXTRA) } + | a=kwargs { _Py_Call(_PyPegen_dummy_name(p), + CHECK_NULL_ALLOWED(_PyPegen_seq_extract_starred_exprs(p, a)), + CHECK_NULL_ALLOWED(_PyPegen_seq_delete_starred_exprs(p, a)), + EXTRA) } +kwargs[asdl_seq*]: + | a=','.kwarg_or_starred+ ',' b=','.kwarg_or_double_starred+ { _PyPegen_join_sequences(p, a, b) } + | ','.kwarg_or_starred+ + | ','.kwarg_or_double_starred+ +starred_expression[expr_ty]: + | '*' a=expression { _Py_Starred(a, Load, EXTRA) } +kwarg_or_starred[KeywordOrStarred*]: + | a=NAME '=' b=expression { + _PyPegen_keyword_or_starred(p, CHECK(_Py_keyword(a->v.Name.id, b, EXTRA)), 1) } + | a=starred_expression { _PyPegen_keyword_or_starred(p, a, 0) } + | invalid_kwarg +kwarg_or_double_starred[KeywordOrStarred*]: + | a=NAME '=' b=expression { + _PyPegen_keyword_or_starred(p, CHECK(_Py_keyword(a->v.Name.id, b, EXTRA)), 1) } + | '**' a=expression { _PyPegen_keyword_or_starred(p, CHECK(_Py_keyword(NULL, a, EXTRA)), 1) } + | invalid_kwarg + +# NOTE: star_targets may contain *bitwise_or, targets may not. 
+star_targets[expr_ty]: + | a=star_target !',' { a } + | a=star_target b=(',' c=star_target { c })* [','] { + _Py_Tuple(CHECK(_PyPegen_seq_insert_in_front(p, a, b)), Store, EXTRA) } +star_targets_list_seq[asdl_seq*]: a=','.star_target+ [','] { a } +star_targets_tuple_seq[asdl_seq*]: + | a=star_target b=(',' c=star_target { c })+ [','] { _PyPegen_seq_insert_in_front(p, a, b) } + | a=star_target ',' { _PyPegen_singleton_seq(p, a) } +star_target[expr_ty] (memo): + | '*' a=(!'*' star_target) { + _Py_Starred(CHECK(_PyPegen_set_expr_context(p, a, Store)), Store, EXTRA) } + | target_with_star_atom +target_with_star_atom[expr_ty] (memo): + | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Store, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } + | star_atom +star_atom[expr_ty]: + | a=NAME { _PyPegen_set_expr_context(p, a, Store) } + | '(' a=target_with_star_atom ')' { _PyPegen_set_expr_context(p, a, Store) } + | '(' a=[star_targets_tuple_seq] ')' { _Py_Tuple(a, Store, EXTRA) } + | '[' a=[star_targets_list_seq] ']' { _Py_List(a, Store, EXTRA) } + +single_target[expr_ty]: + | single_subscript_attribute_target + | a=NAME { _PyPegen_set_expr_context(p, a, Store) } + | '(' a=single_target ')' { a } +single_subscript_attribute_target[expr_ty]: + | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Store, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } + +del_targets[asdl_seq*]: a=','.del_target+ [','] { a } +del_target[expr_ty] (memo): + | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Del, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Del, EXTRA) } + | del_t_atom +del_t_atom[expr_ty]: + | a=NAME { _PyPegen_set_expr_context(p, a, Del) } + | '(' a=del_target ')' { _PyPegen_set_expr_context(p, a, Del) } + | '(' a=[del_targets] ')' { _Py_Tuple(a, Del, EXTRA) } + | '[' a=[del_targets] ']' { _Py_List(a, Del, EXTRA) } + +targets[asdl_seq*]: a=','.target+ [','] { a } +target[expr_ty] (memo): + | a=t_primary '.' b=NAME !t_lookahead { _Py_Attribute(a, b->v.Name.id, Store, EXTRA) } + | a=t_primary '[' b=slices ']' !t_lookahead { _Py_Subscript(a, b, Store, EXTRA) } + | t_atom +t_primary[expr_ty]: + | a=t_primary '.' b=NAME &t_lookahead { _Py_Attribute(a, b->v.Name.id, Load, EXTRA) } + | a=t_primary '[' b=slices ']' &t_lookahead { _Py_Subscript(a, b, Load, EXTRA) } + | a=t_primary b=genexp &t_lookahead { _Py_Call(a, CHECK(_PyPegen_singleton_seq(p, b)), NULL, EXTRA) } + | a=t_primary '(' b=[arguments] ')' &t_lookahead { + _Py_Call(a, + (b) ? ((expr_ty) b)->v.Call.args : NULL, + (b) ? ((expr_ty) b)->v.Call.keywords : NULL, + EXTRA) } + | a=atom &t_lookahead { a } +t_lookahead: '(' | '[' | '.' 
+t_atom[expr_ty]: + | a=NAME { _PyPegen_set_expr_context(p, a, Store) } + | '(' a=target ')' { _PyPegen_set_expr_context(p, a, Store) } + | '(' b=[targets] ')' { _Py_Tuple(b, Store, EXTRA) } + | '[' b=[targets] ']' { _Py_List(b, Store, EXTRA) } + + +# From here on, there are rules for invalid syntax with specialised error messages +invalid_arguments: + | args ',' '*' { RAISE_SYNTAX_ERROR("iterable argument unpacking follows keyword argument unpacking") } + | a=expression for_if_clauses ',' [args | expression for_if_clauses] { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "Generator expression must be parenthesized") } + | a=args for_if_clauses { _PyPegen_nonparen_genexp_in_call(p, a) } + | args ',' a=expression for_if_clauses { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "Generator expression must be parenthesized") } + | a=args ',' args { _PyPegen_arguments_parsing_error(p, a) } +invalid_kwarg: + | !(NAME '=') a=expression b='=' { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + a, "expression cannot contain assignment, perhaps you meant \"==\"?") } +invalid_named_expression: + | a=expression ':=' expression { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + a, "cannot use assignment expressions with %s", _PyPegen_get_expr_name(a)) } +invalid_assignment: + | a=invalid_ann_assign_target ':' expression { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + a, + "only single target (not %s) can be annotated", + _PyPegen_get_expr_name(a) + )} + | a=star_named_expression ',' star_named_expressions* ':' expression { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "only single target (not tuple) can be annotated") } + | a=expression ':' expression { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "illegal target for annotation") } + | (star_targets '=')* a=star_expressions '=' { + RAISE_SYNTAX_ERROR_INVALID_TARGET(STAR_TARGETS, a) } + | (star_targets '=')* a=yield_expr '=' { RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "assignment to yield expression not possible") } + | a=star_expressions augassign (yield_expr | star_expressions) { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION( + a, + "'%s' is an illegal expression for augmented assignment", + _PyPegen_get_expr_name(a) + )} +invalid_ann_assign_target[expr_ty]: + | list + | tuple + | '(' a=invalid_ann_assign_target ')' { a } +invalid_del_stmt: + | 'del' a=star_expressions { + RAISE_SYNTAX_ERROR_INVALID_TARGET(DEL_TARGETS, a) } +invalid_block: + | NEWLINE !INDENT { RAISE_INDENTATION_ERROR("expected an indented block") } +invalid_primary: + | primary a='{' { RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "invalid syntax") } +invalid_comprehension: + | ('[' | '(' | '{') a=starred_expression for_if_clauses { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "iterable unpacking cannot be used in comprehension") } +invalid_dict_comprehension: + | '{' a='**' bitwise_or for_if_clauses '}' { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "dict unpacking cannot be used in dict comprehension") } +invalid_parameters: + | param_no_default* (slash_with_default | param_with_default+) param_no_default { + RAISE_SYNTAX_ERROR("non-default argument follows default argument") } +invalid_lambda_parameters: + | lambda_param_no_default* (lambda_slash_with_default | lambda_param_with_default+) lambda_param_no_default { + RAISE_SYNTAX_ERROR("non-default argument follows default argument") } +invalid_star_etc: + | '*' (')' | ',' (')' | '**')) { RAISE_SYNTAX_ERROR("named arguments must follow bare *") } + | '*' ',' TYPE_COMMENT { RAISE_SYNTAX_ERROR("bare * has associated type comment") } +invalid_lambda_star_etc: + | '*' (':' | ',' (':' | '**')) { RAISE_SYNTAX_ERROR("named 
arguments must follow bare *") } +invalid_double_type_comments: + | TYPE_COMMENT NEWLINE TYPE_COMMENT NEWLINE INDENT { + RAISE_SYNTAX_ERROR("Cannot have two type comments on def") } +invalid_with_item: + | expression 'as' a=expression { + RAISE_SYNTAX_ERROR_INVALID_TARGET(STAR_TARGETS, a) } + +invalid_for_target: + | ASYNC? 'for' a=star_expressions { + RAISE_SYNTAX_ERROR_INVALID_TARGET(FOR_TARGETS, a) } + +invalid_group: + | '(' a=starred_expression ')' { + RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, "can't use starred expression here") } +invalid_import_from_targets: + | import_from_as_names ',' { + RAISE_SYNTAX_ERROR("trailing comma not allowed without surrounding parentheses") } \ No newline at end of file diff --git a/native/libcst/README.md b/native/libcst/README.md new file mode 100644 index 000000000..f33563b2e --- /dev/null +++ b/native/libcst/README.md @@ -0,0 +1,66 @@ +# libcst_native + +A very experimental native extension to speed up LibCST. This does not currently provide +much performance benefit and is therefore not recommended for general use. + +The extension is written in Rust using [PyO3](https://pyo3.rs/). + +This installs as a separate python package that LibCST looks for and will import if it's +available. + + +## Using with LibCST + +[Set up a Rust development environment](https://www.rust-lang.org/tools/install). Using +`rustup` is recommended, but not necessary. Rust 1.45.0+ should work. + +Follow the instructions for setting up a virtualenv in the top-level README, then: + +``` +cd libcst_native +maturin develop # install libcst_native to the virtualenv +cd .. # cd back into the main project +python -m unittest +``` + +This will run the python test suite. Nothing special is required to use `libcst_native`, +since `libcst` will automatically use the native extension when it's installed. + +When benchmarking this code, make sure to run `maturin develop` with the `--release` +flag to enable compiler optimizations. + +You can disable the native extension by uninstalling the package from your virtualenv: + +``` +pip uninstall libcst_native +``` + + +## Rust Tests + +In addition to running the python test suite, you can run some tests written in Rust +with + +``` +cargo test --no-default-features +``` + +The `--no-default-features` flag is needed to work around an incompatibility between tests +and pyo3's `extension-module` feature. + + +## Code Formatting + +Use `cargo fmt` to format your code. + + +## Release + +This isn't currently supported, so there are no releases available, but the end goal would +be to publish this on PyPI. + +Because this is a native extension, it must be re-built for each platform/architecture. +The per-platform build could be automated using a CI system, [like GitHub +Actions][gh-actions].
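Editorial aside: the README's "Using with LibCST" flow can also be sanity-checked from Rust. A minimal parse/codegen round-trip sketch, mirroring the `bin.rs` added below (hedged; assumes the `libcst_native` crate API introduced later in this diff):

```
use libcst_native::{parse_module, Codegen};

fn main() {
    let src = "x = (1, 2)\n";
    // parse_module tokenizes, parses, and inflates whitespace in one call.
    let module = parse_module(src, None).expect("parse failed");
    // Same pattern as bin.rs: a Default CodegenState accumulates tokens.
    let mut state = Default::default();
    module.codegen(&mut state);
    // Codegen aims to be lossless: the regenerated source matches the input.
    assert_eq!(state.to_string(), src);
}
```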
+ +[gh-actions]: https://github.com/PyO3/maturin/blob/master/.github/workflows/release.yml diff --git a/native/libcst/benches/parser_benchmark.rs b/native/libcst/benches/parser_benchmark.rs new file mode 100644 index 000000000..92f518839 --- /dev/null +++ b/native/libcst/benches/parser_benchmark.rs @@ -0,0 +1,105 @@ +use std::{ + path::{Component, PathBuf}, + time::Duration, +}; + +use criterion::{ + black_box, criterion_group, criterion_main, measurement::Measurement, BatchSize, Criterion, +}; +use criterion_cycles_per_byte::CyclesPerByte; +use itertools::Itertools; +use libcst_native::{ + parse_module, parse_tokens_without_whitespace, tokenize, Codegen, Config, Inflate, +}; + +fn load_all_fixtures() -> String { + let mut path = PathBuf::from(file!()); + path.pop(); + path.pop(); + path = path + .components() + .skip(1) + .chain( + vec!["tests".as_ref(), "fixtures".as_ref()] + .into_iter() + .map(Component::Normal), + ) + .collect(); + + path.read_dir() + .expect("read_dir") + .into_iter() + .map(|file| { + let path = file.unwrap().path(); + std::fs::read_to_string(&path).expect("reading_file") + }) + .join("\n") +} + +pub fn inflate_benchmarks(c: &mut Criterion) { + let fixture = load_all_fixtures(); + let tokens = tokenize(fixture.as_str()).expect("tokenize failed"); + let mut group = c.benchmark_group("inflate"); + group.bench_function("all", |b| { + b.iter_batched( + || { + let conf = Config::new(fixture.as_str(), &tokens); + let m = parse_tokens_without_whitespace(tokens.clone(), fixture.as_str(), None) + .expect("parse failed"); + (conf, m) + }, + |(conf, m)| black_box(m.inflate(&conf)), + BatchSize::SmallInput, + ) + }); + group.finish(); +} + +pub fn parser_benchmarks(c: &mut Criterion) { + let fixture = load_all_fixtures(); + let mut group = c.benchmark_group("parse"); + group.measurement_time(Duration::from_secs(15)); + group.bench_function("all", |b| { + b.iter_batched( + || tokenize(fixture.as_str()).expect("tokenize failed"), + |tokens| { + black_box(parse_tokens_without_whitespace( + tokens, + fixture.as_str(), + None, + )) + }, + BatchSize::SmallInput, + ) + }); + group.finish(); +} + +pub fn codegen_benchmarks(c: &mut Criterion) { + let input = load_all_fixtures(); + let m = parse_module(&input, None).expect("parse failed"); + let mut group = c.benchmark_group("codegen"); + group.bench_function("all", |b| { + b.iter(|| { + let mut state = Default::default(); + #[allow(clippy::unit_arg)] + black_box(m.codegen(&mut state)); + }) + }); + group.finish(); +} + +pub fn tokenize_benchmarks(c: &mut Criterion) { + let input = load_all_fixtures(); + let mut group = c.benchmark_group("tokenize"); + group.measurement_time(Duration::from_secs(15)); + group.bench_function("all", |b| b.iter(|| black_box(tokenize(input.as_str())))); + group.finish(); +} + +criterion_group!( + name=benches; + config = Criterion::default().with_measurement(CyclesPerByte); + targets=parser_benchmarks, codegen_benchmarks, inflate_benchmarks, tokenize_benchmarks +); +criterion_main!(benches); diff --git a/native/libcst/src/bin.rs b/native/libcst/src/bin.rs new file mode 100644 index 000000000..234b90178 --- /dev/null +++ b/native/libcst/src/bin.rs @@ -0,0 +1,28 @@ +use libcst_native::*; +use std::{ + env, + io::{self, Read}, + process::exit, +}; + +pub fn main() { + let mut str = std::string::String::new(); + io::stdin().read_to_string(&mut str).unwrap(); + match parse_module(str.as_ref(), None) { + Err(e) => { + eprintln!("{}", prettify_error(e, "stdin")); + exit(1); + } + Ok(m) => { + let first_arg = 
env::args().nth(1).unwrap_or_else(|| "".to_string()); + if first_arg == "-d" { + println!("{:#?}", m); + } + if first_arg != "-n" { + let mut state = Default::default(); + m.codegen(&mut state); + print!("{}", state.to_string()); + } + } + }; +} diff --git a/native/libcst/src/lib.rs b/native/libcst/src/lib.rs new file mode 100644 index 000000000..6c809768f --- /dev/null +++ b/native/libcst/src/lib.rs @@ -0,0 +1,167 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +use std::cmp::{max, min}; + +mod tokenizer; + +pub use tokenizer::whitespace_parser::Config; +use tokenizer::{whitespace_parser, TokConfig, Token, TokenIterator}; + +mod nodes; +pub use nodes::*; + +mod parser; +use parser::{ParserError, Result}; + +pub mod py; + +pub fn tokenize(text: &str) -> Result<Vec<Token>> { + let iter = TokenIterator::new( + text, + &TokConfig { + async_hacks: false, + split_fstring: true, + }, + ); + + iter.collect::<std::result::Result<Vec<_>, _>>() + .map_err(|err| ParserError::TokenizerError(err, text)) +} + +pub fn parse_tokens_without_whitespace<'a>( + tokens: Vec<Token<'a>>, + module_text: &'a str, + encoding: Option<&str>, +) -> Result<'a, Module<'a>> { + parser::python::file(&tokens.into(), module_text, encoding) + .map_err(|err| ParserError::ParserError(err, module_text)) +} + +pub fn parse_module<'a>( + mut module_text: &'a str, + encoding: Option<&str>, +) -> Result<'a, Module<'a>> { + // Strip UTF-8 BOM + if let Some(stripped) = module_text.strip_prefix('\u{feff}') { + module_text = stripped; + } + let tokens = tokenize(module_text)?; + let conf = whitespace_parser::Config::new(module_text, &tokens); + let m = parse_tokens_without_whitespace(tokens, module_text, encoding)?; + Ok(m.inflate(&conf)?) +} + +pub fn parse_statement(text: &str) -> Result<Statement> { + let tokens = tokenize(text)?; + let conf = whitespace_parser::Config::new(text, &tokens); + let stm = parser::python::statement_input(&tokens.into(), text) + .map_err(|err| ParserError::ParserError(err, text))?; + Ok(stm.inflate(&conf)?) +} + +pub fn parse_expression(text: &str) -> Result<Expression> { + let tokens = tokenize(text)?; + let conf = whitespace_parser::Config::new(text, &tokens); + let expr = parser::python::expression_input(&tokens.into(), text) + .map_err(|err| ParserError::ParserError(err, text))?; + Ok(expr.inflate(&conf)?)
+} + +// n starts from 1 +fn bol_offset(source: &str, n: i32) -> usize { + if n <= 1 { + return 0; + } + source + .match_indices('\n') + .nth((n - 2) as usize) + .map(|(index, _)| index + 1) + .unwrap_or_else(|| source.len()) +} + +pub fn prettify_error(err: ParserError, label: &str) -> std::string::String { + match err { + ParserError::ParserError(e, module_text) => { + let loc = e.location; + let context = 1; + let start_offset = bol_offset(module_text, loc.start_pos.line as i32 - context); + let end_offset = bol_offset(module_text, loc.end_pos.line as i32 + context + 1); + let source = &module_text[start_offset..end_offset]; + let start = loc.start_pos.offset - start_offset; + let end = loc.end_pos.offset - start_offset; + chic::Error::new(label) + .error( + max( + 1, + loc.start_pos + .line + .checked_sub(context as usize) + .unwrap_or(1), + ), + start, + if start == end { + min(end + 1, end_offset - start_offset + 1) + } else { + end + }, + source, + format!( + "expected {} {} -> {}", + e.expected, loc.start_pos, loc.end_pos + ), + ) + .to_string() + } + e => format!("Parse error for {}: {}", label, e), + } +} + +#[cfg(test)] +mod test { + use super::*; + use tokenizer::TokError; + + #[test] + fn test_simple() { + let n = parse_module("1_", None); + assert_eq!( + n.err().unwrap(), + ParserError::TokenizerError(TokError::BadDecimal, "1_") + ); + } + + #[test] + fn test_bare_minimum_funcdef() { + parse_module("def f(): ...", None).expect("parse error"); + } + + #[test] + fn test_funcdef_params() { + parse_module("def g(a, b): ...", None).expect("parse error"); + } + + #[test] + fn bol_offset_first_line() { + assert_eq!(0, bol_offset("hello", 1)); + assert_eq!(0, bol_offset("hello", 0)); + assert_eq!(0, bol_offset("hello\nhello", 1)); + assert_eq!(0, bol_offset("hello\nhello", 0)); + } + + #[test] + fn bol_offset_second_line() { + assert_eq!(5, bol_offset("hello", 2)); + assert_eq!(6, bol_offset("hello\nhello", 2)); + assert_eq!(6, bol_offset("hello\nhello\nhello", 2)); + } + + #[test] + fn bol_offset_last_line() { + assert_eq!(5, bol_offset("hello", 3)); + assert_eq!(11, bol_offset("hello\nhello", 3)); + assert_eq!(12, bol_offset("hello\nhello\nhello", 3)); + } +} diff --git a/native/libcst/src/nodes/codegen.rs b/native/libcst/src/nodes/codegen.rs new file mode 100644 index 000000000..3b4f3e7a7 --- /dev/null +++ b/native/libcst/src/nodes/codegen.rs @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+ +use std::fmt; +#[derive(Debug)] +pub struct CodegenState<'a> { + pub tokens: String, + pub indent_tokens: Vec<&'a str>, + pub default_newline: &'a str, + pub default_indent: &'a str, +} + +impl<'a> CodegenState<'a> { + pub fn indent(&mut self, v: &'a str) { + self.indent_tokens.push(v); + } + pub fn dedent(&mut self) { + self.indent_tokens.pop(); + } + pub fn add_indent(&mut self) { + self.tokens.extend(self.indent_tokens.iter().cloned()); + } + pub fn add_token(&mut self, tok: &'a str) { + self.tokens.push_str(tok); + } +} + +impl<'a> fmt::Display for CodegenState<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.tokens) + } +} + +pub trait Codegen<'a> { + fn codegen(&self, state: &mut CodegenState<'a>); +} + +impl<'a, T> Codegen<'a> for Option +where + T: Codegen<'a>, +{ + fn codegen(&self, state: &mut CodegenState<'a>) { + if let Some(s) = &self { + s.codegen(state); + } + } +} + +#[cfg(windows)] +const LINE_ENDING: &str = "\r\n"; +#[cfg(not(windows))] +const LINE_ENDING: &str = "\n"; + +impl<'a> Default for CodegenState<'a> { + fn default() -> Self { + Self { + default_newline: LINE_ENDING, + default_indent: " ", + indent_tokens: Default::default(), + tokens: Default::default(), + } + } +} diff --git a/native/libcst/src/nodes/expression.rs b/native/libcst/src/nodes/expression.rs new file mode 100644 index 000000000..cb70d5722 --- /dev/null +++ b/native/libcst/src/nodes/expression.rs @@ -0,0 +1,2234 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +use std::{mem::swap, rc::Rc}; + +use crate::{ + inflate_helpers::adjust_parameters_trailing_whitespace, + nodes::{ + traits::{Inflate, ParenthesizedNode, Result, WithComma}, + whitespace::ParenthesizableWhitespace, + Annotation, AssignEqual, AssignTargetExpression, BinaryOp, BooleanOp, Codegen, + CodegenState, Colon, Comma, CompOp, Dot, UnaryOp, + }, + tokenizer::{ + whitespace_parser::{parse_parenthesizable_whitespace, Config}, + Token, + }, +}; +use libcst_derive::{Codegen, Inflate, IntoPy, ParenthesizedNode}; +use pyo3::{types::PyModule, IntoPy}; + +type TokenRef<'a> = Rc>; + +#[derive(Debug, Eq, PartialEq, Default, Clone, IntoPy)] +pub struct Parameters<'a> { + pub params: Vec>, + pub star_arg: Option>, + pub kwonly_params: Vec>, + pub star_kwarg: Option>, + pub posonly_params: Vec>, + pub posonly_ind: Option>, +} + +impl<'a> Parameters<'a> { + pub fn is_empty(&self) -> bool { + self.params.is_empty() + && self.star_arg.is_none() + && self.kwonly_params.is_empty() + && self.star_kwarg.is_none() + && self.posonly_params.is_empty() + && self.posonly_ind.is_none() + } +} + +impl<'a> Inflate<'a> for Parameters<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.posonly_params = self.posonly_params.inflate(config)?; + self.posonly_ind = self.posonly_ind.inflate(config)?; + self.params = self.params.inflate(config)?; + self.star_arg = self.star_arg.inflate(config)?; + self.kwonly_params = self.kwonly_params.inflate(config)?; + self.star_kwarg = self.star_kwarg.inflate(config)?; + Ok(self) + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone, Inflate, IntoPy)] +pub enum StarArg<'a> { + Star(ParamStar<'a>), + Param(Box>), +} + +impl<'a> Codegen<'a> for Parameters<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + let params_after_kwonly = self.star_kwarg.is_some(); + let params_after_regular = 
!self.kwonly_params.is_empty() || params_after_kwonly; + let params_after_posonly = !self.params.is_empty() || params_after_regular; + let star_included = self.star_arg.is_some() || !self.kwonly_params.is_empty(); + + for p in &self.posonly_params { + p.codegen(state, None, true); + } + + match &self.posonly_ind { + Some(ind) => ind.codegen(state, params_after_posonly), + _ => { + if !self.posonly_params.is_empty() { + if params_after_posonly { + state.add_token("/, "); + } else { + state.add_token("/"); + } + } + } + } + + let param_size = self.params.len(); + for (i, p) in self.params.iter().enumerate() { + p.codegen(state, None, params_after_regular || i < param_size - 1); + } + + let kwonly_size = self.kwonly_params.len(); + match &self.star_arg { + None => { + if star_included { + state.add_token("*, ") + } + } + Some(StarArg::Param(p)) => p.codegen( + state, + Some("*"), + kwonly_size > 0 || self.star_kwarg.is_some(), + ), + Some(StarArg::Star(s)) => s.codegen(state), + } + + for (i, p) in self.kwonly_params.iter().enumerate() { + p.codegen(state, None, params_after_kwonly || i < kwonly_size - 1); + } + + if let Some(star) = &self.star_kwarg { + star.codegen(state, Some("**"), false) + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ParamSlash<'a> { + pub comma: Option>, +} + +impl<'a> ParamSlash<'a> { + fn codegen(&self, state: &mut CodegenState<'a>, default_comma: bool) { + state.add_token("/"); + match (&self.comma, default_comma) { + (Some(comma), _) => comma.codegen(state), + (None, true) => state.add_token(", "), + _ => {} + } + } +} + +impl<'a> Inflate<'a> for ParamSlash<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.comma = self.comma.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ParamStar<'a> { + pub comma: Comma<'a>, +} + +impl<'a> Codegen<'a> for ParamStar<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("*"); + self.comma.codegen(state); + } +} + +impl<'a> Inflate<'a> for ParamStar<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.comma = self.comma.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, Eq, PartialEq, Default, Clone, ParenthesizedNode, IntoPy)] +pub struct Name<'a> { + pub value: &'a str, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for Name<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Name<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + state.add_token(self.value); + }); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Param<'a> { + pub name: Name<'a>, + pub annotation: Option>, + pub equal: Option>, + pub default: Option>, + + pub comma: Option>, + + pub star: Option<&'a str>, + + pub whitespace_after_star: ParenthesizableWhitespace<'a>, + pub whitespace_after_param: ParenthesizableWhitespace<'a>, + + pub(crate) star_tok: Option>, +} + +impl<'a> Inflate<'a> for Param<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + // TODO: whitespace_after_param missing? 
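// (Editorial note, hedged: `whitespace_after_param` appears to be left
// un-inflated on purpose; for lambdas, the trailing whitespace of the last
// parameter is attached afterwards by `adjust_parameters_trailing_whitespace`,
// which `Lambda::inflate` calls later in this file.)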
+ self.name = self.name.inflate(config)?; + self.annotation = self.annotation.inflate(config)?; + self.equal = self.equal.inflate(config)?; + self.default = self.default.inflate(config)?; + self.comma = self.comma.inflate(config)?; + if let Some(star_tok) = self.star_tok.as_mut() { + self.whitespace_after_star = parse_parenthesizable_whitespace( + config, + &mut star_tok.whitespace_after.borrow_mut(), + )?; + } + Ok(self) + } +} + +impl<'a> Default for Param<'a> { + fn default() -> Self { + Self { + name: Default::default(), + annotation: None, + equal: None, + default: None, + comma: None, + star: Some(""), // Note: this preserves a quirk of the pure python parser + whitespace_after_param: Default::default(), + whitespace_after_star: Default::default(), + star_tok: None, + } + } +} + +impl<'a> Param<'a> { + fn codegen( + &self, + state: &mut CodegenState<'a>, + default_star: Option<&'a str>, + default_comma: bool, + ) { + match (self.star, default_star) { + (Some(star), _) => state.add_token(star), + (None, Some(star)) => state.add_token(star), + _ => {} + } + self.whitespace_after_star.codegen(state); + self.name.codegen(state); + + if let Some(ann) = &self.annotation { + ann.codegen(state, ":"); + } + + match (&self.equal, &self.default) { + (Some(equal), Some(def)) => { + equal.codegen(state); + def.codegen(state); + } + (None, Some(def)) => { + state.add_token(" = "); + def.codegen(state); + } + _ => {} + } + + match &self.comma { + Some(comma) => comma.codegen(state), + None if default_comma => state.add_token(", "), + _ => {} + } + + self.whitespace_after_param.codegen(state); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Arg<'a> { + pub value: Expression<'a>, + pub keyword: Option>, + pub equal: Option>, + pub comma: Option>, + pub star: &'a str, + pub whitespace_after_star: ParenthesizableWhitespace<'a>, + pub whitespace_after_arg: ParenthesizableWhitespace<'a>, + + pub(crate) star_tok: Option>, +} + +impl<'a> Inflate<'a> for Arg<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + if let Some(star_tok) = self.star_tok.as_mut() { + self.whitespace_after_star = parse_parenthesizable_whitespace( + config, + &mut star_tok.whitespace_after.borrow_mut(), + )?; + } + self.keyword = self.keyword.inflate(config)?; + self.equal = self.equal.inflate(config)?; + self.value = self.value.inflate(config)?; + self.comma = self.comma.inflate(config)?; + // whitespace_after_arg is handled in Call + Ok(self) + } +} + +impl<'a> Arg<'a> { + pub fn codegen(&self, state: &mut CodegenState<'a>, default_comma: bool) { + state.add_token(self.star); + self.whitespace_after_star.codegen(state); + if let Some(kw) = &self.keyword { + kw.codegen(state); + } + if let Some(eq) = &self.equal { + eq.codegen(state); + } else if self.keyword.is_some() { + state.add_token(" = "); + } + self.value.codegen(state); + + if let Some(comma) = &self.comma { + comma.codegen(state); + } else if default_comma { + state.add_token(", "); + } + + self.whitespace_after_arg.codegen(state); + } +} + +impl<'a> WithComma<'a> for Arg<'a> { + fn with_comma(self, c: Comma<'a>) -> Self { + Self { + comma: Some(c), + ..self + } + } +} + +#[derive(Debug, Eq, PartialEq, Clone, IntoPy)] +pub struct LeftParen<'a> { + /// Any space that appears directly after this left parenthesis. 
+ pub whitespace_after: ParenthesizableWhitespace<'a>, + + pub(crate) lpar_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for LeftParen<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("("); + self.whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for LeftParen<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result<Self> { + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.lpar_tok).whitespace_after.borrow_mut(), + )?; + Ok(self) + } +} + +#[derive(Debug, Eq, PartialEq, Clone, IntoPy)] +pub struct RightParen<'a> { + /// Any space that appears directly before this right parenthesis. + pub whitespace_before: ParenthesizableWhitespace<'a>, + + pub(crate) rpar_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for RightParen<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token(")"); + } +} + +impl<'a> Inflate<'a> for RightParen<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result<Self> { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.rpar_tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, Eq, PartialEq, Clone, ParenthesizedNode, Codegen, Inflate, IntoPy)] +pub enum Expression<'a> { + Name(Name<'a>), + Ellipsis(Ellipsis<'a>), + Integer(Integer<'a>), + Float(Float<'a>), + Imaginary(Imaginary<'a>), + Comparison(Comparison<'a>), + UnaryOperation(UnaryOperation<'a>), + BinaryOperation(BinaryOperation<'a>), + BooleanOperation(BooleanOperation<'a>), + Attribute(Attribute<'a>), + Tuple(Tuple<'a>), + Call(Call<'a>), + GeneratorExp(GeneratorExp<'a>), + ListComp(ListComp<'a>), + SetComp(SetComp<'a>), + DictComp(DictComp<'a>), + List(List<'a>), + Set(Set<'a>), + Dict(Dict<'a>), + Subscript(Subscript<'a>), + StarredElement(StarredElement<'a>), + IfExp(IfExp<'a>), + Lambda(Lambda<'a>), + Yield(Yield<'a>), + Await(Await<'a>), + SimpleString(SimpleString<'a>), + ConcatenatedString(ConcatenatedString<'a>), + FormattedString(FormattedString<'a>), + NamedExpr(NamedExpr<'a>), +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Ellipsis<'a> { + pub lpar: Vec<LeftParen<'a>>, + pub rpar: Vec<RightParen<'a>>, +} + +impl<'a> Codegen<'a> for Ellipsis<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + state.add_token("..."); + }) + } +} +impl<'a> Inflate<'a> for Ellipsis<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result<Self> { + self.lpar = self.lpar.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Integer<'a> { + /// A string representation of the integer, such as ``"100000"`` or + /// ``"100_000"``. + pub value: &'a str, + pub lpar: Vec<LeftParen<'a>>, + pub rpar: Vec<RightParen<'a>>, +} + +impl<'a> Codegen<'a> for Integer<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + state.add_token(self.value); + }) + } +} + +impl<'a> Inflate<'a> for Integer<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result<Self> { + self.lpar = self.lpar.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Float<'a> { + /// A string representation of the floating point number, such as ``"0.05"``, + /// ``".050"``, or ``"5e-2"``.
+ pub value: &'a str, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for Float<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + state.add_token(self.value); + }) + } +} + +impl<'a> Inflate<'a> for Float<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Imaginary<'a> { + /// A string representation of the complex number, such as ``"2j"`` + pub value: &'a str, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for Imaginary<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + state.add_token(self.value); + }) + } +} + +impl<'a> Inflate<'a> for Imaginary<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Comparison<'a> { + pub left: Box>, + pub comparisons: Vec>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for Comparison<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.left.codegen(state); + for comp in &self.comparisons { + comp.codegen(state); + } + }) + } +} +impl<'a> Inflate<'a> for Comparison<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.left = self.left.inflate(config)?; + self.comparisons = self.comparisons.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct UnaryOperation<'a> { + pub operator: UnaryOp<'a>, + pub expression: Box>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for UnaryOperation<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.operator.codegen(state); + self.expression.codegen(state); + }) + } +} + +impl<'a> Inflate<'a> for UnaryOperation<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.operator = self.operator.inflate(config)?; + self.expression = self.expression.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct BinaryOperation<'a> { + pub left: Box>, + pub operator: BinaryOp<'a>, + pub right: Box>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for BinaryOperation<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.left.codegen(state); + self.operator.codegen(state); + self.right.codegen(state); + }) + } +} + +impl<'a> Inflate<'a> for BinaryOperation<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.left = self.left.inflate(config)?; + self.operator = self.operator.inflate(config)?; + self.right = self.right.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct BooleanOperation<'a> { + pub left: Box>, + pub operator: BooleanOp<'a>, + pub right: Box>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for BooleanOperation<'a> { + fn codegen(&self, state: 
&mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.left.codegen(state); + self.operator.codegen(state); + self.right.codegen(state); + }) + } +} + +impl<'a> Inflate<'a> for BooleanOperation<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.left = self.left.inflate(config)?; + self.operator = self.operator.inflate(config)?; + self.right = self.right.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Call<'a> { + pub func: Box>, + pub args: Vec>, + pub lpar: Vec>, + pub rpar: Vec>, + pub whitespace_after_func: ParenthesizableWhitespace<'a>, + pub whitespace_before_args: ParenthesizableWhitespace<'a>, + + pub(crate) lpar_tok: TokenRef<'a>, + pub(crate) rpar_tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for Call<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.func = self.func.inflate(config)?; + self.whitespace_after_func = parse_parenthesizable_whitespace( + config, + &mut (*self.lpar_tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_before_args = parse_parenthesizable_whitespace( + config, + &mut (*self.lpar_tok).whitespace_after.borrow_mut(), + )?; + self.args = self.args.inflate(config)?; + + if let Some(arg) = self.args.last_mut() { + if arg.comma.is_none() { + arg.whitespace_after_arg = parse_parenthesizable_whitespace( + config, + &mut (*self.rpar_tok).whitespace_before.borrow_mut(), + )?; + } + } + self.rpar = self.rpar.inflate(config)?; + + Ok(self) + } +} + +impl<'a> Codegen<'a> for Call<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.func.codegen(state); + self.whitespace_after_func.codegen(state); + state.add_token("("); + self.whitespace_before_args.codegen(state); + let arg_len = self.args.len(); + for (i, arg) in self.args.iter().enumerate() { + arg.codegen(state, i + 1 < arg_len); + } + state.add_token(")"); + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Attribute<'a> { + pub value: Box>, + pub attr: Name<'a>, + pub dot: Dot<'a>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for Attribute<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.value = self.value.inflate(config)?; + self.dot = self.dot.inflate(config)?; + self.attr = self.attr.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Attribute<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.value.codegen(state); + self.dot.codegen(state); + self.attr.codegen(state); + }) + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone, Codegen, Inflate, IntoPy)] +pub enum NameOrAttribute<'a> { + N(Name<'a>), + A(Attribute<'a>), +} + +impl<'a> std::convert::From> for Expression<'a> { + fn from(x: NameOrAttribute<'a>) -> Self { + match x { + NameOrAttribute::N(n) => Self::Name(n), + NameOrAttribute::A(a) => Self::Attribute(a), + } + } +} + +#[derive(Debug, Eq, PartialEq, Clone, IntoPy)] +pub struct ComparisonTarget<'a> { + pub operator: CompOp<'a>, + pub comparator: Expression<'a>, +} + +impl<'a> Codegen<'a> for ComparisonTarget<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.operator.codegen(state); + self.comparator.codegen(state); + } +} + +impl<'a> 
Inflate<'a> for ComparisonTarget<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.operator = self.operator.inflate(config)?; + self.comparator = self.comparator.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct StarredElement<'a> { + pub value: Box>, + pub comma: Option>, + pub lpar: Vec>, + pub rpar: Vec>, + pub whitespace_before_value: ParenthesizableWhitespace<'a>, + + pub(crate) star_tok: TokenRef<'a>, +} + +impl<'a> StarredElement<'a> { + pub fn inflate_element(mut self, config: &Config<'a>, is_last: bool) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.whitespace_before_value = parse_parenthesizable_whitespace( + config, + &mut (*self.star_tok).whitespace_after.borrow_mut(), + )?; + self.value = self.value.inflate(config)?; + self.comma = if is_last { + self.comma.map(|c| c.inflate_before(config)).transpose() + } else { + self.comma.inflate(config) + }?; + Ok(self) + } +} + +impl<'a> Inflate<'a> for StarredElement<'a> { + fn inflate(self, config: &Config<'a>) -> Result { + self.inflate_element(config, false) + } +} + +impl<'a> Codegen<'a> for StarredElement<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + state.add_token("*"); + self.whitespace_before_value.codegen(state); + self.value.codegen(state); + }); + if let Some(comma) = &self.comma { + comma.codegen(state); + } + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum Element<'a> { + Simple { + value: Expression<'a>, + comma: Option>, + }, + Starred(StarredElement<'a>), +} + +// TODO: this could be a derive helper attribute to override the python class name +impl<'a> IntoPy for Element<'a> { + fn into_py(self, py: pyo3::Python) -> pyo3::PyObject { + match self { + Self::Starred(s) => s.into_py(py), + Self::Simple { value, comma } => { + let libcst = PyModule::import(py, "libcst").expect("libcst cannot be imported"); + let kwargs = [ + Some(("value", value.into_py(py))), + comma.map(|x| ("comma", x.into_py(py))), + ] + .iter() + .filter(|x| x.is_some()) + .map(|x| x.as_ref().unwrap()) + .collect::>() + .into_py_dict(py); + libcst + .getattr("Element") + .expect("no Element found in libcst") + .call((), Some(kwargs)) + .expect("conversion failed") + .into() + } + } + } +} + +impl<'a> Element<'a> { + fn codegen( + &self, + state: &mut CodegenState<'a>, + default_comma: bool, + default_comma_whitespace: bool, + ) { + match self { + Self::Simple { value, comma } => { + value.codegen(state); + if let Some(comma) = comma { + comma.codegen(state) + } + } + Self::Starred(s) => s.codegen(state), + } + let maybe_comma = match self { + Self::Simple { comma, .. } => comma, + Self::Starred(s) => &s.comma, + }; + if maybe_comma.is_none() && default_comma { + state.add_token(if default_comma_whitespace { ", " } else { "," }); + } + } + + pub fn inflate_element(self, config: &Config<'a>, is_last: bool) -> Result { + Ok(match self { + Self::Starred(s) => Self::Starred(s.inflate_element(config, is_last)?), + Self::Simple { value, comma } => Self::Simple { + value: value.inflate(config)?, + comma: if is_last { + comma.map(|c| c.inflate_before(config)).transpose()? + } else { + comma.inflate(config)? + }, + }, + }) + } +} + +impl<'a> WithComma<'a> for Element<'a> { + fn with_comma(self, comma: Comma<'a>) -> Self { + let comma = Some(comma); + match self { + Self::Simple { value, .. 
} => Self::Simple { comma, value }, + Self::Starred(s) => Self::Starred(StarredElement { comma, ..s }), + } + } +} +impl<'a> std::convert::From> for Element<'a> { + fn from(e: Expression<'a>) -> Self { + match e { + Expression::StarredElement(e) => Element::Starred(e), + value => Element::Simple { value, comma: None }, + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Default, ParenthesizedNode, IntoPy)] +pub struct Tuple<'a> { + pub elements: Vec>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for Tuple<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result> { + self.lpar = self.lpar.inflate(config)?; + let len = self.elements.len(); + self.elements = self + .elements + .into_iter() + .enumerate() + .map(|(idx, el)| el.inflate_element(config, idx + 1 == len)) + .collect::>>()?; + if !self.elements.is_empty() { + // rpar only has whitespace if elements is non empty + self.rpar = self.rpar.inflate(config)?; + } + Ok(self) + } +} + +impl<'a> Codegen<'a> for Tuple<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + let len = self.elements.len(); + if len == 1 { + self.elements.first().unwrap().codegen(state, true, false); + } else { + for (idx, el) in self.elements.iter().enumerate() { + el.codegen(state, idx < len - 1, true); + } + } + }); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct GeneratorExp<'a> { + pub elt: Box>, + pub for_in: Box>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for GeneratorExp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.elt.codegen(state); + self.for_in.codegen(state); + }) + } +} + +impl<'a> Inflate<'a> for GeneratorExp<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.elt = self.elt.inflate(config)?; + self.for_in = self.for_in.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct ListComp<'a> { + pub elt: Box>, + pub for_in: Box>, + pub lbracket: LeftSquareBracket<'a>, + pub rbracket: RightSquareBracket<'a>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Codegen<'a> for ListComp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.lbracket.codegen(state); + self.elt.codegen(state); + self.for_in.codegen(state); + self.rbracket.codegen(state); + }) + } +} + +impl<'a> Inflate<'a> for ListComp<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.lbracket = self.lbracket.inflate(config)?; + self.elt = self.elt.inflate(config)?; + self.for_in = self.for_in.inflate(config)?; + self.rbracket = self.rbracket.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct LeftSquareBracket<'a> { + pub whitespace_after: ParenthesizableWhitespace<'a>, + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for LeftSquareBracket<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("["); + self.whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for LeftSquareBracket<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_after.borrow_mut(), + )?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, 
IntoPy)] +pub struct RightSquareBracket<'a> { + pub whitespace_before: ParenthesizableWhitespace<'a>, + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for RightSquareBracket<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token("]"); + } +} + +impl<'a> Inflate<'a> for RightSquareBracket<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct SetComp<'a> { + pub elt: Box>, + pub for_in: Box>, + pub lbrace: LeftCurlyBrace<'a>, + pub rbrace: RightCurlyBrace<'a>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for SetComp<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.lbrace = self.lbrace.inflate(config)?; + self.elt = self.elt.inflate(config)?; + self.for_in = self.for_in.inflate(config)?; + self.rbrace = self.rbrace.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for SetComp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.lbrace.codegen(state); + self.elt.codegen(state); + self.for_in.codegen(state); + self.rbrace.codegen(state); + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct DictComp<'a> { + pub key: Box>, + pub value: Box>, + pub for_in: Box>, + pub lbrace: LeftCurlyBrace<'a>, + pub rbrace: RightCurlyBrace<'a>, + pub lpar: Vec>, + pub rpar: Vec>, + pub whitespace_before_colon: ParenthesizableWhitespace<'a>, + pub whitespace_after_colon: ParenthesizableWhitespace<'a>, + + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for DictComp<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.lbrace = self.lbrace.inflate(config)?; + self.key = self.key.inflate(config)?; + self.whitespace_before_colon = parse_parenthesizable_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after_colon = parse_parenthesizable_whitespace( + config, + &mut (*self.colon_tok).whitespace_after.borrow_mut(), + )?; + self.value = self.value.inflate(config)?; + self.for_in = self.for_in.inflate(config)?; + self.rbrace = self.rbrace.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for DictComp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.lbrace.codegen(state); + self.key.codegen(state); + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.whitespace_after_colon.codegen(state); + self.value.codegen(state); + self.for_in.codegen(state); + self.rbrace.codegen(state); + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct LeftCurlyBrace<'a> { + pub whitespace_after: ParenthesizableWhitespace<'a>, + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for LeftCurlyBrace<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_after.borrow_mut(), + )?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for LeftCurlyBrace<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("{"); + 
self.whitespace_after.codegen(state); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct RightCurlyBrace<'a> { + pub whitespace_before: ParenthesizableWhitespace<'a>, + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for RightCurlyBrace<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for RightCurlyBrace<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token("}"); + } +} + +impl<'a> pyo3::conversion::IntoPy for Box> { + fn into_py(self, py: pyo3::Python) -> pyo3::PyObject { + (*self).into_py(py) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct CompFor<'a> { + pub target: AssignTargetExpression<'a>, + pub iter: Expression<'a>, + pub ifs: Vec>, + pub inner_for_in: Option>>, + pub asynchronous: Option>, + pub whitespace_before: ParenthesizableWhitespace<'a>, + pub whitespace_after_for: ParenthesizableWhitespace<'a>, + pub whitespace_before_in: ParenthesizableWhitespace<'a>, + pub whitespace_after_in: ParenthesizableWhitespace<'a>, + + pub(crate) async_tok: Option>, + pub(crate) for_tok: TokenRef<'a>, + pub(crate) in_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for CompFor<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + if let Some(asynchronous) = &self.asynchronous { + asynchronous.codegen(state); + } + state.add_token("for"); + self.whitespace_after_for.codegen(state); + self.target.codegen(state); + self.whitespace_before_in.codegen(state); + state.add_token("in"); + self.whitespace_after_in.codegen(state); + self.iter.codegen(state); + for if_ in &self.ifs { + if_.codegen(state); + } + if let Some(inner) = &self.inner_for_in { + inner.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for CompFor<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.for_tok).whitespace_before.borrow_mut(), + )?; + if let (Some(asy_tok), Some(asy)) = (self.async_tok.as_mut(), self.asynchronous.as_mut()) { + // If there is an async keyword, the start of the CompFor expression is + // considered to be this keyword, so whitespace_before needs to adjust but + // Asynchronous will own the whitespace before the for token. 
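// (Editorial note: concretely, the assignment below first parks the
// whitespace preceding the `async` token in `asy.whitespace_after`; the
// `swap` that follows moves it into `self.whitespace_before`, leaving
// `asy.whitespace_after` holding the whitespace that was originally parsed
// before the `for` token.)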
+ asy.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut asy_tok.whitespace_before.borrow_mut(), + )?; + swap(&mut asy.whitespace_after, &mut self.whitespace_before); + } + self.whitespace_after_for = parse_parenthesizable_whitespace( + config, + &mut (*self.for_tok).whitespace_after.borrow_mut(), + )?; + self.target = self.target.inflate(config)?; + self.whitespace_before_in = parse_parenthesizable_whitespace( + config, + &mut (*self.in_tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after_in = parse_parenthesizable_whitespace( + config, + &mut (*self.in_tok).whitespace_after.borrow_mut(), + )?; + self.iter = self.iter.inflate(config)?; + self.ifs = self.ifs.inflate(config)?; + self.inner_for_in = self.inner_for_in.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Asynchronous<'a> { + pub whitespace_after: ParenthesizableWhitespace<'a>, +} + +impl<'a> Codegen<'a> for Asynchronous<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("async"); + self.whitespace_after.codegen(state); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct CompIf<'a> { + pub test: Expression<'a>, + pub whitespace_before: ParenthesizableWhitespace<'a>, + pub whitespace_before_test: ParenthesizableWhitespace<'a>, + + pub(crate) if_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for CompIf<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token("if"); + self.whitespace_before_test.codegen(state); + self.test.codegen(state); + } +} + +impl<'a> Inflate<'a> for CompIf<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.if_tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_before_test = parse_parenthesizable_whitespace( + config, + &mut (*self.if_tok).whitespace_after.borrow_mut(), + )?; + self.test = self.test.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct List<'a> { + pub elements: Vec>, + pub lbracket: LeftSquareBracket<'a>, + pub rbracket: RightSquareBracket<'a>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for List<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.lbracket = self.lbracket.inflate(config)?; + let len = self.elements.len(); + self.elements = self + .elements + .into_iter() + .enumerate() + .map(|(idx, el)| el.inflate_element(config, idx + 1 == len)) + .collect::>()?; + if !self.elements.is_empty() { + // lbracket owns all the whitespace if there are no elements + self.rbracket = self.rbracket.inflate(config)?; + } + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for List<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.lbracket.codegen(state); + let len = self.elements.len(); + for (idx, el) in self.elements.iter().enumerate() { + el.codegen(state, idx < len - 1, true); + } + self.rbracket.codegen(state); + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Set<'a> { + pub elements: Vec>, + pub lbrace: LeftCurlyBrace<'a>, + pub rbrace: RightCurlyBrace<'a>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for Set<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.lbrace = 
self.lbrace.inflate(config)?; + let len = self.elements.len(); + self.elements = self + .elements + .into_iter() + .enumerate() + .map(|(idx, el)| el.inflate_element(config, idx + 1 == len)) + .collect::>()?; + if !self.elements.is_empty() { + self.rbrace = self.rbrace.inflate(config)?; + } + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Set<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.lbrace.codegen(state); + let len = self.elements.len(); + for (idx, el) in self.elements.iter().enumerate() { + el.codegen(state, idx < len - 1, true); + } + self.rbrace.codegen(state); + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)] +pub struct Dict<'a> { + pub elements: Vec>, + pub lbrace: LeftCurlyBrace<'a>, + pub rbrace: RightCurlyBrace<'a>, + pub lpar: Vec>, + pub rpar: Vec>, +} + +impl<'a> Inflate<'a> for Dict<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.lpar = self.lpar.inflate(config)?; + self.lbrace = self.lbrace.inflate(config)?; + let len = self.elements.len(); + self.elements = self + .elements + .into_iter() + .enumerate() + .map(|(idx, el)| el.inflate_element(config, idx + 1 == len)) + .collect::>()?; + if !self.elements.is_empty() { + self.rbrace = self.rbrace.inflate(config)?; + } + self.rpar = self.rpar.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Dict<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.parenthesize(state, |state| { + self.lbrace.codegen(state); + let len = self.elements.len(); + for (idx, el) in self.elements.iter().enumerate() { + el.codegen(state, idx < len - 1, true); + } + self.rbrace.codegen(state); + }) + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum DictElement<'a> { + Simple { + key: Expression<'a>, + value: Expression<'a>, + comma: Option>, + whitespace_before_colon: ParenthesizableWhitespace<'a>, + whitespace_after_colon: ParenthesizableWhitespace<'a>, + colon_tok: TokenRef<'a>, + }, + Starred(StarredDictElement<'a>), +} + +// TODO: this could be a derive helper attribute to override the python class name +impl<'a> IntoPy for DictElement<'a> { + fn into_py(self, py: pyo3::Python) -> pyo3::PyObject { + match self { + Self::Starred(s) => s.into_py(py), + Self::Simple { + key, + value, + comma, + whitespace_after_colon, + whitespace_before_colon, + .. + } => { + let libcst = PyModule::import(py, "libcst").expect("libcst cannot be imported"); + let kwargs = [ + Some(("key", key.into_py(py))), + Some(("value", value.into_py(py))), + Some(( + "whitespace_before_colon", + whitespace_before_colon.into_py(py), + )), + Some(("whitespace_after_colon", whitespace_after_colon.into_py(py))), + comma.map(|x| ("comma", x.into_py(py))), + ] + .iter() + .filter(|x| x.is_some()) + .map(|x| x.as_ref().unwrap()) + .collect::>() + .into_py_dict(py); + libcst + .getattr("DictElement") + .expect("no Element found in libcst") + .call((), Some(kwargs)) + .expect("conversion failed") + .into() + } + } + } +} + +impl<'a> DictElement<'a> { + pub fn inflate_element(self, config: &Config<'a>, last_element: bool) -> Result { + Ok(match self { + Self::Starred(s) => Self::Starred(s.inflate_element(config, last_element)?), + Self::Simple { + key, + value, + comma, + colon_tok, + .. 
+ } => { + let whitespace_before_colon = parse_parenthesizable_whitespace( + config, + &mut colon_tok.whitespace_before.borrow_mut(), + )?; + let whitespace_after_colon = parse_parenthesizable_whitespace( + config, + &mut colon_tok.whitespace_after.borrow_mut(), + )?; + Self::Simple { + key: key.inflate(config)?, + whitespace_before_colon, + whitespace_after_colon, + value: value.inflate(config)?, + comma: if last_element { + comma.map(|c| c.inflate_before(config)).transpose() + } else { + comma.inflate(config) + }?, + colon_tok, + } + } + }) + } +} + +impl<'a> DictElement<'a> { + fn codegen( + &self, + state: &mut CodegenState<'a>, + default_comma: bool, + default_comma_whitespace: bool, + ) { + match self { + Self::Simple { + key, + value, + comma, + whitespace_before_colon, + whitespace_after_colon, + .. + } => { + key.codegen(state); + whitespace_before_colon.codegen(state); + state.add_token(":"); + whitespace_after_colon.codegen(state); + value.codegen(state); + if let Some(comma) = comma { + comma.codegen(state) + } + } + Self::Starred(s) => s.codegen(state), + } + let maybe_comma = match self { + Self::Simple { comma, .. } => comma, + Self::Starred(s) => &s.comma, + }; + if maybe_comma.is_none() && default_comma { + state.add_token(if default_comma_whitespace { ", " } else { "," }); + } + } +} + +impl<'a> WithComma<'a> for DictElement<'a> { + fn with_comma(self, comma: Comma<'a>) -> Self { + let comma = Some(comma); + match self { + Self::Starred(s) => Self::Starred(StarredDictElement { comma, ..s }), + Self::Simple { + key, + value, + whitespace_before_colon, + whitespace_after_colon, + colon_tok, + .. + } => Self::Simple { + comma, + key, + value, + whitespace_after_colon, + whitespace_before_colon, + colon_tok, + }, + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct StarredDictElement<'a> { + pub value: Expression<'a>, + pub comma: Option>, + pub whitespace_before_value: ParenthesizableWhitespace<'a>, + + pub(crate) star_tok: TokenRef<'a>, +} + +impl<'a> StarredDictElement<'a> { + fn inflate_element(mut self, config: &Config<'a>, last_element: bool) -> Result { + self.whitespace_before_value = parse_parenthesizable_whitespace( + config, + &mut (*self.star_tok).whitespace_after.borrow_mut(), + )?; + self.value = self.value.inflate(config)?; + self.comma = if last_element { + self.comma.map(|c| c.inflate_before(config)).transpose() + } else { + self.comma.inflate(config) + }?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for StarredDictElement<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("**"); + self.whitespace_before_value.codegen(state); + self.value.codegen(state); + if let Some(comma) = &self.comma { + comma.codegen(state); + } + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone, Codegen, Inflate, IntoPy)] +pub enum BaseSlice<'a> { + Index(Index<'a>), + Slice(Slice<'a>), +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Index<'a> { + pub value: Expression<'a>, +} + +impl<'a> Inflate<'a> for Index<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.value = self.value.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Index<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.value.codegen(state); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Slice<'a> { + #[no_py_default] + pub lower: Option>, + #[no_py_default] + pub upper: Option>, + pub step: Option>, + pub first_colon: Colon<'a>, + pub second_colon: Option>, +} + 
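Both colons of a `Slice` are optional in source (`x[::2]`, `x[1:]`), and `second_colon` may be absent even when a `step` is set on a programmatically built node; the `Codegen` impl below falls back to emitting a bare `":"` in that case. A hedged round-trip sketch (assumes `parse_expression` from this crate):

```
use libcst_native::{parse_expression, Codegen};

fn main() {
    // Both colons appear in the source, so they inflate into
    // first_colon/second_colon and the slice round-trips exactly.
    let expr = parse_expression("x[1:10:2]").expect("parse failed");
    let mut state = Default::default();
    expr.codegen(&mut state);
    assert_eq!(state.to_string(), "x[1:10:2]");
}
```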
+impl<'a> Inflate<'a> for Slice<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lower = self.lower.inflate(config)?;
+        self.first_colon = self.first_colon.inflate(config)?;
+        self.upper = self.upper.inflate(config)?;
+        self.second_colon = self.second_colon.inflate(config)?;
+        self.step = self.step.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for Slice<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        if let Some(lower) = &self.lower {
+            lower.codegen(state);
+        }
+        self.first_colon.codegen(state);
+        if let Some(upper) = &self.upper {
+            upper.codegen(state);
+        }
+        if let Some(second_colon) = &self.second_colon {
+            second_colon.codegen(state);
+        } else if self.step.is_some() {
+            state.add_token(":");
+        }
+        if let Some(step) = &self.step {
+            step.codegen(state);
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct SubscriptElement<'a> {
+    pub slice: BaseSlice<'a>,
+    pub comma: Option<Comma<'a>>,
+}
+
+impl<'a> Inflate<'a> for SubscriptElement<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.slice = self.slice.inflate(config)?;
+        self.comma = self.comma.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for SubscriptElement<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.slice.codegen(state);
+        if let Some(comma) = &self.comma {
+            comma.codegen(state);
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct Subscript<'a> {
+    pub value: Box<Expression<'a>>,
+    pub slice: Vec<SubscriptElement<'a>>,
+    pub lbracket: LeftSquareBracket<'a>,
+    pub rbracket: RightSquareBracket<'a>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+    pub whitespace_after_value: ParenthesizableWhitespace<'a>,
+
+    pub(crate) lbracket_tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for Subscript<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        self.value = self.value.inflate(config)?;
+        self.whitespace_after_value = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.lbracket_tok).whitespace_before.borrow_mut(),
+        )?;
+        self.lbracket = self.lbracket.inflate(config)?;
+        self.slice = self.slice.inflate(config)?;
+        self.rbracket = self.rbracket.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for Subscript<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            self.value.codegen(state);
+            self.whitespace_after_value.codegen(state);
+            self.lbracket.codegen(state);
+            let len = self.slice.len();
+            for (i, slice) in self.slice.iter().enumerate() {
+                slice.codegen(state);
+                if slice.comma.is_none() && i + 1 < len {
+                    state.add_token(", ")
+                }
+            }
+            self.rbracket.codegen(state);
+        })
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct IfExp<'a> {
+    pub test: Box<Expression<'a>>,
+    pub body: Box<Expression<'a>>,
+    pub orelse: Box<Expression<'a>>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+    pub whitespace_before_if: ParenthesizableWhitespace<'a>,
+    pub whitespace_after_if: ParenthesizableWhitespace<'a>,
+    pub whitespace_before_else: ParenthesizableWhitespace<'a>,
+    pub whitespace_after_else: ParenthesizableWhitespace<'a>,
+
+    pub(crate) if_tok: TokenRef<'a>,
+    pub(crate) else_tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for IfExp<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        self.body = self.body.inflate(config)?;
+        self.whitespace_before_if = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.if_tok).whitespace_before.borrow_mut(),
+        )?;
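+        // inflation follows source order (`body if test else orelse`), not the
+        // struct's field-declaration order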
+        self.whitespace_after_if = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.if_tok).whitespace_after.borrow_mut(),
+        )?;
+        self.test = self.test.inflate(config)?;
+        self.whitespace_before_else = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.else_tok).whitespace_before.borrow_mut(),
+        )?;
+        self.whitespace_after_else = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.else_tok).whitespace_after.borrow_mut(),
+        )?;
+        self.orelse = self.orelse.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for IfExp<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            self.body.codegen(state);
+            self.whitespace_before_if.codegen(state);
+            state.add_token("if");
+            self.whitespace_after_if.codegen(state);
+            self.test.codegen(state);
+            self.whitespace_before_else.codegen(state);
+            state.add_token("else");
+            self.whitespace_after_else.codegen(state);
+            self.orelse.codegen(state);
+        })
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct Lambda<'a> {
+    pub params: Box<Parameters<'a>>,
+    pub body: Box<Expression<'a>>,
+    pub colon: Colon<'a>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+    pub whitespace_after_lambda: Option<ParenthesizableWhitespace<'a>>,
+
+    pub(crate) lambda_tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for Lambda<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        if !self.params.is_empty() {
+            self.whitespace_after_lambda = Some(parse_parenthesizable_whitespace(
+                config,
+                &mut (*self.lambda_tok).whitespace_after.borrow_mut(),
+            )?);
+        }
+        self.params = self.params.inflate(config)?;
+        adjust_parameters_trailing_whitespace(config, &mut self.params, &self.colon.tok)?;
+        self.colon = self.colon.inflate(config)?;
+        self.body = self.body.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for Lambda<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            state.add_token("lambda");
+            if let Some(ws) = &self.whitespace_after_lambda {
+                ws.codegen(state);
+            } else if !self.params.is_empty() {
+                // there's one or more params, add a space
+                state.add_token(" ")
+            }
+            self.params.codegen(state);
+            self.colon.codegen(state);
+            self.body.codegen(state);
+        })
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct From<'a> {
+    pub item: Expression<'a>,
+    pub whitespace_before_from: Option<ParenthesizableWhitespace<'a>>,
+    pub whitespace_after_from: ParenthesizableWhitespace<'a>,
+
+    pub(crate) tok: TokenRef<'a>,
+}
+
+impl<'a> From<'a> {
+    pub fn codegen(&self, state: &mut CodegenState<'a>, default_space: &'a str) {
+        if let Some(ws) = &self.whitespace_before_from {
+            ws.codegen(state);
+        } else {
+            state.add_token(default_space);
+        }
+        state.add_token("from");
+        self.whitespace_after_from.codegen(state);
+        self.item.codegen(state);
+    }
+}
+
+impl<'a> Inflate<'a> for From<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.whitespace_before_from = Some(parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.tok).whitespace_before.borrow_mut(),
+        )?);
+        self.whitespace_after_from = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.tok).whitespace_after.borrow_mut(),
+        )?;
+        self.item = self.item.inflate(config)?;
+        Ok(self)
+    }
+}
+
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub enum YieldValue<'a> {
+    Expression(Expression<'a>),
+    From(From<'a>),
+}
+
+impl<'a> Inflate<'a> for YieldValue<'a> {
+    fn inflate(self, config: &Config<'a>) -> Result<Self> {
+        Ok(match self {
+            Self::Expression(e) => Self::Expression(e.inflate(config)?),
+            Self::From(e) => {
+                let mut e = e.inflate(config)?;
+                e.whitespace_before_from = None;
+                Self::From(e)
+            }
+        })
+    }
+}
+
+impl<'a> YieldValue<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>, default_space: &'a str) {
+        match self {
+            Self::Expression(e) => e.codegen(state),
+            Self::From(f) => f.codegen(state, default_space),
+        }
+    }
+}
+
+impl<'a> pyo3::conversion::IntoPy<pyo3::PyObject> for Box<YieldValue<'a>> {
+    fn into_py(self, py: pyo3::Python) -> pyo3::PyObject {
+        (*self).into_py(py)
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct Yield<'a> {
+    pub value: Option<Box<YieldValue<'a>>>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+    pub whitespace_after_yield: Option<ParenthesizableWhitespace<'a>>,
+
+    pub(crate) yield_tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for Yield<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        if self.value.is_some() {
+            self.whitespace_after_yield = Some(parse_parenthesizable_whitespace(
+                config,
+                &mut (*self.yield_tok).whitespace_after.borrow_mut(),
+            )?);
+        }
+        self.value = self.value.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for Yield<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            state.add_token("yield");
+            if let Some(ws) = &self.whitespace_after_yield {
+                ws.codegen(state);
+            } else if self.value.is_some() {
+                state.add_token(" ");
+            }
+
+            if let Some(val) = &self.value {
+                val.codegen(state, "")
+            }
+        })
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct Await<'a> {
+    pub expression: Box<Expression<'a>>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+    pub whitespace_after_await: ParenthesizableWhitespace<'a>,
+
+    pub(crate) await_tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for Await<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        self.whitespace_after_await = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.await_tok).whitespace_after.borrow_mut(),
+        )?;
+        self.expression = self.expression.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for Await<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            state.add_token("await");
+            self.whitespace_after_await.codegen(state);
+            self.expression.codegen(state);
+        })
+    }
+}
+
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug, PartialEq, Eq, Clone, Codegen, Inflate, IntoPy)]
+pub enum String<'a> {
+    Simple(SimpleString<'a>),
+    Concatenated(ConcatenatedString<'a>),
+    Formatted(FormattedString<'a>),
+}
+
+impl<'a> std::convert::From<String<'a>> for Expression<'a> {
+    fn from(s: String<'a>) -> Self {
+        match s {
+            String::Simple(s) => Self::SimpleString(s),
+            String::Concatenated(s) => Self::ConcatenatedString(s),
+            String::Formatted(s) => Self::FormattedString(s),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct ConcatenatedString<'a> {
+    pub left: Box<String<'a>>,
+    pub right: Box<String<'a>>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+    pub whitespace_between: ParenthesizableWhitespace<'a>,
+
+    // we capture the next token after each string piece so Inflate can extract the
+    // whitespace between individual pieces
+    pub(crate) right_tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for ConcatenatedString<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
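+        // the rest runs in source order: the left piece, then the whitespace
+        // before `right_tok` (i.e. the gap between the two literals), then the
+        // right piece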
+        self.left = self.left.inflate(config)?;
+        self.whitespace_between = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.right_tok).whitespace_before.borrow_mut(),
+        )?;
+        self.right = self.right.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for ConcatenatedString<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            self.left.codegen(state);
+            self.whitespace_between.codegen(state);
+            self.right.codegen(state);
+        })
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Default, ParenthesizedNode, IntoPy)]
+pub struct SimpleString<'a> {
+    /// The textual representation of the string, including quotes, prefix
+    /// characters, and any escape characters present in the original source code,
+    /// such as ``r"my string\n"``.
+    pub value: &'a str,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+}
+
+impl<'a> Inflate<'a> for SimpleString<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for SimpleString<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| state.add_token(self.value))
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct FormattedStringText<'a> {
+    pub value: &'a str,
+}
+
+impl<'a> Inflate<'a> for FormattedStringText<'a> {
+    fn inflate(self, _config: &Config<'a>) -> Result<Self> {
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for FormattedStringText<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        state.add_token(self.value);
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct FormattedStringExpression<'a> {
+    pub expression: Expression<'a>,
+    pub conversion: Option<&'a str>,
+    pub format_spec: Option<Vec<FormattedStringContent<'a>>>,
+    pub whitespace_before_expression: ParenthesizableWhitespace<'a>,
+    pub whitespace_after_expression: ParenthesizableWhitespace<'a>,
+    pub equal: Option<AssignEqual<'a>>,
+
+    pub(crate) lbrace_tok: TokenRef<'a>,
+    // This is None if there's an equal sign, otherwise it's the first token of
+    // (conversion, format spec, right brace) in that order
+    pub(crate) after_expr_tok: Option<TokenRef<'a>>,
+}
+
+impl<'a> Inflate<'a> for FormattedStringExpression<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.whitespace_before_expression = parse_parenthesizable_whitespace(
+            config,
+            &mut (*self.lbrace_tok).whitespace_after.borrow_mut(),
+        )?;
+        self.expression = self.expression.inflate(config)?;
+        self.equal = self.equal.inflate(config)?;
+        if let Some(after_expr_tok) = self.after_expr_tok.as_mut() {
+            self.whitespace_after_expression = parse_parenthesizable_whitespace(
+                config,
+                &mut after_expr_tok.whitespace_before.borrow_mut(),
+            )?;
+        }
+        self.format_spec = self.format_spec.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for FormattedStringExpression<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        state.add_token("{");
+        self.whitespace_before_expression.codegen(state);
+        self.expression.codegen(state);
+        if let Some(eq) = &self.equal {
+            eq.codegen(state);
+        }
+        self.whitespace_after_expression.codegen(state);
+        if let Some(conv) = self.conversion {
+            state.add_token("!");
+            state.add_token(conv);
+        }
+        if let Some(specs) = &self.format_spec {
+            state.add_token(":");
+            for spec in specs {
+                spec.codegen(state);
+            }
+        }
+        state.add_token("}");
+    }
+}
+
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug, PartialEq, Eq, Clone, Codegen, Inflate, IntoPy)]
+pub enum FormattedStringContent<'a> {
+    Text(FormattedStringText<'a>),
+    Expression(FormattedStringExpression<'a>),
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct FormattedString<'a> {
+    pub parts: Vec<FormattedStringContent<'a>>,
+    pub start: &'a str,
+    pub end: &'a str,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+}
+
+impl<'a> Inflate<'a> for FormattedString<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        self.parts = self.parts.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for FormattedString<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            state.add_token(self.start);
+            for part in &self.parts {
+                part.codegen(state);
+            }
+            state.add_token(self.end);
+        })
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, ParenthesizedNode, IntoPy)]
+pub struct NamedExpr<'a> {
+    pub target: Box<Expression<'a>>,
+    pub value: Box<Expression<'a>>,
+    pub lpar: Vec<LeftParen<'a>>,
+    pub rpar: Vec<RightParen<'a>>,
+
+    pub whitespace_before_walrus: ParenthesizableWhitespace<'a>,
+    pub whitespace_after_walrus: ParenthesizableWhitespace<'a>,
+
+    pub(crate) walrus_tok: TokenRef<'a>,
+}
+
+impl<'a> Codegen<'a> for NamedExpr<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.parenthesize(state, |state| {
+            self.target.codegen(state);
+            self.whitespace_before_walrus.codegen(state);
+            state.add_token(":=");
+            self.whitespace_after_walrus.codegen(state);
+            self.value.codegen(state);
+        })
+    }
+}
+
+impl<'a> Inflate<'a> for NamedExpr<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.lpar = self.lpar.inflate(config)?;
+        self.target = self.target.inflate(config)?;
+        self.whitespace_before_walrus = parse_parenthesizable_whitespace(
+            config,
+            &mut self.walrus_tok.whitespace_before.borrow_mut(),
+        )?;
+        self.whitespace_after_walrus = parse_parenthesizable_whitespace(
+            config,
+            &mut self.walrus_tok.whitespace_after.borrow_mut(),
+        )?;
+        self.value = self.value.inflate(config)?;
+        self.rpar = self.rpar.inflate(config)?;
+        Ok(self)
+    }
+}
diff --git a/native/libcst/src/nodes/inflate_helpers.rs b/native/libcst/src/nodes/inflate_helpers.rs
new file mode 100644
index 000000000..902ad0320
--- /dev/null
+++ b/native/libcst/src/nodes/inflate_helpers.rs
@@ -0,0 +1,34 @@
+use crate::{
+    nodes::traits::Result,
+    tokenizer::{
+        whitespace_parser::{parse_parenthesizable_whitespace, Config},
+        Token,
+    },
+    Param, Parameters, StarArg,
+};
+
+pub(crate) fn adjust_parameters_trailing_whitespace<'a>(
+    config: &Config<'a>,
+    parameters: &mut Parameters<'a>,
+    next_tok: &Token<'a>,
+) -> Result<()> {
+    let do_adjust = |param: &mut Param<'a>| -> Result<()> {
+        let whitespace_after =
+            parse_parenthesizable_whitespace(config, &mut next_tok.whitespace_before.borrow_mut())?;
+        if param.comma.is_none() {
+            param.whitespace_after_param = whitespace_after;
+        }
+        Ok(())
+    };
+
+    if let Some(param) = &mut parameters.star_kwarg {
+        do_adjust(param)?;
+    } else if let Some(param) = parameters.kwonly_params.last_mut() {
+        do_adjust(param)?;
+    } else if let Some(StarArg::Param(param)) = parameters.star_arg.as_mut() {
+        do_adjust(param)?;
+    } else if let Some(param) = parameters.params.last_mut() {
+        do_adjust(param)?;
+    }
+    Ok(())
+}
diff --git a/native/libcst/src/nodes/macros.rs b/native/libcst/src/nodes/macros.rs
new file mode 100644
index 000000000..1c47e3fad
--- /dev/null
+++ b/native/libcst/src/nodes/macros.rs
@@ -0,0 +1,33 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+/// Generates a function that lazily imports and caches a module's member. This will hold a
+/// permanent reference to the imported member. Python's module cache is rarely purged though, so
+/// it typically won't matter.
+///
+/// This cache is cheaper than looking up the module in python's module cache and inspecting the
+/// module's `__dict__` each time you want access to the member.
+///
+/// If you have multiple imports from the same module, we'll call `py.import` once for each member
+/// of the module.
+#[macro_export]
+macro_rules! py_import {
+    ( $module_name:expr, $member_name:expr, $getter_fn:ident ) => {
+        paste::paste! {
+            static [<$getter_fn:upper _CELL>]
+                : pyo3::once_cell::GILOnceCell<pyo3::PyResult<pyo3::PyObject>>
+                = pyo3::once_cell::GILOnceCell::new();
+
+            fn $getter_fn<'py>(py: pyo3::Python<'py>) -> pyo3::PyResult<&'py pyo3::PyAny> {
+                Ok([<$getter_fn:upper _CELL>].get_or_init(py, || {
+                    Ok(py.import($module_name)?.get($member_name)?.to_object(py))
+                })
+                .as_ref()
+                .map_err(|err| err.clone_ref(py))?
+                .as_ref(py))
+            }
+        }
+    };
+}
diff --git a/native/libcst/src/nodes/mod.rs b/native/libcst/src/nodes/mod.rs
new file mode 100644
index 000000000..43981e874
--- /dev/null
+++ b/native/libcst/src/nodes/mod.rs
@@ -0,0 +1,43 @@
+mod whitespace;
+pub use whitespace::{
+    Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
+    SimpleWhitespace, TrailingWhitespace,
+};
+mod statement;
+pub use statement::{
+    AnnAssign, Annotation, AsName, Assert, Assign, AssignTarget, AssignTargetExpression, AugAssign,
+    Break, ClassDef, CompoundStatement, Continue, Decorator, Del, DelTargetExpression, Else,
+    ExceptHandler, Expr, Finally, For, FunctionDef, Global, If, Import, ImportAlias, ImportFrom,
+    ImportNames, IndentedBlock, NameItem, Nonlocal, OrElse, Pass, Raise, Return,
+    SimpleStatementLine, SimpleStatementSuite, SmallStatement, Statement, Suite, Try, While, With,
+    WithItem,
+};
+
+mod expression;
+pub use expression::{
+    Arg, Asynchronous, Attribute, Await, BaseSlice, BinaryOperation, BooleanOperation, Call,
+    CompFor, CompIf, Comparison, ComparisonTarget, ConcatenatedString, Dict, DictComp, DictElement,
+    Element, Ellipsis, Expression, Float, FormattedString, FormattedStringContent,
+    FormattedStringExpression, FormattedStringText, From, GeneratorExp, IfExp, Imaginary, Index,
+    Integer, Lambda, LeftCurlyBrace, LeftParen, LeftSquareBracket, List, ListComp, Name,
+    NameOrAttribute, NamedExpr, Param, ParamSlash, ParamStar, Parameters, RightCurlyBrace,
+    RightParen, RightSquareBracket, Set, SetComp, SimpleString, Slice, StarArg, StarredDictElement,
+    StarredElement, String, Subscript, SubscriptElement, Tuple, UnaryOperation, Yield, YieldValue,
+};
+
+mod op;
+pub use op::{
+    AssignEqual, AugOp, BinaryOp, BooleanOp, Colon, Comma, CompOp, Dot, ImportStar, Semicolon,
+    UnaryOp,
+};
+
+mod module;
+pub use module::Module;
+
+mod codegen;
+pub use codegen::{Codegen, CodegenState};
+
+mod traits;
+pub use traits::{Inflate, ParenthesizedNode, WithComma, WithLeadingLines};
+
+pub(crate) mod inflate_helpers;
diff --git a/native/libcst/src/nodes/module.rs b/native/libcst/src/nodes/module.rs
new file mode 100644
index 000000000..8040be64b
--- /dev/null
+++ b/native/libcst/src/nodes/module.rs
@@ -0,0 +1,92 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
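For illustration only, not part of this diff: a hypothetical call site for the `py_import!` macro defined in `macros.rs` above. The module path, member name, and getter name here are invented, and `pyo3::prelude::*` plus the `paste` crate are assumed to be available at the call site.

    use pyo3::prelude::*;

    // Expands to a cached getter: the first call imports the module and looks up
    // the member; every later call returns the cached object.
    py_import!("libcst._nodes.expression", "Name", get_name_class);

    fn make_name(py: pyo3::Python) -> pyo3::PyResult<pyo3::PyObject> {
        let name_cls = get_name_class(py)?;
        Ok(name_cls.call1(("x",))?.to_object(py))
    }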
+
+use std::mem::swap;
+use std::rc::Rc;
+
+use crate::tokenizer::whitespace_parser::parse_empty_lines;
+use crate::tokenizer::Token;
+use crate::{
+    nodes::{
+        codegen::{Codegen, CodegenState},
+        statement::Statement,
+        whitespace::EmptyLine,
+    },
+    tokenizer::whitespace_parser::Config,
+};
+use libcst_derive::IntoPy;
+
+use super::traits::{Inflate, Result, WithLeadingLines};
+
+type TokenRef<'a> = Rc<Token<'a>>;
+
+#[derive(Debug, Eq, PartialEq, IntoPy)]
+pub struct Module<'a> {
+    pub body: Vec<Statement<'a>>,
+    pub header: Vec<EmptyLine<'a>>,
+    pub footer: Vec<EmptyLine<'a>>,
+
+    pub default_indent: &'a str,
+    pub default_newline: &'a str,
+    pub has_trailing_newline: bool,
+    pub encoding: String,
+
+    pub(crate) eof_tok: TokenRef<'a>,
+}
+
+impl<'a> Codegen<'a> for Module<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        for h in &self.header {
+            h.codegen(state);
+        }
+        for s in &self.body {
+            s.codegen(state);
+        }
+        for nl in &self.footer {
+            nl.codegen(state);
+        }
+    }
+}
+
+impl<'a> Inflate<'a> for Module<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.default_indent = config.default_indent;
+        self.default_newline = config.default_newline;
+        self.has_trailing_newline = config.has_trailing_newline();
+        self.body = self.body.inflate(config)?;
+        let mut footer = parse_empty_lines(
+            config,
+            &mut (*self.eof_tok).whitespace_before.borrow_mut(),
+            Some(""),
+        )?;
+        let mut header = vec![];
+        if let Some(stmt) = self.body.first_mut() {
+            swap(stmt.leading_lines(), &mut header);
+            let mut last_indented = None;
+            for (num, line) in footer.iter().enumerate() {
+                if !line.whitespace.0.is_empty() {
+                    last_indented = Some(num);
+                } else if line.comment.is_some() {
+                    // This is a non-indented comment. Everything from here should belong in the
+                    // footer.
+                    break;
+                }
+            }
+            if let Some(num) = last_indented {
+                if num + 1 == footer.len() {
+                    footer = vec![];
+                } else {
+                    let (_, rest) = footer.split_at(num + 1);
+                    footer = rest.to_vec();
+                }
+            }
+        } else {
+            swap(&mut header, &mut footer);
+        }
+        self.footer = footer;
+        self.header = header;
+        Ok(self)
+    }
+}
diff --git a/native/libcst/src/nodes/op.rs b/native/libcst/src/nodes/op.rs
new file mode 100644
index 000000000..ef09e0a08
--- /dev/null
+++ b/native/libcst/src/nodes/op.rs
@@ -0,0 +1,1420 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use std::rc::Rc;
+
+use super::{whitespace::ParenthesizableWhitespace, Codegen, CodegenState};
+use crate::{
+    nodes::traits::{Inflate, Result},
+    tokenizer::{
+        whitespace_parser::{parse_parenthesizable_whitespace, parse_simple_whitespace, Config},
+        Token,
+    },
+};
+use libcst_derive::IntoPy;
+
+type TokenRef<'a> = Rc<Token<'a>>;
+
+#[derive(Debug, Eq, PartialEq, Clone, IntoPy)]
+pub struct Semicolon<'a> {
+    /// Any space that appears directly before this semicolon.
+    pub whitespace_before: ParenthesizableWhitespace<'a>,
+    /// Any space that appears directly after this semicolon.
+ pub whitespace_after: ParenthesizableWhitespace<'a>, + + #[skip_py] + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Semicolon<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token(";"); + self.whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for Semicolon<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = ParenthesizableWhitespace::SimpleWhitespace( + parse_simple_whitespace(config, &mut (*self.tok).whitespace_before.borrow_mut())?, + ); + self.whitespace_after = ParenthesizableWhitespace::SimpleWhitespace( + parse_simple_whitespace(config, &mut (*self.tok).whitespace_after.borrow_mut())?, + ); + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Comma<'a> { + /// Any space that appears directly before this comma. + pub whitespace_before: ParenthesizableWhitespace<'a>, + /// Any space that appears directly after this comma. + pub whitespace_after: ParenthesizableWhitespace<'a>, + + #[skip_py] + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Comma<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token(","); + self.whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for Comma<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_after.borrow_mut(), + )?; + Ok(self) + } +} + +impl<'a> Comma<'a> { + pub fn inflate_before(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct AssignEqual<'a> { + /// Any space that appears directly before this equal sign. + pub whitespace_before: ParenthesizableWhitespace<'a>, + /// Any space that appears directly after this equal sign. + pub whitespace_after: ParenthesizableWhitespace<'a>, + + #[skip_py] + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for AssignEqual<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token("="); + self.whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for AssignEqual<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_after.borrow_mut(), + )?; + Ok(self) + } +} + +#[derive(Debug, Eq, PartialEq, Clone, IntoPy)] +pub struct Dot<'a> { + /// Any space that appears directly before this dot. + pub whitespace_before: ParenthesizableWhitespace<'a>, + /// Any space that appears directly after this dot. 
+ pub whitespace_after: ParenthesizableWhitespace<'a>, + + #[skip_py] + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Dot<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token("."); + self.whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for Dot<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.inflate_before(config)?; + self.inflate_after(config)?; + Ok(self) + } +} + +impl<'a> Dot<'a> { + fn inflate_before(&mut self, config: &Config<'a>) -> Result<()> { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + Ok(()) + } + + fn inflate_after(&mut self, config: &Config<'a>) -> Result<()> { + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_after.borrow_mut(), + )?; + Ok(()) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ImportStar {} + +impl<'a> Codegen<'a> for ImportStar { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("*"); + } +} + +impl<'a> Inflate<'a> for ImportStar { + fn inflate(self, _config: &Config<'a>) -> Result { + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub enum UnaryOp<'a> { + Plus { + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Minus { + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + BitInvert { + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Not { + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, +} + +impl<'a> Codegen<'a> for UnaryOp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + let (tok, whitespace_after) = match self { + Self::Plus { + whitespace_after, .. + } => ("+", whitespace_after), + Self::Minus { + whitespace_after, .. + } => ("-", whitespace_after), + Self::BitInvert { + whitespace_after, .. + } => ("~", whitespace_after), + Self::Not { + whitespace_after, .. + } => ("not", whitespace_after), + }; + state.add_token(tok); + whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for UnaryOp<'a> { + fn inflate(self, config: &Config<'a>) -> Result { + Ok(match self { + Self::Plus { tok, .. } => { + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Plus { + whitespace_after, + tok, + } + } + Self::Minus { tok, .. } => { + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Minus { + whitespace_after, + tok, + } + } + Self::BitInvert { tok, .. } => { + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitInvert { + whitespace_after, + tok, + } + } + Self::Not { tok, .. 
} => { + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Not { + whitespace_after, + tok, + } + } + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub enum BooleanOp<'a> { + And { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Or { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, +} + +impl<'a> Codegen<'a> for BooleanOp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + let (tok, ws_bef, ws_aft) = match self { + Self::And { + whitespace_after, + whitespace_before, + .. + } => ("and", whitespace_before, whitespace_after), + Self::Or { + whitespace_after, + whitespace_before, + .. + } => ("or", whitespace_before, whitespace_after), + }; + ws_bef.codegen(state); + state.add_token(tok); + ws_aft.codegen(state); + } +} + +impl<'a> Inflate<'a> for BooleanOp<'a> { + fn inflate(self, config: &Config<'a>) -> Result { + Ok(match self { + Self::And { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::And { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Or { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Or { + whitespace_before, + whitespace_after, + tok, + } + } + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub enum BinaryOp<'a> { + Add { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Subtract { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Multiply { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Divide { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + FloorDivide { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Modulo { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Power { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + LeftShift { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + RightShift { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + BitOr { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + BitAnd { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + 
#[skip_py] + tok: TokenRef<'a>, + }, + BitXor { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + MatrixMultiply { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, +} + +impl<'a> Codegen<'a> for BinaryOp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + let (whitespace_before, whitespace_after, tok) = match self { + Self::Add { + whitespace_before, + whitespace_after, + tok, + } + | Self::Subtract { + whitespace_before, + whitespace_after, + tok, + } + | Self::Multiply { + whitespace_before, + whitespace_after, + tok, + } + | Self::Divide { + whitespace_before, + whitespace_after, + tok, + } + | Self::FloorDivide { + whitespace_before, + whitespace_after, + tok, + } + | Self::Modulo { + whitespace_before, + whitespace_after, + tok, + } + | Self::Power { + whitespace_before, + whitespace_after, + tok, + } + | Self::LeftShift { + whitespace_before, + whitespace_after, + tok, + } + | Self::RightShift { + whitespace_before, + whitespace_after, + tok, + } + | Self::BitOr { + whitespace_before, + whitespace_after, + tok, + } + | Self::BitAnd { + whitespace_before, + whitespace_after, + tok, + } + | Self::BitXor { + whitespace_before, + whitespace_after, + tok, + } + | Self::MatrixMultiply { + whitespace_before, + whitespace_after, + tok, + } => (whitespace_before, whitespace_after, tok), + }; + whitespace_before.codegen(state); + state.add_token(tok.string); + whitespace_after.codegen(state); + } +} + +impl<'a> Inflate<'a> for BinaryOp<'a> { + fn inflate(self, config: &Config<'a>) -> Result { + Ok(match self { + Self::Add { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Add { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Subtract { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Subtract { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Multiply { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Multiply { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Divide { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Divide { + whitespace_before, + whitespace_after, + tok, + } + } + Self::FloorDivide { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::FloorDivide { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Modulo { tok, .. 
} => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Modulo { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Power { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Power { + whitespace_before, + whitespace_after, + tok, + } + } + Self::LeftShift { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::LeftShift { + whitespace_before, + whitespace_after, + tok, + } + } + Self::RightShift { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::RightShift { + whitespace_before, + whitespace_after, + tok, + } + } + Self::BitOr { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitOr { + whitespace_before, + whitespace_after, + tok, + } + } + Self::BitAnd { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitAnd { + whitespace_before, + whitespace_after, + tok, + } + } + Self::BitXor { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitXor { + whitespace_before, + whitespace_after, + tok, + } + } + Self::MatrixMultiply { tok, .. 
} => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::MatrixMultiply { + whitespace_before, + whitespace_after, + tok, + } + } + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub enum CompOp<'a> { + LessThan { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + GreaterThan { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + LessThanEqual { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + GreaterThanEqual { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + Equal { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + NotEqual { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + In { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + NotIn { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_between: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + not_tok: TokenRef<'a>, + #[skip_py] + in_tok: TokenRef<'a>, + }, + Is { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + IsNot { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_between: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + is_tok: TokenRef<'a>, + #[skip_py] + not_tok: TokenRef<'a>, + }, +} + +impl<'a> Codegen<'a> for CompOp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + let (bef, aft, first_tok, between) = match self { + Self::LessThan { + whitespace_before, + whitespace_after, + tok, + } + | Self::GreaterThan { + whitespace_before, + whitespace_after, + tok, + } + | Self::LessThanEqual { + whitespace_before, + whitespace_after, + tok, + } + | Self::GreaterThanEqual { + whitespace_before, + whitespace_after, + tok, + } + | Self::Equal { + whitespace_before, + whitespace_after, + tok, + } + | Self::NotEqual { + whitespace_before, + whitespace_after, + tok, + } + | Self::In { + whitespace_before, + whitespace_after, + tok, + } + | Self::Is { + whitespace_before, + whitespace_after, + tok, + } => (whitespace_before, whitespace_after, tok, None), + Self::IsNot { + whitespace_before, + whitespace_between, + whitespace_after, + is_tok, + not_tok, + } => ( + whitespace_before, + whitespace_after, + is_tok, + Some((whitespace_between, not_tok)), + ), + Self::NotIn { + whitespace_before, + whitespace_between, + whitespace_after, + not_tok, + in_tok, + } => ( + whitespace_before, + whitespace_after, + not_tok, + Some((whitespace_between, in_tok)), + ), + }; + bef.codegen(state); + state.add_token(first_tok.string); + if let Some((btw, second_tok)) = between { + btw.codegen(state); + state.add_token(second_tok.string); + } + aft.codegen(state); + } +} + +impl<'a> 
Inflate<'a> for CompOp<'a> { + fn inflate(self, config: &Config<'a>) -> Result { + Ok(match self { + Self::LessThan { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::LessThan { + whitespace_before, + whitespace_after, + tok, + } + } + Self::GreaterThan { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::GreaterThan { + whitespace_before, + whitespace_after, + tok, + } + } + Self::LessThanEqual { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::LessThanEqual { + whitespace_before, + whitespace_after, + tok, + } + } + Self::GreaterThanEqual { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::GreaterThanEqual { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Equal { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Equal { + whitespace_before, + whitespace_after, + tok, + } + } + Self::NotEqual { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::NotEqual { + whitespace_before, + whitespace_after, + tok, + } + } + Self::In { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::In { + whitespace_before, + whitespace_after, + tok, + } + } + Self::Is { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::Is { + whitespace_before, + whitespace_after, + tok, + } + } + Self::IsNot { + is_tok, not_tok, .. + } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*is_tok).whitespace_before.borrow_mut(), + )?; + let whitespace_between = parse_parenthesizable_whitespace( + config, + &mut (*is_tok).whitespace_after.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*not_tok).whitespace_after.borrow_mut(), + )?; + Self::IsNot { + whitespace_before, + whitespace_between, + whitespace_after, + is_tok, + not_tok, + } + } + Self::NotIn { + not_tok, in_tok, .. 
+ } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*not_tok).whitespace_before.borrow_mut(), + )?; + let whitespace_between = parse_parenthesizable_whitespace( + config, + &mut (*not_tok).whitespace_after.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*in_tok).whitespace_after.borrow_mut(), + )?; + Self::NotIn { + whitespace_before, + whitespace_between, + whitespace_after, + not_tok, + in_tok, + } + } + }) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Colon<'a> { + pub whitespace_before: ParenthesizableWhitespace<'a>, + pub whitespace_after: ParenthesizableWhitespace<'a>, + + #[skip_py] + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for Colon<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_after.borrow_mut(), + )?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Colon<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before.codegen(state); + state.add_token(":"); + self.whitespace_after.codegen(state); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub enum AugOp<'a> { + AddAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + SubtractAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + MultiplyAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + MatrixMultiplyAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + DivideAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + ModuloAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + BitAndAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + BitOrAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + BitXorAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + LeftShiftAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + RightShiftAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + PowerAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, + FloorDivideAssign { + whitespace_before: ParenthesizableWhitespace<'a>, + whitespace_after: ParenthesizableWhitespace<'a>, + #[skip_py] + tok: TokenRef<'a>, + }, +} + +impl<'a> Inflate<'a> for AugOp<'a> { + fn inflate(self, config: &Config<'a>) -> 
Result { + Ok(match self { + Self::AddAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::AddAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::SubtractAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::SubtractAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::MultiplyAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::MultiplyAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::MatrixMultiplyAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::MatrixMultiplyAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::DivideAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::DivideAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::ModuloAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::ModuloAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::BitAndAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitAndAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::BitOrAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitOrAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::BitXorAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::BitXorAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::LeftShiftAssign { tok, .. 
} => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::LeftShiftAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::RightShiftAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::RightShiftAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::PowerAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::PowerAssign { + whitespace_before, + whitespace_after, + tok, + } + } + Self::FloorDivideAssign { tok, .. } => { + let whitespace_before = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_before.borrow_mut(), + )?; + let whitespace_after = parse_parenthesizable_whitespace( + config, + &mut (*tok).whitespace_after.borrow_mut(), + )?; + Self::FloorDivideAssign { + whitespace_before, + whitespace_after, + tok, + } + } + }) + } +} + +impl<'a> Codegen<'a> for AugOp<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + let (tok, bef, aft) = match self { + Self::AddAssign { + whitespace_before, + whitespace_after, + .. + } => ("+=", whitespace_before, whitespace_after), + Self::SubtractAssign { + whitespace_before, + whitespace_after, + .. + } => ("-=", whitespace_before, whitespace_after), + Self::MultiplyAssign { + whitespace_before, + whitespace_after, + .. + } => ("*=", whitespace_before, whitespace_after), + Self::MatrixMultiplyAssign { + whitespace_before, + whitespace_after, + .. + } => ("@=", whitespace_before, whitespace_after), + Self::DivideAssign { + whitespace_before, + whitespace_after, + .. + } => ("/=", whitespace_before, whitespace_after), + Self::ModuloAssign { + whitespace_before, + whitespace_after, + .. + } => ("%=", whitespace_before, whitespace_after), + Self::BitAndAssign { + whitespace_before, + whitespace_after, + .. + } => ("&=", whitespace_before, whitespace_after), + Self::BitOrAssign { + whitespace_before, + whitespace_after, + .. + } => ("|=", whitespace_before, whitespace_after), + Self::BitXorAssign { + whitespace_before, + whitespace_after, + .. + } => ("^=", whitespace_before, whitespace_after), + Self::LeftShiftAssign { + whitespace_before, + whitespace_after, + .. + } => ("<<=", whitespace_before, whitespace_after), + Self::RightShiftAssign { + whitespace_before, + whitespace_after, + .. + } => (">>=", whitespace_before, whitespace_after), + Self::PowerAssign { + whitespace_before, + whitespace_after, + .. + } => ("**=", whitespace_before, whitespace_after), + Self::FloorDivideAssign { + whitespace_before, + whitespace_after, + .. + } => ("//=", whitespace_before, whitespace_after), + }; + bef.codegen(state); + state.add_token(tok); + aft.codegen(state); + } +} diff --git a/native/libcst/src/nodes/parser_config.rs b/native/libcst/src/nodes/parser_config.rs new file mode 100644 index 000000000..3b85eae28 --- /dev/null +++ b/native/libcst/src/nodes/parser_config.rs @@ -0,0 +1,137 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use pyo3::exceptions::PyIndexError;
+use pyo3::prelude::*;
+use pyo3::types::{IntoPyDict, PyDict, PySequence, PyString};
+use pyo3::wrap_pyfunction;
+
+use crate::py_cached::PyCached;
+
+#[pyclass(subclass, module = "libcst_native.parser_config")]
+#[text_signature = "(*, lines, default_newline)"]
+pub struct BaseWhitespaceParserConfig {
+    pub lines: PyCached<Vec<String>>,
+    pub default_newline: PyCached<String>,
+}
+
+#[pymethods]
+impl BaseWhitespaceParserConfig {
+    #[new]
+    fn new(lines: &PySequence, default_newline: &PyString) -> PyResult<Self> {
+        // These fields will get initialized when ParserConfig.__init__ (our subclass) runs
+        Ok(Self {
+            lines: lines.extract()?,
+            default_newline: default_newline.extract()?,
+        })
+    }
+
+    #[getter]
+    fn get_lines(&self, py: Python) -> PyObject {
+        self.lines.to_object(py)
+    }
+
+    #[getter]
+    fn get_default_newline(&self, py: Python) -> PyObject {
+        self.default_newline.to_object(py)
+    }
+}
+
+impl BaseWhitespaceParserConfig {
+    /// Equivalent to `config.lines.unwrap()[line_number - 1]`, but it returns a PyErr when we
+    /// get an index that's out of range, instead of panicking.
+    pub fn get_line(&self, line_number: usize) -> PyResult<&str> {
+        let err_fn =
+            || PyIndexError::new_err(format!("line number of {} is out of range", line_number));
+        self.lines
+            .get(line_number.checked_sub(1).ok_or_else(err_fn)?)
+            .map(|l| &l[..])
+            .ok_or_else(err_fn)
+    }
+
+    /// Equivalent to `config.get_line(line_number)[column_index..]`, but it returns a PyErr
+    /// when we get a column index that's out of range, instead of panicking.
+    pub fn get_line_after_column(&self, line_number: usize, column_index: usize) -> PyResult<&str> {
+        self.get_line(line_number)?
+            .get(column_index..)
+            .ok_or_else(|| {
+                PyIndexError::new_err(format!("column index of {} is out of range", column_index))
+            })
+    }
+}
+
+// These fields are private and PyObject, since we don't currently care about using them from
+// within rust.
+#[pyclass(extends=BaseWhitespaceParserConfig, module="libcst_native.parser_config")]
+#[text_signature = "(*, lines, encoding, default_indent, default_newline, has_trailing_newline, version, future_imports)"]
+pub struct ParserConfig {
+    // lines is inherited
+    #[pyo3(get)]
+    encoding: PyObject,
+    #[pyo3(get)]
+    default_indent: PyObject,
+    // default_newline is inherited
+    #[pyo3(get)]
+    has_trailing_newline: PyObject,
+    #[pyo3(get)]
+    version: PyObject,
+    #[pyo3(get)]
+    future_imports: PyObject,
+}
+
+#[pymethods]
+impl ParserConfig {
+    #[new]
+    fn new(
+        lines: &PySequence,
+        encoding: PyObject,
+        default_indent: PyObject,
+        default_newline: &PyString,
+        has_trailing_newline: PyObject,
+        version: PyObject,
+        future_imports: PyObject,
+    ) -> PyResult<(Self, BaseWhitespaceParserConfig)> {
+        Ok((
+            Self {
+                encoding,
+                default_indent,
+                has_trailing_newline,
+                version,
+                future_imports,
+            },
+            BaseWhitespaceParserConfig::new(lines, default_newline)?,
+        ))
+    }
+}
+
+/// An internal helper function used by python unit tests to compare configs.
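+/// (Illustrative sketch, not part of the original change) from Python this is
+/// roughly::
+///
+///     from libcst_native import parser_config
+///     parser_config.parser_config_asdict(config)  # -> dict of config fields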
+#[pyfunction]
+fn parser_config_asdict<'py>(py: Python<'py>, config: PyRef<'py, ParserConfig>) -> &'py PyDict {
+    let super_config: &BaseWhitespaceParserConfig = config.as_ref();
+    vec![
+        ("lines", super_config.lines.to_object(py)),
+        ("encoding", config.encoding.clone_ref(py)),
+        ("default_indent", config.default_indent.clone_ref(py)),
+        (
+            "default_newline",
+            super_config.default_newline.to_object(py),
+        ),
+        (
+            "has_trailing_newline",
+            config.has_trailing_newline.clone_ref(py),
+        ),
+        ("version", config.version.clone_ref(py)),
+        ("future_imports", config.future_imports.clone_ref(py)),
+    ]
+    .into_py_dict(py)
+}
+
+pub fn init_module(_py: Python, m: &PyModule) -> PyResult<()> {
+    m.add_class::<BaseWhitespaceParserConfig>()?;
+    m.add_class::<ParserConfig>()?;
+    m.add_function(wrap_pyfunction!(parser_config_asdict, m)?)
+        .unwrap();
+    Ok(())
+}
diff --git a/native/libcst/src/nodes/py_cached.rs b/native/libcst/src/nodes/py_cached.rs
new file mode 100644
index 000000000..e8a4dfd4a
--- /dev/null
+++ b/native/libcst/src/nodes/py_cached.rs
@@ -0,0 +1,76 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use pyo3::prelude::*;
+use std::convert::AsRef;
+use std::ops::Deref;
+
+/// An immutable wrapper around a Rust type T and its PyObject equivalent. Caches the conversion
+/// to and from the PyObject.
+pub struct PyCached<T> {
+    native: T,
+    py_object: PyObject,
+}
+
+impl<T> PyCached<T>
+where
+    T: ToPyObject,
+{
+    pub fn new(py: Python, native: T) -> Self {
+        Self {
+            py_object: native.to_object(py),
+            native,
+        }
+    }
+}
+
+impl<'source, T> FromPyObject<'source> for PyCached<T>
+where
+    T: FromPyObject<'source>,
+{
+    fn extract(ob: &'source PyAny) -> PyResult<Self> {
+        Python::with_gil(|py| {
+            Ok(PyCached {
+                native: ob.extract()?,
+                py_object: ob.to_object(py),
+            })
+        })
+    }
+}
+
+impl<T> IntoPy<PyObject> for PyCached<T> {
+    fn into_py(self, _py: Python) -> PyObject {
+        self.py_object
+    }
+}
+
+impl<T> ToPyObject for PyCached<T> {
+    fn to_object(&self, py: Python) -> PyObject {
+        self.py_object.clone_ref(py)
+    }
+}
+
+impl<T> AsRef<T> for PyCached<T> {
+    fn as_ref(&self) -> &T {
+        &self.native
+    }
+}
+
+impl<T> Deref for PyCached<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &self.native
+    }
+}
+
+impl<T> From<T> for PyCached<T>
+where
+    T: ToPyObject,
+{
+    fn from(val: T) -> Self {
+        Python::with_gil(|py| Self::new(py, val))
+    }
+}
diff --git a/native/libcst/src/nodes/statement.rs b/native/libcst/src/nodes/statement.rs
new file mode 100644
index 000000000..63c8ab8ae
--- /dev/null
+++ b/native/libcst/src/nodes/statement.rs
@@ -0,0 +1,1986 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
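+
+// How these nodes fit together (an illustrative sketch, not part of the
+// committed API): the parser produces nodes that still hold raw `TokenRef`s;
+// `Inflate::inflate` then consumes the tokens' whitespace into the typed
+// whitespace fields, and `Codegen::codegen` re-emits the source losslessly:
+//
+//     let stmt = stmt.inflate(&config)?;  // attach whitespace and comments
+//     stmt.codegen(&mut state);           // write tokens into a CodegenState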
+
+use std::{mem::swap, rc::Rc};
+
+use super::{
+    inflate_helpers::adjust_parameters_trailing_whitespace, Attribute, Codegen, CodegenState,
+    Comma, Dot, EmptyLine, Expression, From, ImportStar, LeftParen, List, Name, NameOrAttribute,
+    Parameters, ParenthesizableWhitespace, RightParen, Semicolon, SimpleWhitespace, StarredElement,
+    Subscript, TrailingWhitespace, Tuple,
+};
+use crate::{
+    nodes::{
+        traits::{Inflate, Result, WithComma, WithLeadingLines},
+        Arg, AssignEqual, Asynchronous, AugOp, Element, ParenthesizedNode,
+    },
+    tokenizer::{
+        whitespace_parser::{
+            parse_empty_lines, parse_parenthesizable_whitespace, parse_simple_whitespace,
+            parse_trailing_whitespace, Config,
+        },
+        Token,
+    },
+};
+use libcst_derive::{Codegen, Inflate, IntoPy, ParenthesizedNode};
+
+type TokenRef<'a> = Rc<Token<'a>>;
+
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug, Eq, PartialEq, Clone, Inflate, Codegen, IntoPy)]
+pub enum Statement<'a> {
+    Simple(SimpleStatementLine<'a>),
+    Compound(CompoundStatement<'a>),
+}
+
+impl<'a> WithLeadingLines<'a> for Statement<'a> {
+    fn leading_lines(&mut self) -> &mut Vec<EmptyLine<'a>> {
+        match self {
+            Self::Simple(s) => &mut s.leading_lines,
+            Self::Compound(c) => c.leading_lines(),
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Inflate, Codegen, IntoPy)]
+#[allow(clippy::large_enum_variant)]
+pub enum CompoundStatement<'a> {
+    FunctionDef(FunctionDef<'a>),
+    If(If<'a>),
+    For(For<'a>),
+    While(While<'a>),
+    ClassDef(ClassDef<'a>),
+    Try(Try<'a>),
+    With(With<'a>),
+}
+
+impl<'a> WithLeadingLines<'a> for CompoundStatement<'a> {
+    fn leading_lines(&mut self) -> &mut Vec<EmptyLine<'a>> {
+        match self {
+            Self::FunctionDef(f) => &mut f.leading_lines,
+            Self::If(f) => &mut f.leading_lines,
+            Self::For(f) => &mut f.leading_lines,
+            Self::While(f) => &mut f.leading_lines,
+            Self::ClassDef(c) => &mut c.leading_lines,
+            Self::Try(t) => &mut t.leading_lines,
+            Self::With(w) => &mut w.leading_lines,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Inflate, Codegen, IntoPy)]
+pub enum Suite<'a> {
+    IndentedBlock(IndentedBlock<'a>),
+    SimpleStatementSuite(SimpleStatementSuite<'a>),
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct IndentedBlock<'a> {
+    /// Sequence of statements belonging to this indented block.
+    pub body: Vec<Statement<'a>>,
+    /// Any optional trailing comment and the final ``NEWLINE`` at the end of the line.
+    pub header: TrailingWhitespace<'a>,
+    /// A string representing a specific indentation. A ``None`` value uses the module's
+    /// default indentation. This is included because indentation is allowed to be
+    /// inconsistent across a file, just not ambiguously.
+    pub indent: Option<&'a str>,
+    /// Any trailing comments or lines after the dedent that are owned by this indented
+    /// block. Statements own preceding and same-line trailing comments, but not
+    /// trailing lines, so it falls on :class:`IndentedBlock` to own it. In the case
+    /// that a statement follows an :class:`IndentedBlock`, that statement will own the
+    /// comments and lines that are at the same indent as the statement, and this
+    /// :class:`IndentedBlock` will own the comments and lines that are indented
+    /// further.
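+    /// For example (an illustrative sketch)::
+    ///
+    ///     if x:
+    ///         pass
+    ///         # owned by this IndentedBlock's footer
+    ///     # owned by the next statement's leading_lines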
+    pub footer: Vec<EmptyLine<'a>>,
+
+    pub(crate) newline_tok: TokenRef<'a>,
+    pub(crate) indent_tok: TokenRef<'a>,
+    pub(crate) dedent_tok: TokenRef<'a>,
+}
+
+impl<'a> Codegen<'a> for IndentedBlock<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.header.codegen(state);
+
+        let indent = match self.indent {
+            Some(i) => i,
+            None => state.default_indent,
+        };
+        state.indent(indent);
+
+        if self.body.is_empty() {
+            // Empty indented blocks are not syntactically valid in Python unless they
+            // contain a 'pass' statement, so add one here.
+            state.add_indent();
+            state.add_token("pass");
+            state.add_token(state.default_newline);
+        } else {
+            for stmt in &self.body {
+                // IndentedBlock is responsible for adjusting the current indentation
+                // level, but its children are responsible for actually adding that
+                // indentation to the token list.
+                stmt.codegen(state);
+            }
+        }
+
+        for f in &self.footer {
+            f.codegen(state);
+        }
+
+        state.dedent();
+    }
+}
+
+impl<'a> Inflate<'a> for IndentedBlock<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.body = self.body.inflate(config)?;
+        // We want to be able to only keep comments in the footer that are actually for
+        // this IndentedBlock. We do so by assuming that lines which are indented to the
+        // same level as the block itself are comments that go at the footer of the
+        // block. Comments that are indented to less than this indent are assumed to
+        // belong to the next line of code. We override the indent here because the
+        // dedent node's absolute indent is the resulting indentation after the dedent
+        // is performed. It's this way because the whitespace state for both the dedent's
+        // whitespace_after and the next BaseCompoundStatement's whitespace_before is
+        // shared. This allows us to partially parse here and parse the rest of the
+        // whitespace and comments on the next line, effectively making sure that
+        // comments are attached to the correct node.
+        let footer = parse_empty_lines(
+            config,
+            &mut (*self.dedent_tok).whitespace_after.borrow_mut(),
+            Some(self.indent_tok.whitespace_before.borrow().absolute_indent),
+        )?;
+        let header = parse_trailing_whitespace(
+            config,
+            &mut (*self.newline_tok).whitespace_before.borrow_mut(),
+        )?;
+        self.footer = footer;
+        self.header = header;
+        self.indent = self.indent_tok.relative_indent;
+        if self.indent == Some(config.default_indent) {
+            self.indent = None;
+        }
+        Ok(self)
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct SimpleStatementSuite<'a> {
+    /// Sequence of small statements. All but the last statement are required to have
+    /// a semicolon.
+    pub body: Vec<SmallStatement<'a>>,
+
+    /// The whitespace between the colon in the parent statement and the body.
+    pub leading_whitespace: SimpleWhitespace<'a>,
+    /// Any optional trailing comment and the final ``NEWLINE`` at the end of the line.
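+    /// For example (illustrative), given ``if x: pass  # note`` this field
+    /// holds the two spaces, the ``# note`` comment, and the final ``NEWLINE``.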
+ pub trailing_whitespace: TrailingWhitespace<'a>, + + pub(crate) first_tok: TokenRef<'a>, + pub(crate) newline_tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for SimpleStatementSuite<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_whitespace = parse_simple_whitespace( + config, + &mut (*self.first_tok).whitespace_before.borrow_mut(), + )?; + self.body = self.body.inflate(config)?; + self.trailing_whitespace = parse_trailing_whitespace( + config, + &mut (*self.newline_tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +fn _simple_statement_codegen<'a>( + body: &[SmallStatement<'a>], + trailing_whitespace: &TrailingWhitespace<'a>, + state: &mut CodegenState<'a>, +) { + for stmt in body { + stmt.codegen(state); + // TODO: semicolon + } + if body.is_empty() { + // Empty simple statement blocks are not syntactically valid in Python + // unless they contain a 'pass' statement, so add one here. + state.add_token("pass") + } + trailing_whitespace.codegen(state); +} + +impl<'a> Codegen<'a> for SimpleStatementSuite<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.leading_whitespace.codegen(state); + _simple_statement_codegen(&self.body, &self.trailing_whitespace, state); + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct SimpleStatementLine<'a> { + /// Sequence of small statements. All but the last statement are required to have + /// a semicolon. + pub body: Vec>, + + /// Sequence of empty lines appearing before this simple statement line. + pub leading_lines: Vec>, + /// Any optional trailing comment and the final ``NEWLINE`` at the end of the line. + pub trailing_whitespace: TrailingWhitespace<'a>, + + pub(crate) first_tok: TokenRef<'a>, + pub(crate) newline_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for SimpleStatementLine<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for line in &self.leading_lines { + line.codegen(state); + } + state.add_indent(); + _simple_statement_codegen(&self.body, &self.trailing_whitespace, state); + } +} + +impl<'a> Inflate<'a> for SimpleStatementLine<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.first_tok).whitespace_before.borrow_mut(), + None, + )?; + self.body = self.body.inflate(config)?; + self.trailing_whitespace = parse_trailing_whitespace( + config, + &mut (*self.newline_tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +#[allow(dead_code, clippy::large_enum_variant)] +#[derive(Debug, Eq, PartialEq, Clone, Codegen, Inflate, IntoPy)] +pub enum SmallStatement<'a> { + Pass(Pass<'a>), + Break(Break<'a>), + Continue(Continue<'a>), + Return(Return<'a>), + Expr(Expr<'a>), + Assert(Assert<'a>), + Import(Import<'a>), + ImportFrom(ImportFrom<'a>), + Assign(Assign<'a>), + AnnAssign(AnnAssign<'a>), + Raise(Raise<'a>), + Global(Global<'a>), + Nonlocal(Nonlocal<'a>), + AugAssign(AugAssign<'a>), + Del(Del<'a>), +} + +impl<'a> SmallStatement<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + match self { + Self::Pass(p) => Self::Pass(p.with_semicolon(semicolon)), + Self::Break(p) => Self::Break(p.with_semicolon(semicolon)), + Self::Continue(p) => Self::Continue(p.with_semicolon(semicolon)), + Self::Expr(p) => Self::Expr(p.with_semicolon(semicolon)), + Self::Import(i) => Self::Import(i.with_semicolon(semicolon)), + Self::ImportFrom(i) => Self::ImportFrom(i.with_semicolon(semicolon)), + Self::Assign(a) => Self::Assign(a.with_semicolon(semicolon)), + Self::AnnAssign(a) => 
Self::AnnAssign(a.with_semicolon(semicolon)), + Self::Return(r) => Self::Return(r.with_semicolon(semicolon)), + Self::Assert(a) => Self::Assert(a.with_semicolon(semicolon)), + Self::Raise(r) => Self::Raise(r.with_semicolon(semicolon)), + Self::Global(g) => Self::Global(g.with_semicolon(semicolon)), + Self::Nonlocal(l) => Self::Nonlocal(l.with_semicolon(semicolon)), + Self::AugAssign(a) => Self::AugAssign(a.with_semicolon(semicolon)), + Self::Del(d) => Self::Del(d.with_semicolon(semicolon)), + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Pass<'a> { + pub semicolon: Option>, +} +impl<'a> Pass<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon } + } +} +impl<'a> Codegen<'a> for Pass<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("pass"); + self.semicolon.codegen(state); + } +} +impl<'a> Inflate<'a> for Pass<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Break<'a> { + pub semicolon: Option>, +} +impl<'a> Break<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon } + } +} +impl<'a> Codegen<'a> for Break<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("break"); + self.semicolon.codegen(state); + } +} +impl<'a> Inflate<'a> for Break<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Continue<'a> { + pub semicolon: Option>, +} +impl<'a> Continue<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon } + } +} +impl<'a> Codegen<'a> for Continue<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("continue"); + self.semicolon.codegen(state); + } +} +impl<'a> Inflate<'a> for Continue<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Expr<'a> { + pub value: Expression<'a>, + pub semicolon: Option>, +} +impl<'a> Expr<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} +impl<'a> Codegen<'a> for Expr<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.value.codegen(state); + self.semicolon.codegen(state); + } +} +impl<'a> Inflate<'a> for Expr<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.value = self.value.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Assign<'a> { + pub targets: Vec>, + pub value: Expression<'a>, + pub semicolon: Option>, +} + +impl<'a> Codegen<'a> for Assign<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for target in &self.targets { + target.codegen(state); + } + self.value.codegen(state); + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for Assign<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.targets = self.targets.inflate(config)?; + self.value = self.value.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Assign<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, 
PartialEq, Eq, Clone, IntoPy)] +pub struct AssignTarget<'a> { + pub target: AssignTargetExpression<'a>, + pub whitespace_before_equal: SimpleWhitespace<'a>, + pub whitespace_after_equal: SimpleWhitespace<'a>, + + pub(crate) equal_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for AssignTarget<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.target.codegen(state); + self.whitespace_before_equal.codegen(state); + state.add_token("="); + self.whitespace_after_equal.codegen(state); + } +} + +impl<'a> Inflate<'a> for AssignTarget<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.target = self.target.inflate(config)?; + self.whitespace_before_equal = parse_simple_whitespace( + config, + &mut (*self.equal_tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after_equal = + parse_simple_whitespace(config, &mut (*self.equal_tok).whitespace_after.borrow_mut())?; + Ok(self) + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone, Codegen, ParenthesizedNode, Inflate, IntoPy)] +pub enum AssignTargetExpression<'a> { + Name(Name<'a>), + Attribute(Attribute<'a>), + StarredElement(StarredElement<'a>), + Tuple(Tuple<'a>), + List(List<'a>), + Subscript(Subscript<'a>), +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Import<'a> { + pub names: Vec>, + pub semicolon: Option>, + pub whitespace_after_import: SimpleWhitespace<'a>, + + pub(crate) import_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Import<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("import"); + self.whitespace_after_import.codegen(state); + for (i, name) in self.names.iter().enumerate() { + name.codegen(state); + if name.comma.is_none() && i < self.names.len() - 1 { + state.add_token(", "); + } + } + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for Import<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_after_import = parse_simple_whitespace( + config, + &mut (*self.import_tok).whitespace_after.borrow_mut(), + )?; + self.names = self.names.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Import<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ImportFrom<'a> { + #[no_py_default] + pub module: Option>, + pub names: ImportNames<'a>, + pub relative: Vec>, + pub lpar: Option>, + pub rpar: Option>, + pub semicolon: Option>, + pub whitespace_after_from: SimpleWhitespace<'a>, + pub whitespace_before_import: SimpleWhitespace<'a>, + pub whitespace_after_import: SimpleWhitespace<'a>, + + pub(crate) from_tok: TokenRef<'a>, + pub(crate) import_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for ImportFrom<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("from"); + self.whitespace_after_from.codegen(state); + for dot in &self.relative { + dot.codegen(state); + } + if let Some(module) = &self.module { + module.codegen(state); + } + self.whitespace_before_import.codegen(state); + state.add_token("import"); + self.whitespace_after_import.codegen(state); + if let Some(lpar) = &self.lpar { + lpar.codegen(state); + } + self.names.codegen(state); + if let Some(rpar) = &self.rpar { + rpar.codegen(state); + } + + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for ImportFrom<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { 
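+        // Inflation follows source order: `from` <dots> <module> `import` <names>.
+        // For a relative-only import such as `from . import x` (an illustrative
+        // example), the space between the final dot and `import` is shared state,
+        // so it is relocated below from the dot's whitespace_after onto
+        // whitespace_before_import.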
+ self.whitespace_after_from = + parse_simple_whitespace(config, &mut (*self.from_tok).whitespace_after.borrow_mut())?; + + self.module = self.module.inflate(config)?; + + self.whitespace_after_import = parse_simple_whitespace( + config, + &mut (*self.import_tok).whitespace_after.borrow_mut(), + )?; + + self.relative = inflate_dots(self.relative, config)?; + + if !self.relative.is_empty() && self.module.is_none() { + // For relative-only imports relocate the space after the final dot to be owned + // by the import token. + if let Some(Dot { + whitespace_after: ParenthesizableWhitespace::SimpleWhitespace(dot_ws), + .. + }) = self.relative.last_mut() + { + swap(dot_ws, &mut self.whitespace_before_import); + } + } else { + self.whitespace_before_import = parse_simple_whitespace( + config, + &mut (*self.import_tok).whitespace_before.borrow_mut(), + )?; + } + + self.lpar = self.lpar.inflate(config)?; + self.names = self.names.inflate(config)?; + self.rpar = self.rpar.inflate(config)?; + + self.semicolon = self.semicolon.inflate(config)?; + + Ok(self) + } +} + +fn inflate_dots<'a>(dots: Vec>, config: &Config<'a>) -> Result>> { + let mut ret: Vec> = vec![]; + let mut last_tok: Option> = None; + for mut dot in dots { + if let Some(last_tokref) = &last_tok { + // Consecutive dots having the same Token can only happen if `...` was + // parsed as a single ELLIPSIS token. In this case the token's + // whitespace_before belongs to the first dot, but the whitespace_after is + // moved to the 3rd dot (by swapping it twice) + if last_tokref.start_pos == dot.tok.start_pos { + swap( + &mut ret.last_mut().unwrap().whitespace_after, + &mut dot.whitespace_after, + ); + ret.push(dot); + continue; + } + } + last_tok = Some(dot.tok.clone()); + ret.push(dot.inflate(config)?); + } + Ok(ret) +} + +impl<'a> ImportFrom<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ImportAlias<'a> { + pub name: NameOrAttribute<'a>, + pub asname: Option>, + pub comma: Option>, +} + +impl<'a> Inflate<'a> for ImportAlias<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.name = self.name.inflate(config)?; + self.asname = self.asname.inflate(config)?; + self.comma = self.comma.inflate(config)?; + Ok(self) + } +} + +impl<'a> WithComma<'a> for ImportAlias<'a> { + fn with_comma(self, comma: Comma<'a>) -> ImportAlias<'a> { + let comma = Some(comma); + Self { comma, ..self } + } +} + +impl<'a> Codegen<'a> for ImportAlias<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.name.codegen(state); + if let Some(asname) = &self.asname { + asname.codegen(state); + } + if let Some(comma) = &self.comma { + comma.codegen(state); + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct AsName<'a> { + pub name: AssignTargetExpression<'a>, + pub whitespace_before_as: ParenthesizableWhitespace<'a>, + pub whitespace_after_as: ParenthesizableWhitespace<'a>, + + pub(crate) as_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for AsName<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.whitespace_before_as.codegen(state); + state.add_token("as"); + self.whitespace_after_as.codegen(state); + self.name.codegen(state); + } +} + +impl<'a> Inflate<'a> for AsName<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before_as = parse_parenthesizable_whitespace( + config, + &mut (*self.as_tok).whitespace_before.borrow_mut(), + )?; + self.whitespace_after_as = 
parse_parenthesizable_whitespace( + config, + &mut (*self.as_tok).whitespace_after.borrow_mut(), + )?; + self.name = self.name.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Inflate, IntoPy)] +pub enum ImportNames<'a> { + Star(ImportStar), + Aliases(Vec>), +} + +impl<'a> Codegen<'a> for ImportNames<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + match self { + Self::Star(s) => s.codegen(state), + Self::Aliases(aliases) => { + for (i, alias) in aliases.iter().enumerate() { + alias.codegen(state); + if alias.comma.is_none() && i < aliases.len() - 1 { + state.add_token(", "); + } + } + } + } + } +} + +#[derive(Debug, Eq, PartialEq, Clone, IntoPy)] +pub struct FunctionDef<'a> { + pub name: Name<'a>, + pub params: Parameters<'a>, + pub body: Suite<'a>, + pub decorators: Vec>, + pub returns: Option>, + pub asynchronous: Option>, + pub leading_lines: Vec>, + pub lines_after_decorators: Vec>, + pub whitespace_after_def: SimpleWhitespace<'a>, + pub whitespace_after_name: SimpleWhitespace<'a>, + pub whitespace_before_params: ParenthesizableWhitespace<'a>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) async_tok: Option>, + pub(crate) def_tok: TokenRef<'a>, + pub(crate) open_paren_tok: TokenRef<'a>, + pub(crate) close_paren_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> FunctionDef<'a> { + pub fn with_decorators(self, decorators: Vec>) -> Self { + Self { decorators, ..self } + } +} + +impl<'a> Codegen<'a> for FunctionDef<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for l in &self.leading_lines { + l.codegen(state); + } + for dec in self.decorators.iter() { + dec.codegen(state); + } + for l in &self.lines_after_decorators { + l.codegen(state); + } + state.add_indent(); + + if let Some(asy) = &self.asynchronous { + asy.codegen(state); + } + state.add_token("def"); + self.whitespace_after_def.codegen(state); + self.name.codegen(state); + self.whitespace_after_name.codegen(state); + state.add_token("("); + self.whitespace_before_params.codegen(state); + self.params.codegen(state); + state.add_token(")"); + + if let Some(ann) = &self.returns { + ann.codegen(state, "->"); + } + + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + } +} + +impl<'a> Inflate<'a> for FunctionDef<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.decorators = self.decorators.inflate(config)?; + let (asynchronous, leading_lines) = if let Some(asy) = self.async_tok.as_mut() { + let whitespace_after = + parse_parenthesizable_whitespace(config, &mut asy.whitespace_after.borrow_mut())?; + ( + Some(Asynchronous { whitespace_after }), + Some(parse_empty_lines( + config, + &mut asy.whitespace_before.borrow_mut(), + None, + )?), + ) + } else { + (None, None) + }; + + self.asynchronous = asynchronous; + let leading_lines = if let Some(ll) = leading_lines { + ll + } else { + parse_empty_lines( + config, + &mut (*self.def_tok).whitespace_before.borrow_mut(), + None, + )? 
+ }; + + self.leading_lines = leading_lines; + if let Some(dec) = self.decorators.first_mut() { + swap(&mut self.lines_after_decorators, &mut self.leading_lines); + swap(&mut dec.leading_lines, &mut self.leading_lines); + } + + self.whitespace_after_def = + parse_simple_whitespace(config, &mut (*self.def_tok).whitespace_after.borrow_mut())?; + + self.name = self.name.inflate(config)?; + self.whitespace_after_name = parse_simple_whitespace( + config, + &mut (*self.open_paren_tok).whitespace_before.borrow_mut(), + )?; + + self.whitespace_before_params = parse_parenthesizable_whitespace( + config, + &mut (*self.open_paren_tok).whitespace_after.borrow_mut(), + )?; + self.params = self.params.inflate(config)?; + adjust_parameters_trailing_whitespace(config, &mut self.params, &self.close_paren_tok)?; + + self.returns = self.returns.inflate(config)?; + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + + self.body = self.body.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, Eq, PartialEq, Clone, IntoPy)] +pub struct Decorator<'a> { + pub decorator: Expression<'a>, + pub leading_lines: Vec>, + pub whitespace_after_at: SimpleWhitespace<'a>, + pub trailing_whitespace: TrailingWhitespace<'a>, + + pub(crate) at_tok: TokenRef<'a>, + pub(crate) newline_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Decorator<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in self.leading_lines.iter() { + ll.codegen(state); + } + state.add_indent(); + state.add_token("@"); + self.whitespace_after_at.codegen(state); + self.decorator.codegen(state); + self.trailing_whitespace.codegen(state); + } +} + +impl<'a> Inflate<'a> for Decorator<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.at_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_after_at = + parse_simple_whitespace(config, &mut (*self.at_tok).whitespace_after.borrow_mut())?; + self.decorator = self.decorator.inflate(config)?; + self.trailing_whitespace = parse_trailing_whitespace( + config, + &mut (*self.newline_tok).whitespace_before.borrow_mut(), + )?; + Ok(self) + } +} + +impl<'a> pyo3::conversion::IntoPy for Box> { + fn into_py(self, py: pyo3::Python) -> pyo3::PyObject { + (*self).into_py(py) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct If<'a> { + /// The expression that, when evaluated, should give us a truthy value + pub test: Expression<'a>, + // The body of this compound statement. + pub body: Suite<'a>, + + /// An optional ``elif`` or ``else`` clause. ``If`` signifies an ``elif`` block. + pub orelse: Option>>, + + /// Sequence of empty lines appearing before this compound statement line. + pub leading_lines: Vec>, + + /// The whitespace appearing after the ``if`` keyword but before the test + /// expression. + pub whitespace_before_test: SimpleWhitespace<'a>, + + /// The whitespace appearing after the test expression but before the colon. + pub whitespace_after_test: SimpleWhitespace<'a>, + + /// Signifies if this instance represents an ``elif`` or an ``if`` block. 
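+    /// For example (illustrative), the ``elif y: ...`` clause of an ``if``
+    /// statement is stored in ``orelse`` as a nested ``If`` with ``is_elif``
+    /// set to ``true``, which is how codegen knows to emit ``elif`` rather
+    /// than ``if``.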
+ #[skip_py] + pub is_elif: bool, + + pub(crate) if_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for If<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for l in &self.leading_lines { + l.codegen(state); + } + state.add_indent(); + + state.add_token(if self.is_elif { "elif" } else { "if" }); + self.whitespace_before_test.codegen(state); + self.test.codegen(state); + self.whitespace_after_test.codegen(state); + state.add_token(":"); + self.body.codegen(state); + if let Some(orelse) = &self.orelse { + orelse.codegen(state) + } + } +} + +impl<'a> Inflate<'a> for If<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.if_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_before_test = + parse_simple_whitespace(config, &mut (*self.if_tok).whitespace_after.borrow_mut())?; + self.test = self.test.inflate(config)?; + self.whitespace_after_test = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + self.body = self.body.inflate(config)?; + self.orelse = self.orelse.inflate(config)?; + + Ok(self) + } +} + +#[allow(clippy::large_enum_variant)] +#[derive(Debug, PartialEq, Eq, Clone, Inflate, Codegen, IntoPy)] +pub enum OrElse<'a> { + Elif(If<'a>), + Else(Else<'a>), +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Else<'a> { + pub body: Suite<'a>, + /// Sequence of empty lines appearing before this compound statement line. + pub leading_lines: Vec>, + /// The whitespace appearing after the ``else`` keyword but before the colon. + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) else_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Else<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for l in &self.leading_lines { + l.codegen(state); + } + state.add_indent(); + + state.add_token("else"); + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + } +} + +impl<'a> Inflate<'a> for Else<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.else_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + self.body = self.body.inflate(config)?; + + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Annotation<'a> { + pub annotation: Expression<'a>, + pub whitespace_before_indicator: Option>, + pub whitespace_after_indicator: ParenthesizableWhitespace<'a>, + + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Annotation<'a> { + pub fn codegen(&self, state: &mut CodegenState<'a>, default_indicator: &'a str) { + if let Some(ws) = &self.whitespace_before_indicator { + ws.codegen(state); + } else if default_indicator == "->" { + state.add_token(" "); + } else { + panic!("Variable annotation but whitespace is None"); + } + + state.add_token(default_indicator); + self.whitespace_after_indicator.codegen(state); + self.annotation.codegen(state); + } +} + +impl<'a> Inflate<'a> for Annotation<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_before_indicator = Some(parse_parenthesizable_whitespace( + config, + &mut (*self.tok).whitespace_before.borrow_mut(), + )?); + self.whitespace_after_indicator = parse_parenthesizable_whitespace( + config, + &mut 
(*self.tok).whitespace_after.borrow_mut(), + )?; + self.annotation = self.annotation.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct AnnAssign<'a> { + pub target: AssignTargetExpression<'a>, + pub annotation: Annotation<'a>, + pub value: Option>, + pub equal: Option>, + pub semicolon: Option>, +} + +impl<'a> Codegen<'a> for AnnAssign<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.target.codegen(state); + self.annotation.codegen(state, ":"); + if let Some(eq) = &self.equal { + eq.codegen(state); + } else if self.value.is_some() { + state.add_token(" = "); + } + if let Some(value) = &self.value { + value.codegen(state); + } + + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for AnnAssign<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.target = self.target.inflate(config)?; + self.annotation = self.annotation.inflate(config)?; + self.value = self.value.inflate(config)?; + self.equal = self.equal.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> AnnAssign<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Return<'a> { + pub value: Option>, + pub whitespace_after_return: Option>, + pub semicolon: Option>, + + pub(crate) return_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Return<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("return"); + if let Some(ws) = &self.whitespace_after_return { + ws.codegen(state); + } else if self.value.is_some() { + state.add_token(" "); + } + + if let Some(val) = &self.value { + val.codegen(state); + } + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for Return<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + if self.value.is_some() { + self.whitespace_after_return = Some(parse_simple_whitespace( + config, + &mut (*self.return_tok).whitespace_after.borrow_mut(), + )?); + } else { + // otherwise space is owned by semicolon or small statement + // whitespace is not None to preserve a quirk of the pure python parser + self.whitespace_after_return = Some(Default::default()) + } + self.value = self.value.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Return<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Assert<'a> { + pub test: Expression<'a>, + pub msg: Option>, + pub comma: Option>, + pub whitespace_after_assert: SimpleWhitespace<'a>, + pub semicolon: Option>, + + pub(crate) assert_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Assert<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("assert"); + self.whitespace_after_assert.codegen(state); + self.test.codegen(state); + if let Some(comma) = &self.comma { + comma.codegen(state); + } else if self.msg.is_some() { + state.add_token(", "); + } + if let Some(msg) = &self.msg { + msg.codegen(state); + } + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} +impl<'a> Inflate<'a> for Assert<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_after_assert = parse_simple_whitespace( + config, + &mut (*self.assert_tok).whitespace_after.borrow_mut(), + )?; + + self.test = self.test.inflate(config)?; 
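+        // In e.g. `assert x, "oops"` (an illustrative example) the comma token's
+        // surrounding whitespace is consumed before the message expression,
+        // mirroring source order.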
+ self.comma = self.comma.inflate(config)?; + self.msg = self.msg.inflate(config)?; + + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Assert<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Raise<'a> { + pub exc: Option>, + pub cause: Option>, + pub whitespace_after_raise: Option>, + pub semicolon: Option>, + + pub(crate) raise_tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for Raise<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + if self.exc.is_some() { + self.whitespace_after_raise = Some(parse_simple_whitespace( + config, + &mut (*self.raise_tok).whitespace_after.borrow_mut(), + )?); + } + + self.exc = self.exc.inflate(config)?; + self.cause = self.cause.inflate(config)?; + if self.exc.is_none() { + if let Some(cause) = self.cause.as_mut() { + // in `raise from`, `raise` owns the shared whitespace + cause.whitespace_before_from = None; + } + } + self.semicolon = self.semicolon.inflate(config)?; + + Ok(self) + } +} + +impl<'a> Codegen<'a> for Raise<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("raise"); + if let Some(ws) = &self.whitespace_after_raise { + ws.codegen(state); + } else if self.exc.is_some() { + state.add_token(" "); + } + + if let Some(exc) = &self.exc { + exc.codegen(state); + } + + if let Some(cause) = &self.cause { + cause.codegen(state, " "); + } + + if let Some(semi) = &self.semicolon { + semi.codegen(state); + } + } +} + +impl<'a> Raise<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct NameItem<'a> { + pub name: Name<'a>, + pub comma: Option>, +} + +impl<'a> Inflate<'a> for NameItem<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.name = self.name.inflate(config)?; + self.comma = self.comma.inflate(config)?; + Ok(self) + } +} + +impl<'a> NameItem<'a> { + fn codegen(&self, state: &mut CodegenState<'a>, default_comma: bool) { + self.name.codegen(state); + if let Some(comma) = &self.comma { + comma.codegen(state); + } else if default_comma { + state.add_token(", "); + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Global<'a> { + pub names: Vec>, + pub whitespace_after_global: SimpleWhitespace<'a>, + pub semicolon: Option>, + + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for Global<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_after_global = + parse_simple_whitespace(config, &mut (*self.tok).whitespace_after.borrow_mut())?; + self.names = self.names.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Global<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("global"); + self.whitespace_after_global.codegen(state); + let len = self.names.len(); + for (i, name) in self.names.iter().enumerate() { + name.codegen(state, i + 1 != len); + } + + if let Some(semicolon) = &self.semicolon { + semicolon.codegen(state); + } + } +} + +impl<'a> Global<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Nonlocal<'a> { + pub names: Vec>, + pub whitespace_after_nonlocal: SimpleWhitespace<'a>, + pub semicolon: Option>, + + pub(crate) tok: TokenRef<'a>, +} + +impl<'a> Inflate<'a> for Nonlocal<'a> { + fn 
inflate(mut self, config: &Config<'a>) -> Result { + self.whitespace_after_nonlocal = + parse_simple_whitespace(config, &mut (*self.tok).whitespace_after.borrow_mut())?; + self.names = self.names.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for Nonlocal<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + state.add_token("nonlocal"); + self.whitespace_after_nonlocal.codegen(state); + let len = self.names.len(); + for (i, name) in self.names.iter().enumerate() { + name.codegen(state, i + 1 != len); + } + + if let Some(semicolon) = &self.semicolon { + semicolon.codegen(state); + } + } +} + +impl<'a> Nonlocal<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct For<'a> { + pub target: AssignTargetExpression<'a>, + pub iter: Expression<'a>, + pub body: Suite<'a>, + pub orelse: Option>, + pub asynchronous: Option>, + + pub leading_lines: Vec>, + pub whitespace_after_for: SimpleWhitespace<'a>, + pub whitespace_before_in: SimpleWhitespace<'a>, + pub whitespace_after_in: SimpleWhitespace<'a>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) async_tok: Option>, + pub(crate) for_tok: TokenRef<'a>, + pub(crate) in_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for For<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in &self.leading_lines { + ll.codegen(state); + } + state.add_indent(); + + if let Some(asy) = &self.asynchronous { + asy.codegen(state); + } + state.add_token("for"); + self.whitespace_after_for.codegen(state); + self.target.codegen(state); + self.whitespace_before_in.codegen(state); + state.add_token("in"); + self.whitespace_after_in.codegen(state); + self.iter.codegen(state); + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + if let Some(e) = &self.orelse { + e.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for For<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + let (asynchronous, leading_lines) = if let Some(asy) = self.async_tok.as_mut() { + let whitespace_after = + parse_parenthesizable_whitespace(config, &mut asy.whitespace_after.borrow_mut())?; + ( + Some(Asynchronous { whitespace_after }), + Some(parse_empty_lines( + config, + &mut asy.whitespace_before.borrow_mut(), + None, + )?), + ) + } else { + (None, None) + }; + self.leading_lines = if let Some(ll) = leading_lines { + ll + } else { + parse_empty_lines( + config, + &mut (*self.for_tok).whitespace_before.borrow_mut(), + None, + )? 
+ }; + self.asynchronous = asynchronous; + self.whitespace_after_for = + parse_simple_whitespace(config, &mut (*self.for_tok).whitespace_after.borrow_mut())?; + self.target = self.target.inflate(config)?; + self.whitespace_before_in = + parse_simple_whitespace(config, &mut (*self.in_tok).whitespace_before.borrow_mut())?; + self.whitespace_after_in = + parse_simple_whitespace(config, &mut (*self.in_tok).whitespace_after.borrow_mut())?; + self.iter = self.iter.inflate(config)?; + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + + self.body = self.body.inflate(config)?; + self.orelse = self.orelse.inflate(config)?; + + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct While<'a> { + pub test: Expression<'a>, + pub body: Suite<'a>, + pub orelse: Option>, + pub leading_lines: Vec>, + pub whitespace_after_while: SimpleWhitespace<'a>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) while_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for While<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in &self.leading_lines { + ll.codegen(state); + } + state.add_indent(); + + state.add_token("while"); + self.whitespace_after_while.codegen(state); + self.test.codegen(state); + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + if let Some(orelse) = &self.orelse { + orelse.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for While<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.while_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_after_while = + parse_simple_whitespace(config, &mut (*self.while_tok).whitespace_after.borrow_mut())?; + self.test = self.test.inflate(config)?; + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + self.body = self.body.inflate(config)?; + self.orelse = self.orelse.inflate(config)?; + + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ClassDef<'a> { + pub name: Name<'a>, + pub body: Suite<'a>, + pub bases: Vec>, + pub keywords: Vec>, + pub decorators: Vec>, + pub lpar: Option>, + pub rpar: Option>, + pub leading_lines: Vec>, + pub lines_after_decorators: Vec>, + pub whitespace_after_class: SimpleWhitespace<'a>, + pub whitespace_after_name: SimpleWhitespace<'a>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) class_tok: TokenRef<'a>, + pub(crate) parens_tok: Option<(TokenRef<'a>, TokenRef<'a>)>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for ClassDef<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in &self.leading_lines { + ll.codegen(state); + } + for dec in &self.decorators { + dec.codegen(state); + } + for lad in &self.lines_after_decorators { + lad.codegen(state); + } + state.add_indent(); + + state.add_token("class"); + self.whitespace_after_class.codegen(state); + self.name.codegen(state); + self.whitespace_after_name.codegen(state); + + let need_parens = !self.bases.is_empty() || !self.keywords.is_empty(); + + if let Some(lpar) = &self.lpar { + lpar.codegen(state); + } else if need_parens { + state.add_token("("); + } + let args = self.bases.iter().chain(self.keywords.iter()); + let len = self.bases.len() + self.keywords.len(); + for (i, arg) in args.enumerate() { + 
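+            // The boolean asks Arg::codegen to emit a default ", " separator
+            // whenever an argument lacks an explicit comma and is not the last.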
arg.codegen(state, i + 1 < len); + } + + if let Some(rpar) = &self.rpar { + rpar.codegen(state); + } else if need_parens { + state.add_token(")"); + } + + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + } +} + +impl<'a> Inflate<'a> for ClassDef<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.class_tok).whitespace_before.borrow_mut(), + None, + )?; + self.decorators = self.decorators.inflate(config)?; + if let Some(dec) = self.decorators.first_mut() { + swap(&mut self.lines_after_decorators, &mut self.leading_lines); + swap(&mut dec.leading_lines, &mut self.leading_lines); + } + + self.whitespace_after_class = + parse_simple_whitespace(config, &mut (*self.class_tok).whitespace_after.borrow_mut())?; + self.name = self.name.inflate(config)?; + + if let Some((lpar_tok, _)) = self.parens_tok.as_mut() { + self.whitespace_after_name = + parse_simple_whitespace(config, &mut lpar_tok.whitespace_before.borrow_mut())?; + self.lpar = self.lpar.map(|lpar| lpar.inflate(config)).transpose()?; + self.bases = self.bases.inflate(config)?; + self.keywords = self.keywords.inflate(config)?; + self.rpar = self.rpar.map(|lpar| lpar.inflate(config)).transpose()?; + // TODO: set whitespace_after_arg for last arg? + } + + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + self.body = self.body.inflate(config)?; + + Ok(self) + } +} + +impl<'a> ClassDef<'a> { + pub fn with_decorators(self, decorators: Vec>) -> Self { + Self { decorators, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Finally<'a> { + pub body: Suite<'a>, + pub leading_lines: Vec>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) finally_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for Finally<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in &self.leading_lines { + ll.codegen(state); + } + state.add_indent(); + + state.add_token("finally"); + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + } +} + +impl<'a> Inflate<'a> for Finally<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.finally_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + self.body = self.body.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct ExceptHandler<'a> { + pub body: Suite<'a>, + pub r#type: Option>, + pub name: Option>, + pub leading_lines: Vec>, + pub whitespace_after_except: SimpleWhitespace<'a>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) except_tok: TokenRef<'a>, + pub(crate) colon_tok: TokenRef<'a>, +} + +impl<'a> Codegen<'a> for ExceptHandler<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in &self.leading_lines { + ll.codegen(state); + } + state.add_indent(); + + state.add_token("except"); + self.whitespace_after_except.codegen(state); + if let Some(t) = &self.r#type { + t.codegen(state); + } + if let Some(n) = &self.name { + n.codegen(state); + } + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + } +} + +impl<'a> Inflate<'a> for ExceptHandler<'a> { + fn 
inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.except_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_after_except = parse_simple_whitespace( + config, + &mut (*self.except_tok).whitespace_after.borrow_mut(), + )?; + + self.r#type = self.r#type.inflate(config)?; + self.name = self.name.inflate(config)?; + if self.name.is_some() { + self.whitespace_before_colon = parse_simple_whitespace( + config, + &mut (*self.colon_tok).whitespace_before.borrow_mut(), + )?; + } + + self.body = self.body.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct Try<'a> { + pub body: Suite<'a>, + pub handlers: Vec>, + pub orelse: Option>, + pub finalbody: Option>, + pub leading_lines: Vec>, + pub whitespace_before_colon: SimpleWhitespace<'a>, + + pub(crate) try_tok: TokenRef<'a>, + // colon_tok unnecessary +} + +impl<'a> Codegen<'a> for Try<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + for ll in &self.leading_lines { + ll.codegen(state); + } + state.add_indent(); + state.add_token("try"); + self.whitespace_before_colon.codegen(state); + state.add_token(":"); + self.body.codegen(state); + for h in &self.handlers { + h.codegen(state); + } + if let Some(e) = &self.orelse { + e.codegen(state); + } + if let Some(f) = &self.finalbody { + f.codegen(state); + } + } +} + +impl<'a> Inflate<'a> for Try<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.leading_lines = parse_empty_lines( + config, + &mut (*self.try_tok).whitespace_before.borrow_mut(), + None, + )?; + self.whitespace_before_colon = + parse_simple_whitespace(config, &mut (*self.try_tok).whitespace_after.borrow_mut())?; + self.body = self.body.inflate(config)?; + self.handlers = self.handlers.inflate(config)?; + self.orelse = self.orelse.inflate(config)?; + self.finalbody = self.finalbody.inflate(config)?; + Ok(self) + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct AugAssign<'a> { + pub target: AssignTargetExpression<'a>, + pub operator: AugOp<'a>, + pub value: Expression<'a>, + pub semicolon: Option>, +} + +impl<'a> Inflate<'a> for AugAssign<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.target = self.target.inflate(config)?; + self.operator = self.operator.inflate(config)?; + self.value = self.value.inflate(config)?; + self.semicolon = self.semicolon.inflate(config)?; + Ok(self) + } +} + +impl<'a> Codegen<'a> for AugAssign<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.target.codegen(state); + self.operator.codegen(state); + self.value.codegen(state); + + if let Some(s) = &self.semicolon { + s.codegen(state); + } + } +} + +impl<'a> AugAssign<'a> { + pub fn with_semicolon(self, semicolon: Option>) -> Self { + Self { semicolon, ..self } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, IntoPy)] +pub struct WithItem<'a> { + pub item: Expression<'a>, + pub asname: Option>, + pub comma: Option>, +} + +impl<'a> Codegen<'a> for WithItem<'a> { + fn codegen(&self, state: &mut CodegenState<'a>) { + self.item.codegen(state); + if let Some(n) = &self.asname { + n.codegen(state); + } + if let Some(c) = &self.comma { + c.codegen(state); + } + } +} + +impl<'a> WithComma<'a> for WithItem<'a> { + fn with_comma(self, comma: Comma<'a>) -> Self { + Self { + comma: Some(comma), + ..self + } + } +} + +impl<'a> Inflate<'a> for WithItem<'a> { + fn inflate(mut self, config: &Config<'a>) -> Result { + self.item = self.item.inflate(config)?; + self.asname = 
self.asname.inflate(config)?;
+        self.comma = self.comma.inflate(config)?;
+        Ok(self)
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct With<'a> {
+    pub items: Vec<WithItem<'a>>,
+    pub body: Suite<'a>,
+    pub asynchronous: Option<Asynchronous<'a>>,
+    pub leading_lines: Vec<EmptyLine<'a>>,
+    pub whitespace_after_with: SimpleWhitespace<'a>,
+    pub whitespace_before_colon: SimpleWhitespace<'a>,
+
+    pub(crate) async_tok: Option<TokenRef<'a>>,
+    pub(crate) with_tok: TokenRef<'a>,
+    pub(crate) colon_tok: TokenRef<'a>,
+}
+
+impl<'a> Codegen<'a> for With<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        for ll in &self.leading_lines {
+            ll.codegen(state);
+        }
+        state.add_indent();
+
+        if let Some(asy) = &self.asynchronous {
+            asy.codegen(state);
+        }
+        state.add_token("with");
+        self.whitespace_after_with.codegen(state);
+        let len = self.items.len();
+        for (i, item) in self.items.iter().enumerate() {
+            item.codegen(state);
+            if item.comma.is_none() && i + 1 < len {
+                state.add_token(", ");
+            }
+        }
+        self.whitespace_before_colon.codegen(state);
+        state.add_token(":");
+        self.body.codegen(state);
+    }
+}
+
+impl<'a> Inflate<'a> for With<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        let (asynchronous, leading_lines) = if let Some(asy) = self.async_tok.as_mut() {
+            let whitespace_after =
+                parse_parenthesizable_whitespace(config, &mut asy.whitespace_after.borrow_mut())?;
+            (
+                Some(Asynchronous { whitespace_after }),
+                Some(parse_empty_lines(
+                    config,
+                    &mut asy.whitespace_before.borrow_mut(),
+                    None,
+                )?),
+            )
+        } else {
+            (None, None)
+        };
+
+        self.asynchronous = asynchronous;
+
+        self.leading_lines = if let Some(ll) = leading_lines {
+            ll
+        } else {
+            parse_empty_lines(
+                config,
+                &mut (*self.with_tok).whitespace_before.borrow_mut(),
+                None,
+            )?
+        };
+
+        self.whitespace_after_with =
+            parse_simple_whitespace(config, &mut (*self.with_tok).whitespace_after.borrow_mut())?;
+        self.items = self.items.inflate(config)?;
+        self.whitespace_before_colon = parse_simple_whitespace(
+            config,
+            &mut (*self.colon_tok).whitespace_before.borrow_mut(),
+        )?;
+        self.body = self.body.inflate(config)?;
+
+        Ok(self)
+    }
+}
+
+#[allow(clippy::large_enum_variant)]
+#[derive(Debug, PartialEq, Eq, Clone, Codegen, ParenthesizedNode, Inflate, IntoPy)]
+pub enum DelTargetExpression<'a> {
+    Name(Name<'a>),
+    Attribute(Attribute<'a>),
+    Tuple(Tuple<'a>),
+    List(List<'a>),
+    Subscript(Subscript<'a>),
+}
+
+impl<'a> std::convert::From<DelTargetExpression<'a>> for Expression<'a> {
+    fn from(d: DelTargetExpression<'a>) -> Self {
+        match d {
+            DelTargetExpression::Attribute(a) => Expression::Attribute(a),
+            DelTargetExpression::List(l) => Expression::List(l),
+            DelTargetExpression::Name(n) => Expression::Name(n),
+            DelTargetExpression::Subscript(s) => Expression::Subscript(s),
+            DelTargetExpression::Tuple(t) => Expression::Tuple(t),
+        }
+    }
+}
+impl<'a> std::convert::From<DelTargetExpression<'a>> for Element<'a> {
+    fn from(d: DelTargetExpression<'a>) -> Element {
+        Element::Simple {
+            value: d.into(),
+            comma: None,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, IntoPy)]
+pub struct Del<'a> {
+    pub target: DelTargetExpression<'a>,
+    pub whitespace_after_del: SimpleWhitespace<'a>,
+    pub semicolon: Option<Semicolon<'a>>,
+
+    pub(crate) tok: TokenRef<'a>,
+}
+
+impl<'a> Inflate<'a> for Del<'a> {
+    fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
+        self.whitespace_after_del =
+            parse_simple_whitespace(config, &mut (*self.tok).whitespace_after.borrow_mut())?;
+        self.target = self.target.inflate(config)?;
+        self.semicolon = self.semicolon.inflate(config)?;
+        Ok(self)
+    }
+}
+
+impl<'a> Codegen<'a> for Del<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        state.add_token("del");
+        self.whitespace_after_del.codegen(state);
+        self.target.codegen(state);
+        if let Some(semi) = &self.semicolon {
+            semi.codegen(state);
+        }
+    }
+}
+
+impl<'a> Del<'a> {
+    pub fn with_semicolon(self, semicolon: Option<Semicolon<'a>>) -> Self {
+        Self { semicolon, ..self }
+    }
+}
diff --git a/native/libcst/src/nodes/test_utils.rs b/native/libcst/src/nodes/test_utils.rs
new file mode 100644
index 000000000..6a462c8df
--- /dev/null
+++ b/native/libcst/src/nodes/test_utils.rs
@@ -0,0 +1,42 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use pyo3::prelude::*;
+
+py_import!("libcst._nodes.deep_equals", "deep_equals", get_deep_equals);
+
+pub fn repr_or_panic<T>(py: Python, value: T) -> String
+where
+    T: ToPyObject,
+{
+    value
+        .to_object(py)
+        .as_ref(py)
+        .repr()
+        .expect("failed to call repr")
+        .extract()
+        .expect("repr should've returned str")
+}
+
+pub fn py_assert_deep_equals<L, R>(py: Python, left: L, right: R)
+where
+    L: ToPyObject,
+    R: ToPyObject,
+{
+    let (left, right) = (left.to_object(py), right.to_object(py));
+    let equals = get_deep_equals(py)
+        .expect("failed to import deep_equals")
+        .call1((&left, &right))
+        .expect("failed to call deep_equals")
+        .extract::<bool>()
+        .expect("deep_equals should return a bool");
+    if !equals {
+        panic!(
+            "assertion failed: {} was not deeply equal to {}",
+            repr_or_panic(py, &left),
+            repr_or_panic(py, &right),
+        );
+    }
+}
diff --git a/native/libcst/src/nodes/traits.rs b/native/libcst/src/nodes/traits.rs
new file mode 100644
index 000000000..155ae17aa
--- /dev/null
+++ b/native/libcst/src/nodes/traits.rs
@@ -0,0 +1,62 @@
+use crate::{
+    tokenizer::whitespace_parser::{Config, WhitespaceError},
+    Codegen, CodegenState, Comma, EmptyLine, LeftParen, RightParen,
+};
+
+pub trait WithComma<'a> {
+    fn with_comma(self, comma: Comma<'a>) -> Self;
+}
+
+pub trait ParenthesizedNode<'a> {
+    fn lpar(&self) -> &Vec<LeftParen<'a>>;
+    fn rpar(&self) -> &Vec<RightParen<'a>>;
+
+    fn parenthesize<F>(&self, state: &mut CodegenState<'a>, f: F)
+    where
+        F: FnOnce(&mut CodegenState<'a>),
+    {
+        for lpar in self.lpar() {
+            lpar.codegen(state);
+        }
+        f(state);
+        for rpar in self.rpar() {
+            rpar.codegen(state);
+        }
+    }
+
+    fn with_parens(self, left: LeftParen<'a>, right: RightParen<'a>) -> Self;
+}
+
+pub trait WithLeadingLines<'a> {
+    fn leading_lines(&mut self) -> &mut Vec<EmptyLine<'a>>;
+}
+
+pub type Result<T> = std::result::Result<T, WhitespaceError>;
+
+pub trait Inflate<'a>
+where
+    Self: Sized,
+{
+    fn inflate(self, config: &Config<'a>) -> Result<Self>;
+}
+
+impl<'a, T: Inflate<'a>> Inflate<'a> for Option<T> {
+    fn inflate(self, config: &Config<'a>) -> Result<Self> {
+        self.map(|x| x.inflate(config)).transpose()
+    }
+}
+
+impl<'a, T: Inflate<'a> + ?Sized> Inflate<'a> for Box<T> {
+    fn inflate(self, config: &Config<'a>) -> Result<Self> {
+        match (*self).inflate(config) {
+            Ok(a) => Ok(Box::new(a)),
+            Err(e) => Err(e),
+        }
+    }
+}
+
+impl<'a, T: Inflate<'a>> Inflate<'a> for Vec<T> {
+    fn inflate(self, config: &Config<'a>) -> Result<Self> {
+        self.into_iter().map(|item| item.inflate(config)).collect()
+    }
+}
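
A minimal sketch of how the Inflate machinery above composes (illustrative only; MyNode is
hypothetical and not part of this diff). The blanket impls for Option<T> and Vec<T> are what let a
node's inflate() handle optional and repeated children as one-liners:

    struct MyNode<'a> {
        semicolon: Option<Semicolon<'a>>,
        body: Vec<Statement<'a>>,
    }

    impl<'a> Inflate<'a> for MyNode<'a> {
        fn inflate(mut self, config: &Config<'a>) -> Result<Self> {
            self.semicolon = self.semicolon.inflate(config)?; // via the Option<T> impl
            self.body = self.body.inflate(config)?;           // via the Vec<T> impl
            Ok(self)
        }
    }
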
diff --git a/native/libcst/src/nodes/whitespace.rs b/native/libcst/src/nodes/whitespace.rs
new file mode 100644
index 000000000..89f2800e2
--- /dev/null
+++ b/native/libcst/src/nodes/whitespace.rs
@@ -0,0 +1,167 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use libcst_derive::IntoPy;
+
+use super::{Codegen, CodegenState};
+
+#[derive(Debug, Eq, PartialEq, Default, Clone, IntoPy)]
+pub struct SimpleWhitespace<'a>(pub &'a str);
+
+impl<'a> Codegen<'a> for SimpleWhitespace<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        state.add_token(self.0);
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Clone, IntoPy)]
+pub struct Comment<'a>(pub &'a str);
+
+impl<'a> Default for Comment<'a> {
+    fn default() -> Self {
+        Self("#")
+    }
+}
+
+impl<'a> Codegen<'a> for Comment<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        state.add_token(self.0);
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Default, Clone, IntoPy)]
+pub struct Newline<'a>(pub Option<&'a str>, pub Fakeness);
+
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub enum Fakeness {
+    Fake,
+    Real,
+}
+
+impl Default for Fakeness {
+    fn default() -> Self {
+        Self::Real
+    }
+}
+
+impl<'a> Codegen<'a> for Newline<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        if let Fakeness::Fake = self.1 {
+            return;
+        }
+        if let Some(value) = self.0 {
+            state.add_token(value);
+        } else {
+            state.add_token(state.default_newline);
+        }
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Default, Clone, IntoPy)]
+pub struct TrailingWhitespace<'a> {
+    pub whitespace: SimpleWhitespace<'a>,
+    pub comment: Option<Comment<'a>>,
+    pub newline: Newline<'a>,
+}
+
+impl<'a> Codegen<'a> for TrailingWhitespace<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.whitespace.codegen(state);
+        if let Some(comment) = &self.comment {
+            comment.codegen(state);
+        }
+        self.newline.codegen(state);
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, IntoPy)]
+pub struct EmptyLine<'a> {
+    pub indent: bool,
+    pub whitespace: SimpleWhitespace<'a>,
+    pub comment: Option<Comment<'a>>,
+    pub newline: Newline<'a>,
+}
+
+impl<'a> Codegen<'a> for EmptyLine<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        if self.indent {
+            state.add_indent()
+        }
+        self.whitespace.codegen(state);
+        if let Some(comment) = &self.comment {
+            comment.codegen(state);
+        }
+        self.newline.codegen(state);
+    }
+}
+
+impl<'a> Default for EmptyLine<'a> {
+    fn default() -> Self {
+        Self {
+            indent: true,
+            whitespace: Default::default(),
+            comment: Default::default(),
+            newline: Default::default(),
+        }
+    }
+}
+
+impl<'a> EmptyLine<'a> {
+    pub fn new(
+        indent: bool,
+        whitespace: SimpleWhitespace<'a>,
+        comment: Option<Comment<'a>>,
+        newline: Newline<'a>,
+    ) -> Self {
+        Self {
+            indent,
+            whitespace,
+            comment,
+            newline,
+        }
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Default, Clone, IntoPy)]
+pub struct ParenthesizedWhitespace<'a> {
+    pub first_line: TrailingWhitespace<'a>,
+    pub empty_lines: Vec<EmptyLine<'a>>,
+    pub indent: bool,
+    pub last_line: SimpleWhitespace<'a>,
+}
+
+impl<'a> Codegen<'a> for ParenthesizedWhitespace<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        self.first_line.codegen(state);
+        for line in &self.empty_lines {
+            line.codegen(state);
+        }
+        if self.indent {
+            state.add_indent()
+        }
+        self.last_line.codegen(state);
+    }
+}
+
+#[derive(Debug, Eq, PartialEq, Clone, IntoPy)]
+pub enum ParenthesizableWhitespace<'a> {
+    SimpleWhitespace(SimpleWhitespace<'a>),
+    ParenthesizedWhitespace(ParenthesizedWhitespace<'a>),
+}
+
+impl<'a> Codegen<'a> for ParenthesizableWhitespace<'a> {
+    fn codegen(&self, state: &mut CodegenState<'a>) {
+        match self {
+            Self::SimpleWhitespace(w) => w.codegen(state),
+            Self::ParenthesizedWhitespace(w) => w.codegen(state),
+        }
+    }
+}
+
+impl<'a> Default for ParenthesizableWhitespace<'a> {
+    fn default() -> Self {
+        Self::SimpleWhitespace(SimpleWhitespace(""))
+    }
+}
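
For intuition, a sketch of the codegen behavior defined above (hedged: constructing a CodegenState
is elided here, and state.default_newline is assumed to be "\n"):

    let tw = TrailingWhitespace {
        whitespace: SimpleWhitespace("  "),
        comment: Some(Comment("# note")),
        newline: Newline(None, Fakeness::Real),
    };
    // tw.codegen(&mut state) appends "  # note" followed by "\n";
    // a Newline(_, Fakeness::Fake) would contribute no characters at all.
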
diff --git a/native/libcst/src/parser/errors.rs b/native/libcst/src/parser/errors.rs
new file mode 100644
index 000000000..641d96309
--- /dev/null
+++ b/native/libcst/src/parser/errors.rs
@@ -0,0 +1,78 @@
+use pyo3::types::{IntoPyDict, PyModule};
+use pyo3::{IntoPy, PyErr, PyErrArguments, Python};
+
+use crate::parser::grammar::TokVec;
+use crate::tokenizer::whitespace_parser::WhitespaceError;
+use crate::tokenizer::TokError;
+use peg::Parse;
+use thiserror::Error;
+
+#[allow(clippy::enum_variant_names)]
+#[derive(Debug, Error, PartialEq, Eq)]
+pub enum ParserError<'a> {
+    #[error("tokenizer error: {0}")]
+    TokenizerError(TokError<'a>, &'a str),
+    #[error("parser error: {0}")]
+    ParserError(
+        peg::error::ParseError<<TokVec<'a> as Parse>::PositionRepr>,
+        &'a str,
+    ),
+    #[error(transparent)]
+    WhitespaceError(#[from] WhitespaceError),
+    #[error("invalid operator")]
+    OperatorError,
+}
+
+impl<'a> From<ParserError<'a>> for PyErr {
+    fn from(e: ParserError) -> Self {
+        Python::with_gil(|py| {
+            let lines = match &e {
+                ParserError::TokenizerError(_, text) | ParserError::ParserError(_, text) => {
+                    text.lines().collect::<Vec<_>>()
+                }
+                _ => vec![""],
+            };
+            let (line, col) = match &e {
+                ParserError::ParserError(err, ..) => {
+                    (err.location.start_pos.line, err.location.start_pos.column)
+                }
+                _ => (0, 0),
+            };
+            let kwargs = [
+                ("message", e.to_string().into_py(py)),
+                ("lines", lines.into_py(py)),
+                ("raw_line", line.into_py(py)),
+                ("raw_column", col.into_py(py)),
+            ]
+            .into_py_dict(py);
+            let libcst = PyModule::import(py, "libcst").expect("libcst cannot be imported");
+            PyErr::from_instance(
+                libcst
+                    .getattr("ParserSyntaxError")
+                    .expect("ParserSyntaxError not found")
+                    .call((), Some(kwargs))
+                    .expect("failed to instantiate"),
+            )
+        })
+    }
+}
+
+struct Details {
+    message: String,
+    lines: Vec<String>,
+    raw_line: u32,
+    raw_column: u32,
+}
+
+impl PyErrArguments for Details {
+    fn arguments(self, py: pyo3::Python) -> pyo3::PyObject {
+        [
+            ("message", self.message.into_py(py)),
+            ("lines", self.lines.into_py(py)),
+            ("raw_line", self.raw_line.into_py(py)),
+            ("raw_column", self.raw_column.into_py(py)),
+        ]
+        .into_py_dict(py)
+        .into_py(py)
+    }
+}
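
Because of the From impl above, pyo3-facing code can use `?` to surface a ParserError as a Python
libcst.ParserSyntaxError carrying message, lines, raw_line and raw_column. A hypothetical wrapper
(parse() stands in for the crate's tokenize-then-peg entry point; the names are illustrative, not
from this diff):

    #[pyfunction]
    fn parse_module_str(py: Python, source: &str) -> PyResult<PyObject> {
        // parse: &str -> Result<Module, ParserError>
        let module = parse(source).map_err(PyErr::from)?;
        Ok(module.into_py(py))
    }
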
diff --git a/native/libcst/src/parser/grammar.rs b/native/libcst/src/parser/grammar.rs
new file mode 100644
index 000000000..f79594f21
--- /dev/null
+++ b/native/libcst/src/parser/grammar.rs
@@ -0,0 +1,2993 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use std::rc::Rc;
+
+use crate::nodes::*;
+use crate::parser::ParserError;
+use crate::tokenizer::{TokType, Token};
+use peg::str::LineCol;
+use peg::{parser, Parse, ParseElem, RuleResult};
+use TokType::{
+    Async, Await as AWAIT, Dedent, EndMarker, FStringEnd, FStringStart, FStringString, Indent,
+    Name as NameTok, Newline as NL, Number, String as STRING,
+};
+
+pub type Result<'a, T> = std::result::Result<T, ParserError<'a>>;
+
+#[derive(Debug)]
+pub struct TokVec<'a>(Vec<Rc<Token<'a>>>);
+
+impl<'a> std::convert::From<Vec<Token<'a>>> for TokVec<'a> {
+    fn from(vec: Vec<Token<'a>>) -> Self {
+        TokVec(vec.into_iter().map(Rc::new).collect())
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct ParseLoc {
+    pub start_pos: LineCol,
+    pub end_pos: LineCol,
+}
+
+impl std::fmt::Display for ParseLoc {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.start_pos.fmt(f)
+    }
+}
+
+impl<'a> Parse for TokVec<'a> {
+    type PositionRepr = ParseLoc;
+
+    fn start(&self) -> usize {
+        0
+    }
+
+    fn is_eof(&self, pos: usize) -> bool {
+        pos >= self.0.len()
+    }
+
+    fn position_repr(&self, pos: usize) -> Self::PositionRepr {
+        let tok = self.0.get(pos).unwrap_or_else(|| self.0.last().unwrap());
+        ParseLoc {
+            start_pos: LineCol {
+                line: tok.start_pos.line_number(),
+                column: tok.start_pos.char_column_number(),
+                offset: tok.start_pos.byte_idx(),
+            },
+            end_pos: LineCol {
+                line: tok.end_pos.line_number(),
+                column: tok.end_pos.char_column_number(),
+                offset: tok.end_pos.byte_idx(),
+            },
+        }
+    }
+}
+
+type TokenRef<'a> = Rc<Token<'a>>;
+
+impl<'a> ParseElem for TokVec<'a> {
+    type Element = TokenRef<'a>;
+
+    fn parse_elem(&self, pos: usize) -> RuleResult<Self::Element> {
+        match self.0.get(pos) {
+            Some(tok) => RuleResult::Matched(pos + 1, tok.clone()),
+            None => RuleResult::Failed,
+        }
+    }
+}
+
+parser! {
+    pub grammar python<'a>(input: &'a str) for TokVec<'a> {
+
+        // Starting Rules
+
+        pub rule file(encoding: Option<&str>) -> Module<'a>
+            = traced(<_file(encoding.unwrap_or("utf-8"))>)
+
+        pub rule expression_input() -> Expression<'a>
+            = traced(<e:star_expressions() tok(NL, "NEWLINE") tok(EndMarker, "EOF") {e}>)
+
+        pub rule statement_input() -> Statement<'a>
+            = traced(<s:statement() tok(EndMarker, "EOF") {s}>)
+
+        rule _file(encoding: &str) -> Module<'a>
+            = s:statements()?
eof:tok(EndMarker, "EOF") { + make_module(s.unwrap_or_default(), eof, encoding) + } + + // General statements + + rule statements() -> Vec> + = statement()+ + + rule statement() -> Statement<'a> + = c:compound_stmt() { Statement::Compound(c) } + / s:simple_stmts() { + Statement::Simple(make_simple_statement_line(s)) + } + + rule simple_stmts() -> SimpleStatementParts<'a> + = first_tok:&_ stmts:separated_trailer(, ) nl:tok(NL, "NEWLINE") { + SimpleStatementParts { + first_tok, + first_statement: stmts.0, + rest: stmts.1, + last_semi: stmts.2, + nl, + } + } + + #[cache] + rule simple_stmt() -> SmallStatement<'a> + = assignment() + / e:star_expressions() { SmallStatement::Expr(Expr { value: e, semicolon: None }) } + / &lit("return") s:return_stmt() { SmallStatement::Return(s) } + // this is expanded from the original grammar's import_stmt rule + / &lit("import") i:import_name() { SmallStatement::Import(i) } + / &lit("from") i:import_from() { SmallStatement::ImportFrom(i) } + / &lit("raise") r:raise_stmt() { SmallStatement::Raise(r) } + / lit("pass") { SmallStatement::Pass(Pass { semicolon: None }) } + / &lit("del") s:del_stmt() { SmallStatement::Del(s) } + / &lit("yield") s:yield_stmt() { SmallStatement::Expr(Expr { value: s, semicolon: None }) } + / &lit("assert") s:assert_stmt() {SmallStatement::Assert(s)} + / lit("break") { SmallStatement::Break(Break { semicolon: None })} + / lit("continue") { SmallStatement::Continue(Continue { semicolon: None })} + / &lit("global") s:global_stmt() {SmallStatement::Global(s)} + / &lit("nonlocal") s:nonlocal_stmt() {SmallStatement::Nonlocal(s)} + + + rule compound_stmt() -> CompoundStatement<'a> + = &(lit("def") / lit("@") / tok(Async, "ASYNC")) f:function_def() { + CompoundStatement::FunctionDef(f) + } + / &lit("if") f:if_stmt() { CompoundStatement::If(f) } + / &(lit("class") / lit("@")) c:class_def() { CompoundStatement::ClassDef(c) } + / &(lit("with") / tok(Async, "ASYNC")) w:with_stmt() { CompoundStatement::With(w) } + / &(lit("for") / tok(Async, "ASYNC")) f:for_stmt() { CompoundStatement::For(f) } + / &lit("try") t:try_stmt() { CompoundStatement::Try(t) } + / &lit("while") w:while_stmt() { CompoundStatement::While(w) } + + // Simple statements + + rule assignment() -> SmallStatement<'a> + = a:name() col:lit(":") ann:expression() + rhs:(eq:lit("=") d:annotated_rhs() {(eq, d)})? { + SmallStatement::AnnAssign( + make_ann_assignment(AssignTargetExpression::Name(a), col, ann, rhs)) + } + // TODO: there's an extra '(' single_target ')' clause here in upstream + / a:single_subscript_attribute_target() col:lit(":") ann:expression() + rhs:(eq:lit("=") d:annotated_rhs() {(eq, d)})? { + SmallStatement::AnnAssign(make_ann_assignment(a, col, ann, rhs)) + } + / lhs:(t:star_targets() eq:lit("=") {(t, eq)})+ rhs:(yield_expr() / star_expressions()) !lit("=") { + SmallStatement::Assign(make_assignment(lhs, rhs)) + } + / t:single_target() op:augassign() rhs:(yield_expr() / star_expressions()) { + SmallStatement::AugAssign(make_aug_assign(t, op, rhs)) + } + + rule annotated_rhs() -> Expression<'a> + = yield_expr() / star_expressions() + + rule augassign() -> AugOp<'a> + = &(lit("+=") + / lit("-=") + / lit("*=") + / lit("@=") + / lit("/=") + / lit("%=") + / lit("&=") + / lit("|=") + / lit("^=") + / lit("<<=") + / lit(">>=") + / lit("**=") + / lit("//=")) tok:_ {? + make_aug_op(tok).map_err(|_| "aug_op") + } + + rule return_stmt() -> Return<'a> + = kw:lit("return") a:star_expressions()? 
{ + make_return(kw, a) + } + + rule raise_stmt() -> Raise<'a> + = kw:lit("raise") exc:expression() + rest:(f:lit("from") cau:expression() {(f, cau)})? { + make_raise(kw, Some(exc), rest) + } + / kw:lit("raise") { + make_raise(kw, None, None) + } + + rule global_stmt() -> Global<'a> + = kw:lit("global") init:(n:name() c:comma() {(n, c)})* last:name() { + make_global(kw, init, last) + } + + rule nonlocal_stmt() -> Nonlocal<'a> + = kw:lit("nonlocal") init:(n:name() c:comma() {(n, c)})* last:name() { + make_nonlocal(kw, init, last) + } + + rule del_stmt() -> Del<'a> + = kw:lit("del") t:del_target() &(lit(";") / tok(NL, "NEWLINE")) { + make_del(kw, t) + } + / kw:lit("del") t:del_targets() &(lit(";") / tok(NL, "NEWLINE")) { + make_del(kw, make_del_tuple(None, t, None)) + } + + rule yield_stmt() -> Expression<'a> + = yield_expr() + + rule assert_stmt() -> Assert<'a> + = kw:lit("assert") test:expression() rest:(c:comma() msg:expression() {(c, msg)})? { + make_assert(kw, test, rest) + } + + // Import statements + + rule import_name() -> Import<'a> + = kw:lit("import") a:dotted_as_names() { + make_import(kw, a) + } + + rule import_from() -> ImportFrom<'a> + = from:lit("from") dots:dots()? m:dotted_name() + import:lit("import") als:import_from_targets() { + make_import_from(from, dots.unwrap_or_default(), Some(m), import, als) + } + / from:lit("from") dots:dots() + import:lit("import") als:import_from_targets() { + make_import_from(from, dots, None, import, als) + } + + rule import_from_targets() -> ParenthesizedImportNames<'a> + = lpar:lpar() als:import_from_as_names() c:comma()? rpar:rpar() { + let mut als = als; + if let (comma@Some(_), Some(mut last)) = (c, als.last_mut()) { + last.comma = comma; + } + (Some(lpar), ImportNames::Aliases(als), Some(rpar)) + } + / als:import_from_as_names() !lit(",") { (None, ImportNames::Aliases(als), None)} + / star:lit("*") { (None, ImportNames::Star(ImportStar {}), None) } + + rule import_from_as_names() -> Vec> + = items:separated(, ) { + make_import_from_as_names(items.0, items.1) + } + + rule import_from_as_name() -> ImportAlias<'a> + = n:name() asname:(kw:lit("as") z:name() {(kw, z)})? { + make_import_alias(NameOrAttribute::N(n), asname) + } + + rule dotted_as_names() -> Vec> + = init:(d:dotted_as_name() c:comma() {d.with_comma(c)})* + last:dotted_as_name() { + concat(init, vec![last]) + } + + rule dotted_as_name() -> ImportAlias<'a> + = n:dotted_name() asname:(kw:lit("as") z:name() {(kw, z)})? { + make_import_alias(n, asname) + } + + // TODO: why does this diverge from CPython? + rule dotted_name() -> NameOrAttribute<'a> + = first:name() tail:(dot:lit(".") n:name() {(dot, n)})* { + make_name_or_attr(first, tail) + } + + // Compound statements + + // Common elements + + #[cache] + rule block() -> Suite<'a> + = n:tok(NL, "NEWLINE") ind:tok(Indent, "INDENT") s:statements() ded:tok(Dedent, "DEDENT") { + make_indented_block(n, ind, s, ded) + } + / s:simple_stmts() { + make_simple_statement_suite(s) + } + + rule decorators() -> Vec> + = (at:lit("@") e:named_expression() nl:tok(NL, "NEWLINE") { + make_decorator(at, e, nl) + } )+ + + // Class definitions + + rule class_def() -> ClassDef<'a> + = d:decorators() c:class_def_raw() { c.with_decorators(d) } + / class_def_raw() + + rule class_def_raw() -> ClassDef<'a> + = kw:lit("class") n:name() arg:(l:lpar() a:arguments()? r:rpar() {(l, a, r)})? + col:lit(":") b:block() {? 
+ make_class_def(kw, n, arg, col, b) + } + + // Function definitions + + rule function_def() -> FunctionDef<'a> + = d:decorators() f:function_def_raw() {f.with_decorators(d)} + / function_def_raw() + + rule _returns() -> Annotation<'a> + = l:lit("->") e:expression() { + make_annotation(l, e) + } + + rule function_def_raw() -> FunctionDef<'a> + = def:lit("def") n:name() op:lit("(") params:params()? + cp:lit(")") ty:_returns()? c:lit(":") b:block() { + make_function_def(None, def, n, op, params, cp, ty, c, b) + } + / asy:tok(Async, "ASYNC") def:lit("def") n:name() op:lit("(") params:params()? + cp:lit(")") ty:_returns()? c:lit(":") b:block() { + make_function_def(Some(asy), def, n, op, params, cp, ty, c, b) + } + + // Function parameters + + rule params() -> Parameters<'a> + = parameters() + + rule parameters() -> Parameters<'a> + = a:slash_no_default() b:param_no_default()* c:param_with_default()* d:star_etc()? { + make_parameters(Some(a), concat(b, c), d) + } + / a:slash_with_default() b:param_with_default()* d:star_etc()? { + make_parameters(Some(a), b, d) + } + / a:param_no_default()+ b:param_with_default()* d:star_etc()? { + make_parameters(None, concat(a, b), d) + } + / a:param_with_default()+ d:star_etc()? { + make_parameters(None, a, d) + } + / d:star_etc() { + make_parameters(None, vec![], Some(d)) + } + + rule slash_no_default() -> (Vec>, ParamSlash<'a>) + = a:param_no_default()+ slash:lit("/") com:comma() { + (a, ParamSlash { comma: Some(com)}) + } + / a:param_no_default()+ slash:lit("/") &lit(")") { + (a, ParamSlash { comma: None }) + } + + rule slash_with_default() -> (Vec>, ParamSlash<'a>) + = a:param_no_default()* b:param_with_default()+ slash:lit("/") c:comma() { + (concat(a, b), ParamSlash { comma: Some(c) }) + } + / a:param_no_default()* b:param_with_default()+ slash:lit("/") &lit(")") { + (concat(a, b), ParamSlash { comma: None }) + } + + rule star_etc() -> StarEtc<'a> + = star:lit("*") a:param_no_default() b:param_maybe_default()* kw:kwds()? { + StarEtc(Some(StarArg::Param(Box::new( + add_param_star(a, star)))), b, kw) + } + / lit("*") c:comma() b:param_maybe_default()+ kw:kwds()? { + StarEtc(Some(StarArg::Star(ParamStar {comma:c })), b, kw) + } + / kw:kwds() { StarEtc(None, vec![], Some(kw)) } + + rule kwds() -> Param<'a> + = star:lit("**") a:param_no_default() { + add_param_star(a, star) + } + + rule param_no_default() -> Param<'a> + = a:param() c:lit(",") { add_param_default(a, None, Some(c)) } + / a:param() &lit(")") {a} + + rule param_with_default() -> Param<'a> + = a:param() def:default() c:lit(",") { + add_param_default(a, Some(def), Some(c)) + } + / a:param() def:default() &lit(")") { + add_param_default(a, Some(def), None) + } + + rule param_maybe_default() -> Param<'a> + = a:param() def:default()? c:lit(",") { + add_param_default(a, def, Some(c)) + } + / a:param() def:default()? &lit(")") { + add_param_default(a, def, None) + } + + rule param() -> Param<'a> + = n:name() a:annotation()? { + Param {name: n, annotation: a, ..Default::default() } + } + + rule annotation() -> Annotation<'a> + = col:lit(":") e:expression() { + make_annotation(col, e) + } + + rule default() -> (AssignEqual<'a>, Expression<'a>) + = eq:lit("=") ex:expression() { + (make_assign_equal(eq), ex) + } + + // If statement + + rule if_stmt() -> If<'a> + = i:lit("if") a:named_expression() col:lit(":") b:block() elif:elif_stmt() { + make_if(i, a, col, b, Some(OrElse::Elif(elif)), false) + } + / i:lit("if") a:named_expression() col:lit(":") b:block() el:else_block()? 
{ + make_if(i, a, col, b, el.map(OrElse::Else), false) + } + + rule elif_stmt() -> If<'a> + = i:lit("elif") a:named_expression() col:lit(":") b:block() elif:elif_stmt() { + make_if(i, a, col, b, Some(OrElse::Elif(elif)), true) + } + / i:lit("elif") a:named_expression() col:lit(":") b:block() el:else_block()? { + make_if(i, a, col, b, el.map(OrElse::Else), true) + } + + rule else_block() -> Else<'a> + = el:lit("else") col:lit(":") b:block() { + make_else(el, col, b) + } + + // While statement + + rule while_stmt() -> While<'a> + = kw:lit("while") test:named_expression() col:lit(":") b:block() el:else_block()? { + make_while(kw, test, col, b, el) + } + + // For statement + + rule for_stmt() -> For<'a> + = f:lit("for") t:star_targets() i:lit("in") it:star_expressions() + c:lit(":") b:block() el:else_block()? { + make_for(None, f, t, i, it, c, b, el) + } + / asy:tok(Async, "ASYNC") f:lit("for") t:star_targets() i:lit("in") + it:star_expressions() + c:lit(":") b:block() el:else_block()? { + make_for(Some(asy), f, t, i, it, c, b, el) + } + + // With statement + + rule with_stmt() -> With<'a> + = kw:lit("with") items:separated(, ) + col:lit(":") b:block() { + make_with(None, kw, comma_separate(items.0, items.1, None), col, b) + } + / asy:tok(Async, "ASYNC") kw:lit("with") items:separated(, ) + col:lit(":") b:block() { + make_with(Some(asy), kw, comma_separate(items.0, items.1, None), col, b) + } + + rule with_item() -> WithItem<'a> + = e:expression() a:lit("as") t:star_target() &(lit(",") / lit(":")) { + make_with_item(e, Some(a), Some(t)) + } + / e:expression() { + make_with_item(e, None, None) + } + + // Try statement + + rule try_stmt() -> Try<'a> + = kw:lit("try") lit(":") b:block() f:finally_block() { + make_try(kw, b, vec![], None, Some(f)) + } + / kw:lit("try") lit(":") b:block() ex:except_block()+ el:else_block()? + f:finally_block()? { + make_try(kw, b, ex, el, f) + } + + // Except statement + + rule except_block() -> ExceptHandler<'a> + = kw:lit("except") e:expression() a:(k:lit("as") n:name() {(k, n)})? + col:lit(":") b:block() { + make_except(kw, Some(e), a, col, b) + } + / kw:lit("except") col:lit(":") b:block() { + make_except(kw, None, None, col, b) + } + + rule finally_block() -> Finally<'a> + = kw:lit("finally") col:lit(":") b:block() { + make_finally(kw, col, b) + } + + + // Expressions + + #[cache] + rule expression() -> Expression<'a> + = _conditional_expression() + / lambdef() + + rule _conditional_expression() -> Expression<'a> + = body:disjunction() i:lit("if") test:disjunction() e:lit("else") oe:expression() { + Expression::IfExp(make_ifexp(body, i, test, e, oe)) + } + / disjunction() + + rule yield_expr() -> Expression<'a> + = y:lit("yield") f:lit("from") a:expression() { + Expression::Yield(make_yield(y, Some(f), Some(a))) + } + / y:lit("yield") a:star_expressions()? { + Expression::Yield(make_yield(y, None, a)) + } + + rule star_expressions() -> Expression<'a> + = first:star_expression() + rest:(comma:comma() e:star_expression() { (comma, expr_to_element(e)) })+ + comma:comma()? 
{ + Expression::Tuple(make_tuple(expr_to_element(first), rest, comma, None, None)) + } + / e:star_expression() comma:comma() { + Expression::Tuple(make_tuple(expr_to_element(e), vec![], Some(comma), None, None)) + } + / star_expression() + + #[cache] + rule star_expression() -> Expression<'a> + = star:lit("*") e:bitwise_or() { + Expression::StarredElement(make_starred_element(star, expr_to_element(e))) + } + / expression() + + rule star_named_expressions() -> Vec> + = exps:separated_trailer(, ) { + comma_separate(exps.0, exps.1, exps.2) + } + + rule star_named_expression() -> Element<'a> + = star:lit("*") e:bitwise_or() { + Element::Starred(make_starred_element(star, expr_to_element(e))) + } + / e:named_expression() { expr_to_element(e) } + + rule named_expression() -> Expression<'a> + = a:name() op:lit(":=") b:expression() { + Expression::NamedExpr(make_named_expr(a, op, b)) + } + / e:expression() !lit(":=") { e } + + #[cache] + rule disjunction() -> Expression<'a> + = a:conjunction() b:(or:lit("or") inner:conjunction() { (or, inner) })+ {? + make_boolean_op(a, b).map_err(|e| "expected disjunction") + } + / conjunction() + + #[cache] + rule conjunction() -> Expression<'a> + = a:inversion() b:(and:lit("and") inner:inversion() { (and, inner) })+ {? + make_boolean_op(a, b).map_err(|e| "expected conjunction") + } + / inversion() + + #[cache] + rule inversion() -> Expression<'a> + = not:lit("not") a:inversion() {? + make_unary_op(not, a).map_err(|e| "expected inversion") + } + / comparison() + + // Comparison operators + + #[cache] + rule comparison() -> Expression<'a> + = a:bitwise_or() b:compare_op_bitwise_or_pair()+ { make_comparison(a, b) } + / bitwise_or() + + // This implementation diverges slightly from CPython (3.9) to avoid bloating + // the parser cache and increase readability. + #[cache] + rule compare_op_bitwise_or_pair() -> (CompOp<'a>, Expression<'a>) + = _op_bitwise_or("==") + / _op_bitwise_or("!=") // TODO: support barry_as_flufl + / _op_bitwise_or("<=") + / _op_bitwise_or("<") + / _op_bitwise_or(">=") + / _op_bitwise_or(">") + / _op_bitwise_or2("not", "in") + / _op_bitwise_or("in") + / _op_bitwise_or2("is", "not") + / _op_bitwise_or("is") + + rule _op_bitwise_or(o: &'static str) -> (CompOp<'a>, Expression<'a>) + = op:lit(o) e:bitwise_or() {? + make_comparison_operator(op) + .map(|op| (op, e)) + .map_err(|_| "comparison") + } + + rule _op_bitwise_or2(first: &'static str, second: &'static str) -> (CompOp<'a>, Expression<'a>) + = f:lit(first) s:lit(second) e:bitwise_or() {? + make_comparison_operator_2(f, s) + .map(|op| (op, e)) + .map_err(|_| "comparison") + } + + #[cache_left_rec] + rule bitwise_or() -> Expression<'a> + = a:bitwise_or() op:lit("|") b:bitwise_xor() {? + make_binary_op(a, op, b).map_err(|e| "expected bitwise_or") + } + / bitwise_xor() + + #[cache_left_rec] + rule bitwise_xor() -> Expression<'a> + = a:bitwise_xor() op:lit("^") b:bitwise_and() {? + make_binary_op(a, op, b).map_err(|e| "expected bitwise_xor") + } + / bitwise_and() + + #[cache_left_rec] + rule bitwise_and() -> Expression<'a> + = a:bitwise_and() op:lit("&") b:shift_expr() {? + make_binary_op(a, op, b).map_err(|e| "expected bitwise_and") + } + / shift_expr() + + #[cache_left_rec] + rule shift_expr() -> Expression<'a> + = a:shift_expr() op:lit("<<") b:sum() {? + make_binary_op(a, op, b).map_err(|e| "expected shift_expr") + } + / a:shift_expr() op:lit(">>") b:sum() {? 
+ make_binary_op(a, op, b).map_err(|e| "expected shift_expr") + } + / sum() + + #[cache_left_rec] + rule sum() -> Expression<'a> + = a:sum() op:lit("+") b:term() {? + make_binary_op(a, op, b).map_err(|e| "expected sum") + } + / a:sum() op:lit("-") b:term() {? + make_binary_op(a, op, b).map_err(|e| "expected sum") + } + / term() + + #[cache_left_rec] + rule term() -> Expression<'a> + = a:term() op:lit("*") b:factor() {? + make_binary_op(a, op, b).map_err(|e| "expected term") + } + / a:term() op:lit("/") b:factor() {? + make_binary_op(a, op, b).map_err(|e| "expected term") + } + / a:term() op:lit("//") b:factor() {? + make_binary_op(a, op, b).map_err(|e| "expected term") + } + / a:term() op:lit("%") b:factor() {? + make_binary_op(a, op, b).map_err(|e| "expected term") + } + / a:term() op:lit("@") b:factor() {? + make_binary_op(a, op, b).map_err(|e| "expected term") + } + / factor() + + #[cache] + rule factor() -> Expression<'a> + = op:lit("+") a:factor() {? + make_unary_op(op, a).map_err(|e| "expected factor") + } + / op:lit("-") a:factor() {? + make_unary_op(op, a).map_err(|e| "expected factor") + } + / op:lit("~") a:factor() {? + make_unary_op(op, a).map_err(|e| "expected factor") + } + / power() + + rule power() -> Expression<'a> + = a:await_primary() op:lit("**") b:factor() {? + make_binary_op(a, op, b).map_err(|e| "expected power") + } + / await_primary() + + // Primary elements + + rule await_primary() -> Expression<'a> + = aw:tok(AWAIT, "AWAIT") e:primary() { + Expression::Await(make_await(aw, e)) + } + / primary() + + #[cache_left_rec] + rule primary() -> Expression<'a> + = v:primary() dot:lit(".") attr:name() { + Expression::Attribute(make_attribute(v, dot, attr)) + } + / a:primary() b:genexp() { + Expression::Call(make_genexp_call(a, b)) + } + / f:primary() lpar:lit("(") arg:arguments()? rpar:lit(")") { + Expression::Call(make_call(f, lpar, arg.unwrap_or_default(), rpar)) + } + / v:primary() lbrak:lbrak() s:slices() rbrak:rbrak() { + Expression::Subscript(make_subscript(v, lbrak, s, rbrak)) + } + / atom() + + rule slices() -> Vec> + = s:slice() !lit(",") { vec![SubscriptElement { slice: s, comma: None }] } + / slices:separated_trailer(, ) { + make_slices(slices.0, slices.1, slices.2) + } + + rule slice() -> BaseSlice<'a> + = l:expression()? col:lit(":") u:expression()? + rest:(c:lit(":") s:expression()? {(c, s)})? { + make_slice(l, col, u, rest) + } + / v:expression() { make_index(v) } + + rule atom() -> Expression<'a> + = n:name() { Expression::Name(n) } + / n:lit("True") { Expression::Name(make_name(n)) } + / n:lit("False") { Expression::Name(make_name(n)) } + / n:lit("None") { Expression::Name(make_name(n)) } + / &(tok(STRING, "") / tok(FStringStart, "")) s:strings() {s.into()} + / n:tok(Number, "NUMBER") { make_number(n) } + / &lit("(") e:(tuple() / group() / (g:genexp() {Expression::GeneratorExp(g)})) {e} + / &lit("[") e:(list() / listcomp()) {e} + / &lit("{") e:(dict() / set() / dictcomp() / setcomp()) {e} + / lit("...") { Expression::Ellipsis(Ellipsis {lpar: vec![], rpar: vec![]})} + + rule group() -> Expression<'a> + = lpar:lpar() e:(yield_expr() / named_expression()) rpar:rpar() { + e.with_parens(lpar, rpar) + } + + // Lambda functions + + rule lambdef() -> Expression<'a> + = kw:lit("lambda") p:lambda_params()? c:lit(":") b:expression() { + Expression::Lambda(make_lambda(kw, p.unwrap_or_default(), c, b)) + } + + rule lambda_params() -> Parameters<'a> + = lambda_parameters() + + // lambda_parameters etc. 
duplicates parameters but without annotations or type + // comments, and if there's no comma after a parameter, we expect a colon, not a + // close parenthesis. + + rule lambda_parameters() -> Parameters<'a> + = a:lambda_slash_no_default() b:lambda_param_no_default()* + c:lambda_param_with_default()* d:lambda_star_etc()? { + make_parameters(Some(a), concat(b, c), d) + } + / a:lambda_slash_with_default() b:lambda_param_with_default()* + d:lambda_star_etc()? { + make_parameters(Some(a), b, d) + } + / a:lambda_param_no_default()+ b:lambda_param_with_default()* + d:lambda_star_etc()? { + make_parameters(None, concat(a, b), d) + } + / a:lambda_param_with_default()+ d:lambda_star_etc()? { + make_parameters(None, a, d) + } + / d:lambda_star_etc() { + make_parameters(None, vec![], Some(d)) + } + + rule lambda_slash_no_default() -> (Vec>, ParamSlash<'a>) + = a:lambda_param_no_default()+ slash:lit("/") com:comma() { + (a, ParamSlash { comma: Some(com) } ) + } + / a:lambda_param_no_default()+ slash:lit("/") &lit(":") { + (a, ParamSlash { comma: None }) + } + + rule lambda_slash_with_default() -> (Vec>, ParamSlash<'a>) + = a:lambda_param_no_default()* b:lambda_param_with_default()+ slash:lit("/") c:comma(){ + (concat(a, b), ParamSlash { comma: Some(c) }) + } + / a:lambda_param_no_default()* b:lambda_param_with_default()+ slash:lit("/") &lit(":") { + (concat(a, b), ParamSlash { comma: None }) + } + + rule lambda_star_etc() -> StarEtc<'a> + = star:lit("*") a:lambda_param_no_default() + b:lambda_param_maybe_default()* kw:lambda_kwds()? { + StarEtc(Some(StarArg::Param( + Box::new(add_param_star(a, star)) + )), b, kw) + } + / lit("*") c:comma() b:lambda_param_maybe_default()+ kw:lambda_kwds()? { + StarEtc(Some(StarArg::Star(ParamStar {comma: c})), b, kw) + } + / kw:lambda_kwds() { StarEtc(None, vec![], Some(kw)) } + + rule lambda_kwds() -> Param<'a> + = star:lit("**") a:lambda_param_no_default() { + add_param_star(a, star) + } + + rule lambda_param_no_default() -> Param<'a> + = a:lambda_param() c:lit(",") { + add_param_default(a, None, Some(c)) + } + / a:lambda_param() &lit(":") {a} + + rule lambda_param_with_default() -> Param<'a> + = a:lambda_param() def:default() c:lit(",") { + add_param_default(a, Some(def), Some(c)) + } + / a:lambda_param() def:default() &lit(":") { + add_param_default(a, Some(def), None) + } + + rule lambda_param_maybe_default() -> Param<'a> + = a:lambda_param() def:default()? c:lit(",") { + add_param_default(a, def, Some(c)) + } + / a:lambda_param() def:default()? &lit(":") { + add_param_default(a, def, None) + } + + rule lambda_param() -> Param<'a> + = name:name() { Param { name, ..Default::default() } } + + // Literals + + rule strings() -> String<'a> + = s:(str:tok(STRING, "STRING") t:&_ {(make_string(str), t)} + / str:fstring() t:&_ {(String::Formatted(str), t)})+ { + make_strings(s) + } + + rule list() -> Expression<'a> + = lbrak:lbrak() e:star_named_expressions()? rbrak:rbrak() { + Expression::List( + make_list(lbrak, e.unwrap_or_default(), rbrak) + ) + } + + rule tuple() -> Expression<'a> + = lpar:lpar() first:star_named_expression() &lit(",") + rest:(c:comma() e:star_named_expression() {(c, e)})* + trailing_comma:comma()? rpar:rpar() { + Expression::Tuple( + make_tuple(first, rest, trailing_comma, Some(lpar), Some(rpar)) + ) + } + / lpar:lpar() rpar:lit(")") { + Expression::Tuple(Tuple::default().with_parens( + lpar, RightParen { whitespace_before: Default::default(), rpar_tok: rpar } + ))} + + rule set() -> Expression<'a> + = lbrace:lbrace() e:star_named_expressions()? 
rbrace:rbrace() { + Expression::Set(make_set(lbrace, e.unwrap_or_default(), rbrace)) + } + + // Dicts + + rule dict() -> Expression<'a> + = lbrace:lbrace() els:double_starred_keypairs()? rbrace:rbrace() { + Expression::Dict(make_dict(lbrace, els.unwrap_or_default(), rbrace)) + } + + + rule double_starred_keypairs() -> Vec> + = pairs:separated_trailer(, ) { + make_double_starred_keypairs(pairs.0, pairs.1, pairs.2) + } + + rule double_starred_kvpair() -> DictElement<'a> + = s:lit("**") e:bitwise_or() { + DictElement::Starred(make_double_starred_element(s, e)) + } + / k:kvpair() { make_dict_element(k) } + + rule kvpair() -> (Expression<'a>, TokenRef<'a>, Expression<'a>) + = k:expression() colon:lit(":") v:expression() { (k, colon, v) } + + // Comprehensions & generators + + rule for_if_clauses() -> CompFor<'a> + = c:for_if_clause()+ { merge_comp_fors(c) } + + rule for_if_clause() -> CompFor<'a> + = asy:_async() f:lit("for") tgt:star_targets() i:lit("in") + iter:disjunction() ifs:_comp_if()* { + make_for_if(Some(asy), f, tgt, i, iter, ifs) + } + / f:lit("for") tgt:star_targets() i:lit("in") + iter:disjunction() ifs:_comp_if()* { + make_for_if(None, f, tgt, i, iter, ifs) + } + + rule _comp_if() -> CompIf<'a> + = kw:lit("if") cond:disjunction() { + make_comp_if(kw, cond) + } + + rule listcomp() -> Expression<'a> + = lbrak:lbrak() elt:named_expression() comp:for_if_clauses() rbrak:rbrak() { + Expression::ListComp(make_list_comp(lbrak, elt, comp, rbrak)) + } + + rule setcomp() -> Expression<'a> + = l:lbrace() elt:named_expression() comp:for_if_clauses() r:rbrace() { + Expression::SetComp(make_set_comp(l, elt, comp, r)) + } + + rule genexp() -> GeneratorExp<'a> + = lpar:lpar() g:_bare_genexp() rpar:rpar() { + g.with_parens(lpar, rpar) + } + + rule _bare_genexp() -> GeneratorExp<'a> + = elt:named_expression() comp:for_if_clauses() { + make_bare_genexp(elt, comp) + } + + rule dictcomp() -> Expression<'a> + = lbrace:lbrace() elt:kvpair() comp:for_if_clauses() rbrace:rbrace() { + Expression::DictComp(make_dict_comp(lbrace, elt, comp, rbrace)) + } + + // Function call arguments + + rule arguments() -> Vec> + = a:args() trail:comma()? &lit(")") {add_arguments_trailing_comma(a, trail)} + + rule args() -> Vec> + = first:_posarg() + rest:(c:comma() a:_posarg() {(c, a)})* + kw:(c:comma() k:kwargs() {(c, k)})? 
{ + let (trail, kw) = kw.map(|(x,y)| (Some(x), Some(y))).unwrap_or((None, None)); + concat( + comma_separate(first, rest, trail), + kw.unwrap_or_default(), + ) + } + / kwargs() + + rule _posarg() -> Arg<'a> + = a:(starred_expression() / e:named_expression() { make_arg(e) }) + !lit("=") { a } + + rule kwargs() -> Vec> + = sitems:separated(, ) + scomma:comma() + ditems:separated(, ) { + concat( + comma_separate(sitems.0, sitems.1, Some(scomma)), + comma_separate(ditems.0, ditems.1, None), + ) + } + / items:separated(, ) { + comma_separate(items.0, items.1, None) + } + / items:separated(, ) { + comma_separate(items.0, items.1, None) + } + + rule starred_expression() -> Arg<'a> + = star:lit("*") e:expression() { make_star_arg(star, e) } + + rule kwarg_or_starred() -> Arg<'a> + = _kwarg() + / starred_expression() + + rule kwarg_or_double_starred() -> Arg<'a> + = _kwarg() + / star:lit("**") e:expression() { make_star_arg(star, e) } + + rule _kwarg() -> Arg<'a> + = n:name() eq:lit("=") v:expression() { + make_kwarg(n, eq, v) + } + + // Assignment targets + // Generic targets + + rule star_targets() -> AssignTargetExpression<'a> + = a:star_target() !lit(",") {a} + / targets:separated_trailer(, ) { + AssignTargetExpression::Tuple( + make_tuple(targets.0, targets.1, targets.2, None, None) + ) + } + + rule star_targets_list_seq() -> Vec> + = targets:separated_trailer(, ) { + comma_separate(targets.0, targets.1, targets.2) + } + + // This differs from star_targets below because it requires at least two items + // in the tuple + rule star_targets_tuple_seq() -> Tuple<'a> + = first:(t:star_target() {assign_target_to_element(t)}) + rest:(c:comma() t:star_target() {(c, assign_target_to_element(t))})+ + trail:comma()? { + make_tuple(first, rest, trail, None, None) + } + / t:star_target() trail:comma()? { + make_tuple(assign_target_to_element(t), vec![], trail, None, None) + } + + #[cache] + rule star_target() -> AssignTargetExpression<'a> + = star:lit("*") !lit("*") t:star_target() { + AssignTargetExpression::StarredElement( + make_starred_element(star, assign_target_to_element(t)) + ) + } + / target_with_star_atom() + + #[cache] + rule target_with_star_atom() -> AssignTargetExpression<'a> + = a:t_primary() dot:lit(".") n:name() !t_lookahead() { + AssignTargetExpression::Attribute(make_attribute(a, dot, n)) + } + / a:t_primary() lbrak:lbrak() s:slices() rbrak:rbrak() !t_lookahead() { + AssignTargetExpression::Subscript( + make_subscript(a, lbrak, s, rbrak) + ) + } + / a:star_atom() {a} + + rule star_atom() -> AssignTargetExpression<'a> + = a:name() { AssignTargetExpression::Name(a) } + / lpar:lpar() a:target_with_star_atom() rpar:rpar() { a.with_parens(lpar, rpar) } + / lpar:lpar() a:star_targets_tuple_seq()? rpar:rpar() { + AssignTargetExpression::Tuple( + a.unwrap_or_default().with_parens(lpar, rpar) + ) + } + / lbrak:lbrak() a:star_targets_list_seq()? 
rbrak:rbrak() { + AssignTargetExpression::List( + make_list(lbrak, a.unwrap_or_default(), rbrak) + ) + } + + rule single_target() -> AssignTargetExpression<'a> + = single_subscript_attribute_target() + / n:name() { AssignTargetExpression::Name(n) } + / lpar:lpar() t:single_target() rpar:rpar() { t.with_parens(lpar, rpar) } + + rule single_subscript_attribute_target() -> AssignTargetExpression<'a> + = a:t_primary() dot:lit(".") n:name() !t_lookahead() { + AssignTargetExpression::Attribute(make_attribute(a, dot, n)) + } + / a:t_primary() lbrak:lbrak() s:slices() rbrak:rbrak() !t_lookahead() { + AssignTargetExpression::Subscript( + make_subscript(a, lbrak, s, rbrak) + ) + } + + + #[cache_left_rec] + rule t_primary() -> Expression<'a> + = value:t_primary() dot:lit(".") attr:name() &t_lookahead() { + Expression::Attribute(make_attribute(value, dot, attr)) + } + / v:t_primary() l:lbrak() s:slices() r:rbrak() &t_lookahead() { + Expression::Subscript(make_subscript(v, l, s, r)) + } + / f:t_primary() gen:genexp() &t_lookahead() { + Expression::Call(make_genexp_call(f, gen)) + } + / f:t_primary() lpar:lit("(") arg:arguments()? rpar:lit(")") &t_lookahead() { + Expression::Call(make_call(f, lpar, arg.unwrap_or_default(), rpar)) + } + / a:atom() &t_lookahead() {a} + + rule t_lookahead() -> () + = (lit("(") / lit("[") / lit(".")) {} + + // Targets for del statements + + rule del_targets() -> Vec> + = t:separated_trailer(, ) { + comma_separate(t.0, t.1, t.2) + } + + rule del_target() -> DelTargetExpression<'a> + = a:t_primary() d:lit(".") n:name() !t_lookahead() { + DelTargetExpression::Attribute(make_attribute(a, d, n)) + } + / a:t_primary() lbrak:lbrak() s:slices() rbrak:rbrak() !t_lookahead() { + DelTargetExpression::Subscript( + make_subscript(a, lbrak, s, rbrak) + ) + } + / del_t_atom() + + rule del_t_atom() -> DelTargetExpression<'a> + = n:name() { DelTargetExpression::Name(n) } + / l:lpar() d:del_target() r:rpar() { d.with_parens(l, r) } + / l:lpar() d:del_targets()? r:rpar() { + make_del_tuple(Some(l), d.unwrap_or_default(), Some(r)) + } + / l:lbrak() d:del_targets()? r:rbrak() { + DelTargetExpression::List( + make_list(l, d.unwrap_or_default(), r) + ) + } + + // F-strings + + rule fstring() -> FormattedString<'a> + = start:tok(FStringStart, "f\"") + parts:(_f_string() / _f_replacement())* + end:tok(FStringEnd, "\"") { + make_fstring(start.string, parts, end.string) + } + + rule _f_string() -> FormattedStringContent<'a> + = t:tok(FStringString, "f-string contents") { + FormattedStringContent::Text(FormattedStringText { value: t.string }) + } + + rule _f_replacement() -> FormattedStringContent<'a> + = lb:lit("{") e:_f_expr() eq:lit("=")? + conv:(t:lit("!") c:_f_conversion() {(t,c)})? + spec:(t:lit(":") s:_f_spec() {(t,s)})? 
+ rb:lit("}") { + FormattedStringContent::Expression( + make_fstring_expression(lb, e, eq, conv, spec, rb) + ) + } + + rule _f_expr() -> Expression<'a> + = (g:_bare_genexp() {Expression::GeneratorExp(g)}) + / _conditional_expression() + / yield_expr() + + rule _f_conversion() -> &'a str + = lit("r") {"r"} / lit("s") {"s"} / lit("a") {"a"} + + rule _f_spec() -> Vec> + = (_f_string() / _f_replacement())* + + // CST helpers + + rule comma() -> Comma<'a> + = c:lit(",") { make_comma(c) } + + rule dots() -> Vec> + = ds:((dot:lit(".") { make_dot(dot) })+ + / tok:lit("...") { + vec![make_dot(tok.clone()), make_dot(tok.clone()), make_dot(tok.clone())]} + )+ { ds.into_iter().flatten().collect() } + + rule lpar() -> LeftParen<'a> + = a:lit("(") { make_lpar(a) } + + rule rpar() -> RightParen<'a> + = a:lit(")") { make_rpar(a) } + + rule lbrak() -> LeftSquareBracket<'a> + = tok:lit("[") { make_left_bracket(tok) } + + rule rbrak() -> RightSquareBracket<'a> + = tok:lit("]") { make_right_bracket(tok) } + + rule lbrace() -> LeftCurlyBrace<'a> + = tok:lit("{") { make_left_brace(tok) } + + rule rbrace() -> RightCurlyBrace<'a> + = tok:lit("}") { make_right_brace(tok) } + + /// matches any token, not just whitespace + rule _() -> TokenRef<'a> + = [t] { t } + + rule lit(lit: &'static str) -> TokenRef<'a> + = [t] {? if t.string == lit { Ok(t) } else { Err(lit) } } + + rule tok(tok: TokType, err: &'static str) -> TokenRef<'a> + = [t] {? if t.r#type == tok { Ok(t) } else { Err(err) } } + + rule name() -> Name<'a> + = !( lit("False") / lit("None") / lit("True") / lit("and") / lit("as") / lit("assert") / lit("async") / lit("await") + / lit("break") / lit("class") / lit("continue") / lit("def") / lit("del") / lit("elif") / lit("else") + / lit("except") / lit("finally") / lit("for") / lit("from") / lit("global") / lit("if") / lit("import") + / lit("in") / lit("is") / lit("lambda") / lit("nonlocal") / lit("not") / lit("or") / lit("pass") / lit("raise") + / lit("return") / lit("try") / lit("while") / lit("with") / lit("yield") + ) + t:tok(NameTok, "NAME") {make_name(t)} + + rule _async() -> TokenRef<'a> + = tok(Async, "ASYNC") + + rule separated_trailer(el: rule, sep: rule) -> (El, Vec<(Sep, El)>, Option) + = e:el() rest:(s:sep() e:el() {(s, e)})* trailer:sep()? {(e, rest, trailer)} + + rule separated(el: rule, sep: rule) -> (El, Vec<(Sep, El)>) + = e:el() rest:(s:sep() e:el() {(s, e)})* {(e, rest)} + + rule traced(e: rule) -> T = + &(_* { + #[cfg(feature = "trace")] + { + println!("[PEG_INPUT_START]"); + println!("{}", input); + println!("[PEG_TRACE_START]"); + } + }) + e:e()? {? 
+ #[cfg(feature = "trace")] + println!("[PEG_TRACE_STOP]"); + e.ok_or("") + } + + } +} + +#[allow(clippy::too_many_arguments)] +fn make_function_def<'a>( + async_tok: Option>, + def_tok: TokenRef<'a>, + name: Name<'a>, + open_paren_tok: TokenRef<'a>, + params: Option>, + close_paren_tok: TokenRef<'a>, + returns: Option>, + colon_tok: TokenRef<'a>, + body: Suite<'a>, +) -> FunctionDef<'a> { + let asynchronous = async_tok.as_ref().map(|_| Asynchronous { + whitespace_after: Default::default(), + }); + FunctionDef { + name, + params: params.unwrap_or_default(), + body, + decorators: Default::default(), + returns, + asynchronous, + leading_lines: Default::default(), + lines_after_decorators: vec![], + whitespace_after_def: Default::default(), + whitespace_after_name: Default::default(), + whitespace_before_colon: Default::default(), + whitespace_before_params: Default::default(), + async_tok, + def_tok, + open_paren_tok, + close_paren_tok, + colon_tok, + } +} + +fn make_decorator<'a>( + at_tok: TokenRef<'a>, + name: Expression<'a>, + newline_tok: TokenRef<'a>, +) -> Decorator<'a> { + Decorator { + decorator: name, + leading_lines: Default::default(), + whitespace_after_at: Default::default(), + trailing_whitespace: Default::default(), + newline_tok, + at_tok, + } +} + +fn make_comparison<'a>( + head: Expression<'a>, + tail: Vec<(CompOp<'a>, Expression<'a>)>, +) -> Expression<'a> { + let mut comparisons = vec![]; + for (operator, e) in tail { + comparisons.push(ComparisonTarget { + operator, + comparator: e, + }); + } + Expression::Comparison(Comparison { + left: Box::new(head), + comparisons, + lpar: vec![], + rpar: vec![], + }) +} + +fn make_comparison_operator(tok: TokenRef) -> Result { + let whitespace_before = Default::default(); + let whitespace_after = Default::default(); + match tok.string { + "<" => Ok(CompOp::LessThan { + whitespace_after, + whitespace_before, + tok, + }), + ">" => Ok(CompOp::GreaterThan { + whitespace_after, + whitespace_before, + tok, + }), + "<=" => Ok(CompOp::LessThanEqual { + whitespace_after, + whitespace_before, + tok, + }), + ">=" => Ok(CompOp::GreaterThanEqual { + whitespace_after, + whitespace_before, + tok, + }), + "==" => Ok(CompOp::Equal { + whitespace_after, + whitespace_before, + tok, + }), + "!=" => Ok(CompOp::NotEqual { + whitespace_after, + whitespace_before, + tok, + }), + "in" => Ok(CompOp::In { + whitespace_after, + whitespace_before, + tok, + }), + "is" => Ok(CompOp::Is { + whitespace_after, + whitespace_before, + tok, + }), + _ => Err(ParserError::OperatorError), + } +} + +fn make_comparison_operator_2<'a>( + first: TokenRef<'a>, + second: TokenRef<'a>, +) -> Result<'a, CompOp<'a>> { + let whitespace_before = Default::default(); + let whitespace_between = Default::default(); + let whitespace_after = Default::default(); + + match (first.string, second.string) { + ("is", "not") => Ok(CompOp::IsNot { + whitespace_before, + whitespace_between, + whitespace_after, + is_tok: first, + not_tok: second, + }), + ("not", "in") => Ok(CompOp::NotIn { + whitespace_before, + whitespace_between, + whitespace_after, + not_tok: first, + in_tok: second, + }), + _ => Err(ParserError::OperatorError), + } +} + +fn make_boolean_op<'a>( + head: Expression<'a>, + tail: Vec<(TokenRef<'a>, Expression<'a>)>, +) -> Result<'a, Expression<'a>> { + if tail.is_empty() { + return Ok(head); + } + + let mut expr = head; + for (tok, right) in tail { + expr = Expression::BooleanOperation(BooleanOperation { + left: Box::new(expr), + operator: make_boolean_operator(tok)?, + right: 
Box::new(right), + lpar: vec![], + rpar: vec![], + }) + } + Ok(expr) +} + +fn make_boolean_operator(tok: TokenRef) -> Result { + let whitespace_before = Default::default(); + let whitespace_after = Default::default(); + match tok.string { + "and" => Ok(BooleanOp::And { + whitespace_after, + whitespace_before, + tok, + }), + "or" => Ok(BooleanOp::Or { + whitespace_after, + whitespace_before, + tok, + }), + _ => Err(ParserError::OperatorError), + } +} + +fn make_binary_op<'a>( + left: Expression<'a>, + op: TokenRef<'a>, + right: Expression<'a>, +) -> Result<'a, Expression<'a>> { + let operator = make_binary_operator(op)?; + Ok(Expression::BinaryOperation(BinaryOperation { + left: Box::new(left), + operator, + right: Box::new(right), + lpar: vec![], + rpar: vec![], + })) +} + +fn make_binary_operator(tok: TokenRef) -> Result { + let whitespace_before = Default::default(); + let whitespace_after = Default::default(); + + match tok.string { + "+" => Ok(BinaryOp::Add { + whitespace_after, + whitespace_before, + tok, + }), + "-" => Ok(BinaryOp::Subtract { + whitespace_after, + whitespace_before, + tok, + }), + "*" => Ok(BinaryOp::Multiply { + whitespace_after, + whitespace_before, + tok, + }), + "/" => Ok(BinaryOp::Divide { + whitespace_after, + whitespace_before, + tok, + }), + "//" => Ok(BinaryOp::FloorDivide { + whitespace_after, + whitespace_before, + tok, + }), + "%" => Ok(BinaryOp::Modulo { + whitespace_after, + whitespace_before, + tok, + }), + "**" => Ok(BinaryOp::Power { + whitespace_after, + whitespace_before, + tok, + }), + "<<" => Ok(BinaryOp::LeftShift { + whitespace_after, + whitespace_before, + tok, + }), + ">>" => Ok(BinaryOp::RightShift { + whitespace_after, + whitespace_before, + tok, + }), + "|" => Ok(BinaryOp::BitOr { + whitespace_after, + whitespace_before, + tok, + }), + "&" => Ok(BinaryOp::BitAnd { + whitespace_after, + whitespace_before, + tok, + }), + "^" => Ok(BinaryOp::BitXor { + whitespace_after, + whitespace_before, + tok, + }), + "@" => Ok(BinaryOp::MatrixMultiply { + whitespace_after, + whitespace_before, + tok, + }), + _ => Err(ParserError::OperatorError), + } +} + +fn make_unary_op<'a>(op: TokenRef<'a>, tail: Expression<'a>) -> Result<'a, Expression<'a>> { + let operator = make_unary_operator(op)?; + Ok(Expression::UnaryOperation(UnaryOperation { + operator, + expression: Box::new(tail), + lpar: vec![], + rpar: vec![], + })) +} + +fn make_unary_operator(tok: TokenRef) -> Result { + let whitespace_after = Default::default(); + match tok.string { + "+" => Ok(UnaryOp::Plus { + whitespace_after, + tok, + }), + "-" => Ok(UnaryOp::Minus { + whitespace_after, + tok, + }), + "~" => Ok(UnaryOp::BitInvert { + whitespace_after, + tok, + }), + "not" => Ok(UnaryOp::Not { + whitespace_after, + tok, + }), + _ => Err(ParserError::OperatorError), + } +} + +fn make_number(num: TokenRef) -> Expression { + super::numbers::parse_number(num.string) +} + +fn make_indented_block<'a>( + nl: TokenRef<'a>, + indent: TokenRef<'a>, + statements: Vec>, + dedent: TokenRef<'a>, +) -> Suite<'a> { + Suite::IndentedBlock(IndentedBlock { + body: statements, + header: Default::default(), + indent: Default::default(), + footer: Default::default(), + newline_tok: nl, + indent_tok: indent, + dedent_tok: dedent, + }) +} + +struct SimpleStatementParts<'a> { + first_tok: TokenRef<'a>, // The first token of the first statement. 
Used for its whitespace + first_statement: SmallStatement<'a>, + rest: Vec<(TokenRef<'a>, SmallStatement<'a>)>, // semicolon, statement pairs + last_semi: Option>, + nl: TokenRef<'a>, +} + +fn make_semicolon(tok: TokenRef) -> Semicolon { + Semicolon { + whitespace_before: Default::default(), + whitespace_after: Default::default(), + tok, + } +} + +fn _make_simple_statement( + parts: SimpleStatementParts, +) -> (TokenRef, Vec, TokenRef) { + let mut body = vec![]; + + let mut current = parts.first_statement; + for (semi, next) in parts.rest { + body.push(current.with_semicolon(Some(make_semicolon(semi)))); + current = next; + } + if let Some(semi) = parts.last_semi { + current = current.with_semicolon(Some(make_semicolon(semi))); + } + body.push(current); + + (parts.first_tok, body, parts.nl) +} + +fn make_simple_statement_suite(parts: SimpleStatementParts) -> Suite { + let (first_tok, body, newline_tok) = _make_simple_statement(parts); + + Suite::SimpleStatementSuite(SimpleStatementSuite { + body, + leading_whitespace: Default::default(), + trailing_whitespace: Default::default(), + first_tok, + newline_tok, + }) +} + +fn make_simple_statement_line(parts: SimpleStatementParts) -> SimpleStatementLine { + let (first_tok, body, newline_tok) = _make_simple_statement(parts); + SimpleStatementLine { + body, + leading_lines: Default::default(), + trailing_whitespace: Default::default(), + first_tok, + newline_tok, + } +} + +fn make_if<'a>( + if_tok: TokenRef<'a>, + cond: Expression<'a>, + colon_tok: TokenRef<'a>, + block: Suite<'a>, + orelse: Option>, + is_elif: bool, +) -> If<'a> { + If { + leading_lines: Default::default(), + whitespace_before_test: Default::default(), + test: cond, + whitespace_after_test: Default::default(), + body: block, + orelse: orelse.map(Box::new), + is_elif, + if_tok, + colon_tok, + } +} + +fn make_else<'a>(else_tok: TokenRef<'a>, colon_tok: TokenRef<'a>, block: Suite<'a>) -> Else<'a> { + Else { + leading_lines: Default::default(), + whitespace_before_colon: Default::default(), + body: block, + else_tok, + colon_tok, + } +} + +struct StarEtc<'a>(Option>, Vec>, Option>); + +fn make_parameters<'a>( + posonly: Option<(Vec>, ParamSlash<'a>)>, + params: Vec>, + star_etc: Option>, +) -> Parameters<'a> { + let (posonly_params, posonly_ind) = match posonly { + Some((a, b)) => (a, Some(b)), + None => (vec![], None), + }; + let (star_arg, kwonly_params, star_kwarg) = match star_etc { + None => (None, vec![], None), + Some(StarEtc(a, b, c)) => (a, b, c), + }; + Parameters { + params, + star_arg, + kwonly_params, + star_kwarg, + posonly_params, + posonly_ind, + } +} + +fn add_param_default<'a>( + param: Param<'a>, + def: Option<(AssignEqual<'a>, Expression<'a>)>, + comma_tok: Option>, +) -> Param<'a> { + let comma = comma_tok.map(make_comma); + + let (equal, default) = match def { + Some((a, b)) => (Some(a), Some(b)), + None => (None, None), + }; + Param { + equal, + default, + comma, + ..param + } +} + +fn add_param_star<'a>(param: Param<'a>, star: TokenRef<'a>) -> Param<'a> { + let str = star.string; + Param { + star: Some(str), + star_tok: Some(star), + ..param + } +} + +fn make_assign_equal(tok: TokenRef) -> AssignEqual { + AssignEqual { + whitespace_before: Default::default(), + whitespace_after: Default::default(), + tok, + } +} + +fn make_comma(tok: TokenRef) -> Comma { + Comma { + whitespace_before: Default::default(), + whitespace_after: Default::default(), + tok, + } +} + +fn concat(a: Vec, b: Vec) -> Vec { + a.into_iter().chain(b.into_iter()).collect() +} + +fn 
make_name_or_attr<'a>( + first_tok: Name<'a>, + mut tail: Vec<(TokenRef<'a>, Name<'a>)>, +) -> NameOrAttribute<'a> { + if let Some((dot, name)) = tail.pop() { + let dot = make_dot(dot); + return NameOrAttribute::A(Attribute { + attr: name, + dot, + lpar: Default::default(), + rpar: Default::default(), + value: Box::new(make_name_or_attr(first_tok, tail).into()), + }); + } else { + NameOrAttribute::N(first_tok) + } +} + +fn make_name(tok: TokenRef) -> Name { + Name { + value: tok.string, + ..Default::default() + } +} + +fn make_dot(tok: TokenRef) -> Dot { + Dot { + whitespace_before: Default::default(), + whitespace_after: Default::default(), + tok, + } +} + +fn make_import_alias<'a>( + name: NameOrAttribute<'a>, + asname: Option<(TokenRef<'a>, Name<'a>)>, +) -> ImportAlias<'a> { + ImportAlias { + name, + asname: asname.map(|(x, y)| make_as_name(x, AssignTargetExpression::Name(y))), + comma: None, + } +} + +fn make_as_name<'a>(as_tok: TokenRef<'a>, name: AssignTargetExpression<'a>) -> AsName<'a> { + AsName { + name, + whitespace_before_as: Default::default(), + whitespace_after_as: Default::default(), + as_tok, + } +} + +type ParenthesizedImportNames<'a> = ( + Option>, + ImportNames<'a>, + Option>, +); + +fn make_import_from<'a>( + from_tok: TokenRef<'a>, + dots: Vec>, + module: Option>, + import_tok: TokenRef<'a>, + aliases: ParenthesizedImportNames<'a>, +) -> ImportFrom<'a> { + let (lpar, names, rpar) = aliases; + + ImportFrom { + module, + names, + relative: dots, + lpar, + rpar, + semicolon: None, + whitespace_after_from: Default::default(), + whitespace_after_import: Default::default(), + whitespace_before_import: Default::default(), + from_tok, + import_tok, + } +} + +fn make_import<'a>(import_tok: TokenRef<'a>, names: Vec>) -> Import<'a> { + Import { + names, + whitespace_after_import: Default::default(), + semicolon: None, + import_tok, + } +} + +fn make_import_from_as_names<'a>( + first: ImportAlias<'a>, + tail: Vec<(Comma<'a>, ImportAlias<'a>)>, +) -> Vec> { + let mut ret = vec![]; + let mut cur = first; + for (comma, alias) in tail { + ret.push(cur.with_comma(comma)); + cur = alias; + } + ret.push(cur); + ret +} + +fn make_lpar(tok: TokenRef) -> LeftParen { + LeftParen { + whitespace_after: Default::default(), + lpar_tok: tok, + } +} + +fn make_rpar(tok: TokenRef) -> RightParen { + RightParen { + whitespace_before: Default::default(), + rpar_tok: tok, + } +} + +fn make_module<'a>(body: Vec>, tok: TokenRef<'a>, encoding: &str) -> Module<'a> { + Module { + body, + header: Default::default(), + footer: Default::default(), + eof_tok: tok, + default_indent: " ", + default_newline: "\n", + has_trailing_newline: false, + encoding: encoding.to_string(), + } +} + +fn make_attribute<'a>(value: Expression<'a>, dot: TokenRef<'a>, attr: Name<'a>) -> Attribute<'a> { + let dot = make_dot(dot); + Attribute { + attr, + dot, + lpar: Default::default(), + rpar: Default::default(), + value: Box::new(value), + } +} + +fn make_starred_element<'a>(star_tok: TokenRef<'a>, rest: Element<'a>) -> StarredElement<'a> { + let value = match rest { + Element::Simple { value, .. 
+fn make_starred_element<'a>(star_tok: TokenRef<'a>, rest: Element<'a>) -> StarredElement<'a> {
+    let value = match rest {
+        Element::Simple { value, .. } => value,
+        _ => panic!("Internal error while making starred element"),
+    };
+    StarredElement {
+        value: Box::new(value),
+        whitespace_before_value: Default::default(),
+        lpar: Default::default(),
+        rpar: Default::default(),
+        comma: Default::default(),
+        star_tok,
+    }
+}
+
+fn assign_target_to_element(expr: AssignTargetExpression) -> Element {
+    match expr {
+        AssignTargetExpression::Attribute(a) => Element::Simple {
+            value: Expression::Attribute(a),
+            comma: Default::default(),
+        },
+        AssignTargetExpression::Name(a) => Element::Simple {
+            value: Expression::Name(a),
+            comma: Default::default(),
+        },
+        AssignTargetExpression::Tuple(a) => Element::Simple {
+            value: Expression::Tuple(a),
+            comma: Default::default(),
+        },
+        AssignTargetExpression::StarredElement(s) => Element::Starred(s),
+        AssignTargetExpression::List(l) => Element::Simple {
+            value: Expression::List(l),
+            comma: Default::default(),
+        },
+        AssignTargetExpression::Subscript(s) => Element::Simple {
+            value: Expression::Subscript(s),
+            comma: Default::default(),
+        },
+    }
+}
+
+fn make_assignment<'a>(
+    lhs: Vec<(AssignTargetExpression<'a>, TokenRef<'a>)>,
+    rhs: Expression<'a>,
+) -> Assign<'a> {
+    let mut targets = vec![];
+    for (target, equal_tok) in lhs {
+        targets.push(AssignTarget {
+            target,
+            whitespace_before_equal: Default::default(),
+            whitespace_after_equal: Default::default(),
+            equal_tok,
+        });
+    }
+    Assign {
+        targets,
+        value: rhs,
+        semicolon: Default::default(),
+    }
+}
+
+fn expr_to_element(expr: Expression) -> Element {
+    Element::Simple {
+        value: expr,
+        comma: Default::default(),
+    }
+}
+
+fn make_tuple<'a>(
+    first: Element<'a>,
+    rest: Vec<(Comma<'a>, Element<'a>)>,
+    trailing_comma: Option<Comma<'a>>,
+    lpar: Option<LeftParen<'a>>,
+    rpar: Option<RightParen<'a>>,
+) -> Tuple<'a> {
+    let elements = comma_separate(first, rest, trailing_comma);
+
+    let lpar = lpar.map(|l| vec![l]).unwrap_or_default();
+    let rpar = rpar.map(|r| vec![r]).unwrap_or_default();
+
+    Tuple {
+        elements,
+        lpar,
+        rpar,
+    }
+}
+
+fn make_kwarg<'a>(name: Name<'a>, eq: TokenRef<'a>, value: Expression<'a>) -> Arg<'a> {
+    let equal = Some(make_assign_equal(eq));
+    let keyword = Some(name);
+    Arg {
+        value,
+        keyword,
+        equal,
+        comma: None,
+        star: "",
+        whitespace_after_star: Default::default(),
+        whitespace_after_arg: Default::default(),
+        star_tok: None,
+    }
+}
+
+fn make_star_arg<'a>(star: TokenRef<'a>, expr: Expression<'a>) -> Arg<'a> {
+    let str = star.string;
+    Arg {
+        value: expr,
+        keyword: None,
+        equal: None,
+        comma: None,
+        star: str,
+        whitespace_after_star: Default::default(),
+        whitespace_after_arg: Default::default(),
+        star_tok: Some(star),
+    }
+}
+
+fn make_call<'a>(
+    func: Expression<'a>,
+    lpar_tok: TokenRef<'a>,
+    args: Vec<Arg<'a>>,
+    rpar_tok: TokenRef<'a>,
+) -> Call<'a> {
+    let lpar = vec![];
+    let rpar = vec![];
+    let func = Box::new(func);
+
+    Call {
+        func,
+        args,
+        lpar,
+        rpar,
+        whitespace_after_func: Default::default(),
+        whitespace_before_args: Default::default(),
+        lpar_tok,
+        rpar_tok,
+    }
+}
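Note: for a chained assignment like `a = b = 1`, the grammar hands `make_assignment` one `(target, '=')` pair per target and a single right-hand side. A simplified sketch of that shape (illustrative types only):

    #[derive(Debug)]
    struct Assign { targets: Vec<&'static str>, value: i32 }

    // lhs = [("a", '='), ("b", '=')], rhs = 1  ->  one Assign, two targets
    fn make_assignment(lhs: Vec<(&'static str, char)>, rhs: i32) -> Assign {
        Assign { targets: lhs.into_iter().map(|(t, _eq)| t).collect(), value: rhs }
    }

    fn main() {
        println!("{:?}", make_assignment(vec![("a", '='), ("b", '=')], 1));
    }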
+fn make_genexp_call<'a>(func: Expression<'a>, mut genexp: GeneratorExp<'a>) -> Call<'a> {
+    // func ( (genexp) )
+    //      ^
+    //      lpar_tok
+
+    // lpar_tok is the same token that was used to parse genexp's first lpar.
+    // Nothing owns the whitespace before lpar_tok, so the same token is passed in here
+    // again, to be converted into whitespace_after_func. We then split off a pair of
+    // parentheses from genexp, since now Call will own them.
+
+    let mut lpars = genexp.lpar.into_iter();
+    let lpar_tok = lpars.next().expect("genexp without lpar").lpar_tok;
+    genexp.lpar = lpars.collect();
+    let rpar_tok = genexp.rpar.pop().expect("genexp without rpar").rpar_tok;
+
+    Call {
+        func: Box::new(func),
+        args: vec![Arg {
+            value: Expression::GeneratorExp(genexp),
+            keyword: None,
+            equal: None,
+            comma: None,
+            star: "",
+            whitespace_after_star: Default::default(),
+            whitespace_after_arg: Default::default(),
+            star_tok: None,
+        }],
+        lpar: vec![],
+        rpar: vec![],
+        whitespace_after_func: Default::default(),
+        whitespace_before_args: Default::default(),
+        lpar_tok,
+        rpar_tok,
+    }
+}
+
+fn make_arg(expr: Expression) -> Arg {
+    Arg {
+        value: expr,
+        keyword: Default::default(),
+        equal: Default::default(),
+        comma: Default::default(),
+        star: Default::default(),
+        whitespace_after_star: Default::default(),
+        whitespace_after_arg: Default::default(),
+        star_tok: None,
+    }
+}
+
+fn make_comp_if<'a>(if_tok: TokenRef<'a>, test: Expression<'a>) -> CompIf<'a> {
+    CompIf {
+        test,
+        whitespace_before: Default::default(),
+        whitespace_before_test: Default::default(),
+        if_tok,
+    }
+}
+
+fn make_for_if<'a>(
+    async_tok: Option<TokenRef<'a>>,
+    for_tok: TokenRef<'a>,
+    target: AssignTargetExpression<'a>,
+    in_tok: TokenRef<'a>,
+    iter: Expression<'a>,
+    ifs: Vec<CompIf<'a>>,
+) -> CompFor<'a> {
+    let inner_for_in = None;
+    let asynchronous = async_tok.as_ref().map(|_| Asynchronous {
+        whitespace_after: Default::default(),
+    });
+
+    CompFor {
+        target,
+        iter,
+        ifs,
+        inner_for_in,
+        asynchronous,
+        whitespace_before: Default::default(),
+        whitespace_after_for: Default::default(),
+        whitespace_before_in: Default::default(),
+        whitespace_after_in: Default::default(),
+        async_tok,
+        for_tok,
+        in_tok,
+    }
+}
+
+fn make_bare_genexp<'a>(elt: Expression<'a>, for_in: CompFor<'a>) -> GeneratorExp<'a> {
+    GeneratorExp {
+        elt: Box::new(elt),
+        for_in: Box::new(for_in),
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
+
+fn merge_comp_fors(comp_fors: Vec<CompFor>) -> CompFor {
+    let mut it = comp_fors.into_iter().rev();
+    let first = it.next().expect("cant merge empty comp_fors");
+
+    it.fold(first, |acc, curr| CompFor {
+        inner_for_in: Some(Box::new(acc)),
+        ..curr
+    })
+}
+
+fn make_left_bracket(tok: TokenRef) -> LeftSquareBracket {
+    LeftSquareBracket {
+        whitespace_after: Default::default(),
+        tok,
+    }
+}
+
+fn make_right_bracket(tok: TokenRef) -> RightSquareBracket {
+    RightSquareBracket {
+        whitespace_before: Default::default(),
+        tok,
+    }
+}
+
+fn make_left_brace(tok: TokenRef) -> LeftCurlyBrace {
+    LeftCurlyBrace {
+        whitespace_after: Default::default(),
+        tok,
+    }
+}
+
+fn make_right_brace(tok: TokenRef) -> RightCurlyBrace {
+    RightCurlyBrace {
+        whitespace_before: Default::default(),
+        tok,
+    }
+}
+
+fn make_list_comp<'a>(
+    lbracket: LeftSquareBracket<'a>,
+    elt: Expression<'a>,
+    for_in: CompFor<'a>,
+    rbracket: RightSquareBracket<'a>,
+) -> ListComp<'a> {
+    ListComp {
+        elt: Box::new(elt),
+        for_in: Box::new(for_in),
+        lbracket,
+        rbracket,
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
+
+fn make_set_comp<'a>(
+    lbrace: LeftCurlyBrace<'a>,
+    elt: Expression<'a>,
+    for_in: CompFor<'a>,
+    rbrace: RightCurlyBrace<'a>,
+) -> SetComp<'a> {
+    SetComp {
+        elt: Box::new(elt),
+        for_in: Box::new(for_in),
+        lbrace,
+        rbrace,
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
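Note: `merge_comp_fors` folds a flat list of parsed `for` clauses from the right, so the first clause ends up outermost. A runnable sketch of the same fold with simplified types (not the diff's `CompFor`):

    #[derive(Debug)]
    struct CompFor { target: &'static str, inner: Option<Box<CompFor>> }

    // [for_x, for_y] becomes for_x -> for_y via `inner`
    fn merge(comp_fors: Vec<CompFor>) -> CompFor {
        let mut it = comp_fors.into_iter().rev();
        let first = it.next().expect("empty comp_fors");
        it.fold(first, |acc, curr| CompFor { inner: Some(Box::new(acc)), ..curr })
    }

    fn main() {
        let merged = merge(vec![
            CompFor { target: "x", inner: None },
            CompFor { target: "y", inner: None },
        ]);
        println!("{:?}", merged); // x is outermost; y hangs off inner
    }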
+fn make_dict_comp<'a>(
+    lbrace: LeftCurlyBrace<'a>,
+    kvpair: (Expression<'a>, TokenRef<'a>, Expression<'a>),
+    for_in: CompFor<'a>,
+    rbrace: RightCurlyBrace<'a>,
+) -> DictComp<'a> {
+    let (key, colon_tok, value) = kvpair;
+
+    DictComp {
+        key: Box::new(key),
+        value: Box::new(value),
+        for_in: Box::new(for_in),
+        lbrace,
+        rbrace,
+        lpar: vec![],
+        rpar: vec![],
+        whitespace_before_colon: Default::default(),
+        whitespace_after_colon: Default::default(),
+        colon_tok,
+    }
+}
+
+fn make_list<'a>(
+    lbracket: LeftSquareBracket<'a>,
+    elements: Vec<Element<'a>>,
+    rbracket: RightSquareBracket<'a>,
+) -> List<'a> {
+    List {
+        elements,
+        lbracket,
+        rbracket,
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
+
+fn make_set<'a>(
+    lbrace: LeftCurlyBrace<'a>,
+    elements: Vec<Element<'a>>,
+    rbrace: RightCurlyBrace<'a>,
+) -> Set<'a> {
+    Set {
+        elements,
+        lbrace,
+        rbrace,
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
+
+fn comma_separate<'a, T>(
+    first: T,
+    rest: Vec<(Comma<'a>, T)>,
+    trailing_comma: Option<Comma<'a>>,
+) -> Vec<T>
+where
+    T: WithComma<'a>,
+{
+    let mut elements = vec![];
+    let mut current = first;
+    for (comma, next) in rest {
+        elements.push(current.with_comma(comma));
+        current = next;
+    }
+    if let Some(comma) = trailing_comma {
+        current = current.with_comma(comma);
+    }
+    elements.push(current);
+    elements
+}
+
+fn make_dict<'a>(
+    lbrace: LeftCurlyBrace<'a>,
+    elements: Vec<DictElement<'a>>,
+    rbrace: RightCurlyBrace<'a>,
+) -> Dict<'a> {
+    Dict {
+        elements,
+        lbrace,
+        rbrace,
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
+
+fn make_double_starred_keypairs<'a>(
+    first: DictElement<'a>,
+    rest: Vec<(Comma<'a>, DictElement<'a>)>,
+    trailing_comma: Option<Comma<'a>>,
+) -> Vec<DictElement<'a>> {
+    let mut elements = vec![];
+    let mut current = first;
+    for (comma, next) in rest {
+        elements.push(current.with_comma(comma));
+        current = next;
+    }
+    if let Some(mut comma) = trailing_comma {
+        // don't consume trailing whitespace for trailing comma
+        comma.whitespace_after = ParenthesizableWhitespace::SimpleWhitespace(SimpleWhitespace(""));
+        current = current.with_comma(comma);
+    }
+    elements.push(current);
+    elements
+}
+
+fn make_dict_element<'a>(el: (Expression<'a>, TokenRef<'a>, Expression<'a>)) -> DictElement<'a> {
+    let (key, colon_tok, value) = el;
+    DictElement::Simple {
+        key,
+        value,
+        comma: Default::default(),
+        whitespace_before_colon: Default::default(),
+        whitespace_after_colon: Default::default(),
+        colon_tok,
+    }
+}
+
+fn make_double_starred_element<'a>(
+    star_tok: TokenRef<'a>,
+    value: Expression<'a>,
+) -> StarredDictElement<'a> {
+    StarredDictElement {
+        value,
+        comma: Default::default(),
+        whitespace_before_value: Default::default(),
+        star_tok,
+    }
+}
+
+fn make_index(value: Expression) -> BaseSlice {
+    BaseSlice::Index(Index { value })
+}
+
+fn make_colon(tok: TokenRef) -> Colon {
+    let whitespace_before = Default::default();
+    let whitespace_after = Default::default();
+    Colon {
+        whitespace_before,
+        whitespace_after,
+        tok,
+    }
+}
+
+fn make_slice<'a>(
+    lower: Option<Expression<'a>>,
+    first_colon: TokenRef<'a>,
+    upper: Option<Expression<'a>>,
+    rest: Option<(TokenRef<'a>, Option<Expression<'a>>)>,
+) -> BaseSlice<'a> {
+    let first_colon = make_colon(first_colon);
+    let (second_colon, step) = if let Some((tok, step)) = rest {
+        (Some(make_colon(tok)), step)
+    } else {
+        (None, None)
+    };
+    BaseSlice::Slice(Slice {
+        lower,
+        upper,
+        step,
+        first_colon,
+        second_colon,
+    })
+}
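Note: `make_slice` maps the `lower:upper:step` pieces of a subscript onto the `Slice` node; for `x[1:2:3]` the grammar supplies `lower=1`, the first colon, `upper=2`, and `rest=Some((second colon, Some(3)))`. A sketch with simplified stand-in types:

    #[derive(Debug)]
    struct Slice { lower: Option<i32>, upper: Option<i32>, step: Option<i32>, colons: usize }

    fn make_slice(lower: Option<i32>, upper: Option<i32>, rest: Option<Option<i32>>) -> Slice {
        let (second_colon, step) = match rest {
            Some(step) => (true, step),
            None => (false, None),
        };
        Slice { lower, upper, step, colons: if second_colon { 2 } else { 1 } }
    }

    fn main() {
        println!("{:?}", make_slice(Some(1), Some(2), Some(Some(3)))); // 1:2:3
        println!("{:?}", make_slice(None, None, None));               // bare `:`
    }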
+fn make_slices<'a>(
+    first: BaseSlice<'a>,
+    rest: Vec<(Comma<'a>, BaseSlice<'a>)>,
+    trailing_comma: Option<Comma<'a>>,
+) -> Vec<SubscriptElement<'a>> {
+    let mut elements = vec![];
+    let mut current = first;
+    for (comma, next) in rest {
+        elements.push(SubscriptElement {
+            slice: current,
+            comma: Some(comma),
+        });
+        current = next;
+    }
+    elements.push(SubscriptElement {
+        slice: current,
+        comma: trailing_comma,
+    });
+    elements
+}
+
+fn make_subscript<'a>(
+    value: Expression<'a>,
+    lbracket: LeftSquareBracket<'a>,
+    slice: Vec<SubscriptElement<'a>>,
+    rbracket: RightSquareBracket<'a>,
+) -> Subscript<'a> {
+    let lbracket_tok = lbracket.tok.clone();
+    Subscript {
+        value: Box::new(value),
+        slice,
+        lbracket,
+        rbracket,
+        lpar: Default::default(),
+        rpar: Default::default(),
+        whitespace_after_value: Default::default(),
+        lbracket_tok,
+    }
+}
+
+fn make_ifexp<'a>(
+    body: Expression<'a>,
+    if_tok: TokenRef<'a>,
+    test: Expression<'a>,
+    else_tok: TokenRef<'a>,
+    orelse: Expression<'a>,
+) -> IfExp<'a> {
+    IfExp {
+        test: Box::new(test),
+        body: Box::new(body),
+        orelse: Box::new(orelse),
+        lpar: Default::default(),
+        rpar: Default::default(),
+        whitespace_before_if: Default::default(),
+        whitespace_after_if: Default::default(),
+        whitespace_before_else: Default::default(),
+        whitespace_after_else: Default::default(),
+        if_tok,
+        else_tok,
+    }
+}
+
+fn add_arguments_trailing_comma<'a>(
+    mut args: Vec<Arg<'a>>,
+    trailing_comma: Option<Comma<'a>>,
+) -> Vec<Arg<'a>> {
+    if let Some(comma) = trailing_comma {
+        let last = args.pop().unwrap();
+        args.push(last.with_comma(comma));
+    }
+    args
+}
+
+fn make_lambda<'a>(
+    lambda_tok: TokenRef<'a>,
+    params: Parameters<'a>,
+    colon_tok: TokenRef<'a>,
+    expr: Expression<'a>,
+) -> Lambda<'a> {
+    let colon = make_colon(colon_tok);
+    Lambda {
+        params: Box::new(params),
+        body: Box::new(expr),
+        colon,
+        lpar: Default::default(),
+        rpar: Default::default(),
+        whitespace_after_lambda: Default::default(),
+        lambda_tok,
+    }
+}
+
+fn make_annotation<'a>(tok: TokenRef<'a>, ann: Expression<'a>) -> Annotation<'a> {
+    Annotation {
+        annotation: ann,
+        whitespace_before_indicator: Default::default(),
+        whitespace_after_indicator: Default::default(),
+        tok,
+    }
+}
+
+fn make_ann_assignment<'a>(
+    target: AssignTargetExpression<'a>,
+    col: TokenRef<'a>,
+    ann: Expression<'a>,
+    rhs: Option<(TokenRef<'a>, Expression<'a>)>,
+) -> AnnAssign<'a> {
+    let annotation = make_annotation(col, ann);
+    let (eq, value) = rhs.map(|(x, y)| (Some(x), Some(y))).unwrap_or((None, None));
+    let equal = eq.map(make_assign_equal);
+    AnnAssign {
+        target,
+        annotation,
+        value,
+        equal,
+        semicolon: None,
+    }
+}
+
+fn make_yield<'a>(
+    yield_tok: TokenRef<'a>,
+    f: Option<TokenRef<'a>>,
+    e: Option<Expression<'a>>,
+) -> Yield<'a> {
+    let value = match (f, e) {
+        (None, None) => None,
+        (Some(f), Some(e)) => Some(YieldValue::From(make_from(f, e))),
+        (None, Some(e)) => Some(YieldValue::Expression(e)),
+        _ => panic!("yield from without expression"),
+    };
+    Yield {
+        value: value.map(Box::new),
+        lpar: Default::default(),
+        rpar: Default::default(),
+        whitespace_after_yield: Default::default(),
+        yield_tok,
+    }
+}
+
+fn make_from<'a>(tok: TokenRef<'a>, e: Expression<'a>) -> From<'a> {
+    From {
+        item: e,
+        whitespace_before_from: Default::default(),
+        whitespace_after_from: Default::default(),
+        tok,
+    }
+}
+
+fn make_return<'a>(return_tok: TokenRef<'a>, value: Option<Expression<'a>>) -> Return<'a> {
+    Return {
+        value,
+        whitespace_after_return: Default::default(),
+        semicolon: Default::default(),
+        return_tok,
+    }
+}
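Note: `make_yield` dispatches on the `(from_tok, expr)` pair; `yield`, `yield x`, and `yield from x` are the three legal shapes, and `yield from` without an expression is impossible at the grammar level, hence the panic. A sketch of the same dispatch with simplified types:

    #[derive(Debug)]
    enum YieldValue { From(&'static str), Expression(&'static str) }

    fn classify(from_tok: Option<&'static str>, expr: Option<&'static str>) -> Option<YieldValue> {
        match (from_tok, expr) {
            (None, None) => None,                               // bare `yield`
            (Some(_), Some(e)) => Some(YieldValue::From(e)),    // `yield from e`
            (None, Some(e)) => Some(YieldValue::Expression(e)), // `yield e`
            _ => panic!("yield from without expression"),
        }
    }

    fn main() {
        println!("{:?}", classify(Some("from"), Some("gen()")));
    }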
+fn make_assert<'a>(
+    assert_tok: TokenRef<'a>,
+    test: Expression<'a>,
+    rest: Option<(Comma<'a>, Expression<'a>)>,
+) -> Assert<'a> {
+    let (comma, msg) = if let Some((c, msg)) = rest {
+        (Some(c), Some(msg))
+    } else {
+        (None, None)
+    };
+
+    Assert {
+        test,
+        msg,
+        comma,
+        whitespace_after_assert: Default::default(),
+        semicolon: Default::default(),
+        assert_tok,
+    }
+}
+
+fn make_raise<'a>(
+    raise_tok: TokenRef<'a>,
+    exc: Option<Expression<'a>>,
+    rest: Option<(TokenRef<'a>, Expression<'a>)>,
+) -> Raise<'a> {
+    let cause = rest.map(|(t, e)| make_from(t, e));
+
+    Raise {
+        exc,
+        cause,
+        whitespace_after_raise: Default::default(),
+        semicolon: Default::default(),
+        raise_tok,
+    }
+}
+
+fn make_global<'a>(
+    tok: TokenRef<'a>,
+    init: Vec<(Name<'a>, Comma<'a>)>,
+    last: Name<'a>,
+) -> Global<'a> {
+    let mut names: Vec<NameItem<'a>> = init
+        .into_iter()
+        .map(|(name, c)| NameItem {
+            name,
+            comma: Some(c),
+        })
+        .collect();
+    names.push(NameItem {
+        name: last,
+        comma: None,
+    });
+    Global {
+        names,
+        whitespace_after_global: Default::default(),
+        semicolon: Default::default(),
+        tok,
+    }
+}
+
+fn make_nonlocal<'a>(
+    tok: TokenRef<'a>,
+    init: Vec<(Name<'a>, Comma<'a>)>,
+    last: Name<'a>,
+) -> Nonlocal<'a> {
+    let mut names: Vec<NameItem<'a>> = init
+        .into_iter()
+        .map(|(name, c)| NameItem {
+            name,
+            comma: Some(c),
+        })
+        .collect();
+    names.push(NameItem {
+        name: last,
+        comma: None,
+    });
+    Nonlocal {
+        names,
+        whitespace_after_nonlocal: Default::default(),
+        semicolon: Default::default(),
+        tok,
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+fn make_for<'a>(
+    async_tok: Option<TokenRef<'a>>,
+    for_tok: TokenRef<'a>,
+    target: AssignTargetExpression<'a>,
+    in_tok: TokenRef<'a>,
+    iter: Expression<'a>,
+    colon_tok: TokenRef<'a>,
+    body: Suite<'a>,
+    orelse: Option<Else<'a>>,
+) -> For<'a> {
+    let asynchronous = async_tok.as_ref().map(|_| Asynchronous {
+        whitespace_after: Default::default(),
+    });
+
+    For {
+        target,
+        iter,
+        body,
+        orelse,
+        asynchronous,
+        leading_lines: Default::default(),
+        whitespace_after_for: Default::default(),
+        whitespace_before_in: Default::default(),
+        whitespace_after_in: Default::default(),
+        whitespace_before_colon: Default::default(),
+        async_tok,
+        for_tok,
+        in_tok,
+        colon_tok,
+    }
+}
+
+fn make_while<'a>(
+    while_tok: TokenRef<'a>,
+    test: Expression<'a>,
+    colon_tok: TokenRef<'a>,
+    body: Suite<'a>,
+    orelse: Option<Else<'a>>,
+) -> While<'a> {
+    While {
+        test,
+        body,
+        orelse,
+        leading_lines: Default::default(),
+        whitespace_after_while: Default::default(),
+        whitespace_before_colon: Default::default(),
+        while_tok,
+        colon_tok,
+    }
+}
+
+fn make_await<'a>(await_tok: TokenRef<'a>, expression: Expression<'a>) -> Await<'a> {
+    Await {
+        expression: Box::new(expression),
+        lpar: Default::default(),
+        rpar: Default::default(),
+        whitespace_after_await: Default::default(),
+        await_tok,
+    }
+}
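Note: `make_for`, `make_with`, and `make_for_if` all use the same Option-token-to-marker pattern: the presence of an `async` token is mapped to an `Asynchronous` node while the token itself is kept for position bookkeeping. A tiny sketch of that pattern (illustrative types):

    #[derive(Debug)]
    struct Asynchronous;

    fn mark_async(async_tok: Option<&str>) -> Option<Asynchronous> {
        // as_ref().map(...) mirrors the diff: the token is inspected, not consumed
        async_tok.as_ref().map(|_| Asynchronous)
    }

    fn main() {
        assert!(mark_async(Some("async")).is_some());
        assert!(mark_async(None).is_none());
    }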
+fn make_class_def<'a>(
+    class_tok: TokenRef<'a>,
+    name: Name<'a>,
+    args: Option<(LeftParen<'a>, Option<Vec<Arg<'a>>>, RightParen<'a>)>,
+    colon_tok: TokenRef<'a>,
+    body: Suite<'a>,
+) -> std::result::Result<ClassDef<'a>, &'static str> {
+    let mut bases = vec![];
+    let mut keywords = vec![];
+    let mut parens_tok = None;
+    let mut lpar = None;
+    let mut rpar = None;
+
+    if let Some((lpar_, args, rpar_)) = args {
+        parens_tok = Some((lpar_.lpar_tok.clone(), rpar_.rpar_tok.clone()));
+        lpar = Some(lpar_);
+        rpar = Some(rpar_);
+        if let Some(args) = args {
+            let mut current_arg = &mut bases;
+            let mut seen_keyword = false;
+            for arg in args {
+                if arg.star == "**" || arg.keyword.is_some() {
+                    current_arg = &mut keywords;
+                    seen_keyword = true;
+                }
+                if seen_keyword
+                    && (arg.star == "*" || (arg.star.is_empty() && arg.keyword.is_none()))
+                {
+                    return Err("Positional argument follows keyword argument");
+                }
+                // TODO: libcst-python does validation here
+                current_arg.push(arg);
+            }
+        }
+    }
+    Ok(ClassDef {
+        name,
+        body,
+        bases,
+        keywords,
+        decorators: vec![],
+        lpar,
+        rpar,
+        leading_lines: Default::default(),
+        lines_after_decorators: Default::default(),
+        whitespace_after_class: Default::default(),
+        whitespace_after_name: Default::default(),
+        whitespace_before_colon: Default::default(),
+        class_tok,
+        parens_tok,
+        colon_tok,
+    })
+}
+
+fn make_string(tok: TokenRef) -> String {
+    String::Simple(SimpleString {
+        value: tok.string,
+        ..Default::default()
+    })
+}
+
+fn make_strings<'a>(s: Vec<(String<'a>, TokenRef<'a>)>) -> String<'a> {
+    let mut strings = s.into_iter().rev();
+    let (first, _) = strings.next().expect("no strings to make a string of");
+    strings.fold(first, |acc, (str, tok)| {
+        let ret: String<'a> = String::Concatenated(ConcatenatedString {
+            left: Box::new(str),
+            right: Box::new(acc),
+            whitespace_between: Default::default(),
+            lpar: Default::default(),
+            rpar: Default::default(),
+            right_tok: tok,
+        });
+        ret
+    })
+}
+
+fn make_fstring_expression<'a>(
+    lbrace_tok: TokenRef<'a>,
+    expression: Expression<'a>,
+    eq: Option<TokenRef<'a>>,
+    conversion_pair: Option<(TokenRef<'a>, &'a str)>,
+    format_pair: Option<(TokenRef<'a>, Vec<FormattedStringContent<'a>>)>,
+    rbrace_tok: TokenRef<'a>,
+) -> FormattedStringExpression<'a> {
+    let equal = eq.map(make_assign_equal);
+    let (conversion_tok, conversion) = if let Some((t, c)) = conversion_pair {
+        (Some(t), Some(c))
+    } else {
+        (None, None)
+    };
+    let (format_tok, format_spec) = if let Some((t, f)) = format_pair {
+        (Some(t), Some(f))
+    } else {
+        (None, None)
+    };
+    let after_expr_tok = if equal.is_some() {
+        None
+    } else if let Some(tok) = conversion_tok {
+        Some(tok)
+    } else if let Some(tok) = format_tok {
+        Some(tok)
+    } else {
+        Some(rbrace_tok)
+    };
+
+    FormattedStringExpression {
+        expression,
+        conversion,
+        format_spec,
+        whitespace_before_expression: Default::default(),
+        whitespace_after_expression: Default::default(),
+        equal,
+        lbrace_tok,
+        after_expr_tok,
+    }
+}
+
+fn make_fstring<'a>(
+    start: &'a str,
+    parts: Vec<FormattedStringContent<'a>>,
+    end: &'a str,
+) -> FormattedString<'a> {
+    FormattedString {
+        start,
+        parts,
+        end,
+        lpar: Default::default(),
+        rpar: Default::default(),
+    }
+}
+
+fn make_finally<'a>(
+    finally_tok: TokenRef<'a>,
+    colon_tok: TokenRef<'a>,
+    body: Suite<'a>,
+) -> Finally<'a> {
+    Finally {
+        body,
+        leading_lines: Default::default(),
+        whitespace_before_colon: Default::default(),
+        finally_tok,
+        colon_tok,
+    }
+}
+
+fn make_except<'a>(
+    except_tok: TokenRef<'a>,
+    exp: Option<Expression<'a>>,
+    as_: Option<(TokenRef<'a>, Name<'a>)>,
+    colon_tok: TokenRef<'a>,
+    body: Suite<'a>,
+) -> ExceptHandler<'a> {
+    // TODO: AsName should come from outside
+    let name = as_.map(|(x, y)| make_as_name(x, AssignTargetExpression::Name(y)));
+    ExceptHandler {
+        body,
+        r#type: exp,
+        name,
+        leading_lines: Default::default(),
+        whitespace_after_except: Default::default(),
+        whitespace_before_colon: Default::default(),
+        except_tok,
+        colon_tok,
+    }
+}
+
+fn make_try<'a>(
+    try_tok: TokenRef<'a>,
+    body: Suite<'a>,
+    handlers: Vec<ExceptHandler<'a>>,
+    orelse: Option<Else<'a>>,
+    finalbody: Option<Finally<'a>>,
+) -> Try<'a> {
+    Try {
+        body,
+        handlers,
+        orelse,
+        finalbody,
+        leading_lines: Default::default(),
+        whitespace_before_colon: Default::default(),
+        try_tok,
+    }
+}
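Note: `make_strings` right-folds adjacent literals, so `"a" "b" "c"` becomes `Concat("a", Concat("b", "c"))`. A runnable sketch with simplified stand-ins for the CST types:

    #[derive(Debug)]
    enum Str {
        Simple(&'static str),
        Concat(Box<Str>, Box<Str>),
    }

    fn make_strings(parts: Vec<&'static str>) -> Str {
        let mut it = parts.into_iter().rev();
        let first = Str::Simple(it.next().expect("no strings"));
        it.fold(first, |acc, s| Str::Concat(Box::new(Str::Simple(s)), Box::new(acc)))
    }

    fn main() {
        println!("{:?}", make_strings(vec!["a", "b", "c"]));
    }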
+fn make_aug_op(tok: TokenRef) -> Result<AugOp> {
+    let whitespace_before = Default::default();
+    let whitespace_after = Default::default();
+
+    Ok(match tok.string {
+        "+=" => AugOp::AddAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "-=" => AugOp::SubtractAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "*=" => AugOp::MultiplyAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "@=" => AugOp::MatrixMultiplyAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "/=" => AugOp::DivideAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "%=" => AugOp::ModuloAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "&=" => AugOp::BitAndAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "|=" => AugOp::BitOrAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "^=" => AugOp::BitXorAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "<<=" => AugOp::LeftShiftAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        ">>=" => AugOp::RightShiftAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "**=" => AugOp::PowerAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        "//=" => AugOp::FloorDivideAssign {
+            whitespace_before,
+            whitespace_after,
+            tok,
+        },
+        _ => return Err(ParserError::OperatorError),
+    })
+}
+
+fn make_aug_assign<'a>(
+    target: AssignTargetExpression<'a>,
+    operator: AugOp<'a>,
+    value: Expression<'a>,
+) -> AugAssign<'a> {
+    AugAssign {
+        target,
+        operator,
+        value,
+        semicolon: Default::default(),
+    }
+}
+
+fn make_with_item<'a>(
+    item: Expression<'a>,
+    as_: Option<TokenRef<'a>>,
+    n: Option<AssignTargetExpression<'a>>,
+) -> WithItem<'a> {
+    let asname = match (as_, n) {
+        (Some(as_), Some(n)) => Some(make_as_name(as_, n)),
+        (None, None) => None,
+        _ => panic!("as and name should be present or missing together"),
+    };
+    WithItem {
+        item,
+        asname,
+        comma: Default::default(),
+    }
+}
+
+fn make_with<'a>(
+    async_tok: Option<TokenRef<'a>>,
+    with_tok: TokenRef<'a>,
+    items: Vec<WithItem<'a>>,
+    colon_tok: TokenRef<'a>,
+    body: Suite<'a>,
+) -> With<'a> {
+    let asynchronous = async_tok.as_ref().map(|_| Asynchronous {
+        whitespace_after: Default::default(),
+    });
+    With {
+        items,
+        body,
+        asynchronous,
+        leading_lines: Default::default(),
+        whitespace_after_with: Default::default(),
+        whitespace_before_colon: Default::default(),
+        async_tok,
+        with_tok,
+        colon_tok,
+    }
+}
+
+fn make_del<'a>(tok: TokenRef<'a>, target: DelTargetExpression<'a>) -> Del<'a> {
+    Del {
+        target,
+        whitespace_after_del: Default::default(),
+        semicolon: Default::default(),
+        tok,
+    }
+}
+
+fn make_del_tuple<'a>(
+    lpar: Option<LeftParen<'a>>,
+    elements: Vec<Element<'a>>,
+    rpar: Option<RightParen<'a>>,
+) -> DelTargetExpression<'a> {
+    DelTargetExpression::Tuple(Tuple {
+        elements,
+        lpar: lpar.map(|x| vec![x]).unwrap_or_default(),
+        rpar: rpar.map(|x| vec![x]).unwrap_or_default(),
+    })
+}
+
+fn make_named_expr<'a>(name: Name<'a>, tok: TokenRef<'a>, expr: Expression<'a>) -> NamedExpr<'a> {
+    NamedExpr {
+        target: Box::new(Expression::Name(name)),
+        value: Box::new(expr),
+        lpar: Default::default(),
+        rpar: Default::default(),
+        whitespace_before_walrus: Default::default(),
+        whitespace_after_walrus: Default::default(),
+        walrus_tok: tok,
+    }
+}
diff --git a/native/libcst/src/parser/mod.rs b/native/libcst/src/parser/mod.rs
new file mode 100644
index 000000000..76094afb9
--- /dev/null
+++ b/native/libcst/src/parser/mod.rs
@@ -0,0 +1,6 @@
+mod errors;
+mod grammar;
+mod numbers;
+
+pub use errors::ParserError;
+pub use grammar::{python, Result};
diff --git a/native/libcst/src/parser/numbers.rs b/native/libcst/src/parser/numbers.rs
new file mode 100644
index 000000000..abe230657
--- /dev/null
+++ b/native/libcst/src/parser/numbers.rs
@@ -0,0 +1,68 @@
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+use crate::{Expression, Float, Imaginary, Integer};
+
+static HEX: &str = r"0[xX](?:_?[0-9a-fA-F])+";
+static BIN: &str = r"0[bB](?:_?[01])+";
+static OCT: &str = r"0[oO](?:_?[0-7])+";
+static DECIMAL: &str = r"(?:0(?:_?0)*|[1-9](?:_?[0-9])*)";
+
+static INTEGER_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(format!("^({}|{}|{}|{})$", HEX, BIN, OCT, DECIMAL).as_str()).expect("regex")
+});
+
+static EXPONENT: &str = r"[eE][-+]?[0-9](?:_?[0-9])*";
+// Note: these don't exactly match the python implementation (exponent is not included)
+static POINT_FLOAT: &str = r"([0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?|\.[0-9](?:_?[0-9])*)";
+static EXP_FLOAT: &str = r"[0-9](?:_?[0-9])*";
+
+static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(
+        format!(
+            "^({}({})?|{}{})$",
+            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+        )
+        .as_str(),
+    )
+    .expect("regex")
+});
+
+static IMAGINARY_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(
+        format!(
+            r"^([0-9](?:_?[0-9])*[jJ]|({}({})?|{}{})[jJ])$",
+            POINT_FLOAT, EXPONENT, EXP_FLOAT, EXPONENT
+        )
+        .as_str(),
+    )
+    .expect("regex")
+});
+
+pub(crate) fn parse_number(raw: &str) -> Expression {
+    if INTEGER_RE.is_match(raw) {
+        Expression::Integer(Integer {
+            value: raw,
+            lpar: Default::default(),
+            rpar: Default::default(),
+        })
+    } else if FLOAT_RE.is_match(raw) {
+        Expression::Float(Float {
+            value: raw,
+            lpar: Default::default(),
+            rpar: Default::default(),
+        })
+    } else if IMAGINARY_RE.is_match(raw) {
+        Expression::Imaginary(Imaginary {
+            value: raw,
+            lpar: Default::default(),
+            rpar: Default::default(),
+        })
+    } else {
+        Expression::Integer(Integer {
+            value: raw,
+            lpar: Default::default(),
+            rpar: Default::default(),
+        })
+    }
+}
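Note: `parse_number` only classifies a literal that the tokenizer has already validated, falling through INTEGER_RE -> FLOAT_RE -> IMAGINARY_RE and defaulting to Integer. A test-style sketch (not part of the diff) of the expected classification:

    #[cfg(test)]
    mod number_sketch_tests {
        use super::*;

        #[test]
        fn classifies_literals() {
            assert!(matches!(parse_number("0x_ff"), Expression::Integer(_)));
            assert!(matches!(parse_number("1_000.5e-3"), Expression::Float(_)));
            assert!(matches!(parse_number("2j"), Expression::Imaginary(_)));
            // anything else falls back to Integer; malformed literals are
            // rejected by the tokenizer before reaching this function
        }
    }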
diff --git a/native/libcst/src/py.rs b/native/libcst/src/py.rs
new file mode 100644
index 000000000..827e6b2d6
--- /dev/null
+++ b/native/libcst/src/py.rs
@@ -0,0 +1,25 @@
+use pyo3::prelude::*;
+
+#[pymodule]
+#[pyo3(name = "native")]
+pub fn libcst_native(_py: Python, m: &PyModule) -> PyResult<()> {
+    #[pyfn(m)]
+    fn parse_module(source: String, encoding: Option<&str>) -> PyResult<PyObject> {
+        let m = crate::parse_module(source.as_str(), encoding)?;
+        Python::with_gil(|py| Ok(m.into_py(py)))
+    }
+
+    #[pyfn(m)]
+    fn parse_expression(source: String) -> PyResult<PyObject> {
+        let expr = crate::parse_expression(source.as_str())?;
+        Python::with_gil(|py| Ok(expr.into_py(py)))
+    }
+
+    #[pyfn(m)]
+    fn parse_statement(source: String) -> PyResult<PyObject> {
+        let stm = crate::parse_statement(source.as_str())?;
+        Python::with_gil(|py| Ok(stm.into_py(py)))
+    }
+
+    Ok(())
+}
diff --git a/native/libcst/src/tokenizer/core/LICENSE b/native/libcst/src/tokenizer/core/LICENSE
new file mode 100644
index 000000000..7e9199f06
--- /dev/null
+++ b/native/libcst/src/tokenizer/core/LICENSE
@@ -0,0 +1,46 @@
+PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
+
+1. This LICENSE AGREEMENT is between the Python Software Foundation
+("PSF"), and the Individual or Organization ("Licensee") accessing and
+otherwise using this software ("Python") in source or binary form and
+its associated documentation.
+
+2. Subject to the terms and conditions of this License Agreement, PSF hereby
+grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce,
+analyze, test, perform and/or display publicly, prepare derivative works,
+distribute, and otherwise use Python alone or in any derivative version,
+provided, however, that PSF's License Agreement and PSF's notice of copyright,
+i.e., "Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+2011, 2012, 2013, 2014, 2015 Python Software Foundation; All Rights Reserved"
+are retained in Python alone or in any derivative version prepared by Licensee.
+
+3.
In the event Licensee prepares a derivative work that is based on +or incorporates Python or any part thereof, and wants to make +the derivative work available to others as provided herein, then +Licensee hereby agrees to include in any such work a brief summary of +the changes made to Python. + +4. PSF is making Python available to Licensee on an "AS IS" +basis. PSF MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR +IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, PSF MAKES NO AND +DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS +FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF PYTHON WILL NOT +INFRINGE ANY THIRD PARTY RIGHTS. + +5. PSF SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF PYTHON +FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS +A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING PYTHON, +OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material +breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any +relationship of agency, partnership, or joint venture between PSF and +Licensee. This License Agreement does not grant permission to use PSF +trademarks or trade name in a trademark sense to endorse or promote +products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using Python, Licensee +agrees to be bound by the terms and conditions of this License +Agreement. diff --git a/native/libcst/src/tokenizer/core/README.md b/native/libcst/src/tokenizer/core/README.md new file mode 100644 index 000000000..dfef60f4e --- /dev/null +++ b/native/libcst/src/tokenizer/core/README.md @@ -0,0 +1,2 @@ +Files in this directory are a derivative of CPython's tokenizer, and are +therefore available under the PSF license. diff --git a/native/libcst/src/tokenizer/core/mod.rs b/native/libcst/src/tokenizer/core/mod.rs new file mode 100644 index 000000000..7c9f06841 --- /dev/null +++ b/native/libcst/src/tokenizer/core/mod.rs @@ -0,0 +1,1144 @@ +// This implementation is Copyright (c) Facebook, Inc. and its affiliates. +// +// CPython 3.10.0a5 and the original C code this is based on is +// Copyright (c) 2001-2021 Python Software Foundation; All Rights Reserved +// +// Portions of this module (f-string splitting) are based on parso's tokenize.py, which is also PSF +// licensed. + +/// A port of CPython's tokenizer.c to Rust, with the following significant modifications: +/// +/// - PEP 263 (encoding detection) support isn't implemented. We depend on other code to do this for +/// us right now, and expect that the input is utf-8 by the time we see it. +/// +/// - Removed support for tokenizing from a file handle without reading the whole file in at once. +/// This significantly complicates parsing and memory is cheap, so we require that the whole file +/// is read in and converted to a unicode string before tokenization can begin. +/// +/// - Removed support for the interactive interpreter parsing mode. +/// +/// - Tweaked the `translate_newlines` functionality and moved most of it into TextPosition. `\r` +/// characters are no longer removed from the input buffer, so strings may contain `\r` characters +/// that should be normalized prior to being interpreted. +/// +/// - Added support for tracking more detailed position information via TextPosition. 
As a
+/// consequence, consuming and then backing up a character (`tok_nextc`/`tok_backup`) is more
+/// expensive, and we prefer to call `TextPosition::peek()` instead.
+///
+/// - Removed support for tokenizing type comments.
+///
+/// - Reduced the number of different supported token types to match what parso's tokenizer yields.
+///
+/// - Uses some regular expressions. Regular expressions are a good fit for a tokenizer, but we
+///   don't use them everywhere, because we can't generate error messages that are as good with
+///   them.
+///
+/// - Added support for breaking apart f-strings into multiple tokens, matching Parso's tokenizer
+///   behavior. CPython instead runs the parser recursively to parse f-strings.
+///
+/// Also, in general, the code is less tightly optimized. The CPython implementation is heavily
+/// optimized in ways that wouldn't translate well to Rust (e.g. it parses the input utf-8 buffer
+/// as raw bytes instead of unicode codepoints).
+///
+/// The implementation should still be faster than any pure-Python implementation, and most
+/// optimizations (avoiding string copies when slicing) carry over to Rust very well.
+///
+/// Planned (not yet implemented) features:
+///
+/// - Add more feature flags to more closely match the behavior of older versions of Python 3.x.
+///
+/// - Support for a Python 2 mode that tokenizes Python 2.7 code and fails on certain new Python 3
+///   syntax that wasn't supported in 2.7.
+///
+/// - Maybe add back support for tokenizing type comments?
+///
+/// This implementation is tailored to LibCST's needs. If you're looking for a more general-purpose
+/// pure-Rust Python parser, consider using [RustPython's parser][].
+///
+/// [RustPython's parser]: https://crates.io/crates/rustpython-parser
+mod string_types;
+
+use once_cell::sync::Lazy;
+use regex::Regex;
+use std::cell::RefCell;
+use std::cmp::Ordering;
+use std::convert::TryInto;
+use std::fmt::Debug;
+use std::fmt::Formatter;
+use std::rc::Rc;
+
+use crate::tokenizer::{
+    core::string_types::{FStringNode, StringQuoteChar, StringQuoteSize},
+    operators::OPERATOR_RE,
+    text_position::{TextPosition, TextPositionSnapshot},
+    whitespace_parser::State as WhitespaceState,
+};
+
+/// The maximum number of indentation levels at any given point in time. CPython's tokenizer.c caps
+/// this to avoid the complexity of allocating a dynamic array. We're using a Vec, so the cap isn't
+/// necessary, but we keep it to maintain compatibility.
+const MAX_INDENT: usize = 100;
+
+// MAX_CHAR should be std::char::MAX once assoc_char_consts is stabilized.
+// https://github.com/rust-lang/rust/issues/71763
+const MAX_CHAR: char = '\u{10ffff}';
+
+static SPACE_TAB_FORMFEED_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[ \f\t]+").expect("regex"));
+static ANY_NON_NEWLINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[^\r\n]+").expect("regex"));
+static STRING_PREFIX_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\A(?i)(u|[bf]r|r[bf]|r|b|f)").expect("regex"));
+static POTENTIAL_IDENTIFIER_TAIL_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\A([a-zA-Z0-9_]|[^\x00-\x7f])+").expect("regex"));
+static DECIMAL_DOT_DIGIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A\.[0-9]").expect("regex"));
+static DECIMAL_TAIL_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\A[0-9](_?[0-9])*").expect("regex"));
+static HEXADECIMAL_TAIL_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\A(_?[0-9a-fA-F])+").expect("regex"));
+static OCTAL_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[0-7])+").expect("regex"));
+static BINARY_TAIL_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(_?[01])+").expect("regex"));
+
+/// Used to verify identifiers when there's a non-ascii character in them.
+// This changes across unicode revisions. We'd need to ship our own unicode tables to 100% match a
+// given Python version's behavior.
+static UNICODE_IDENTIFIER_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\A[\p{XID_Start}_]\p{XID_Continue}*\z").expect("regex"));
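Note: every pattern above is anchored with `\A` because the tokenizer's `consume()` should only ever match at the current cursor position, never scan ahead. An assumed illustration using the regex crate (as this module does):

    fn demo() {
        let re = regex::Regex::new(r"\A[ \f\t]+").unwrap();
        assert!(re.find("  x").map(|m| m.as_str()) == Some("  "));
        assert!(re.find("x  ").is_none()); // no match unless it starts the input
    }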
+#[derive(Debug, Eq, PartialEq, Copy, Clone)]
+pub enum TokType {
+    String,
+    Name,
+    Number,
+    Op,
+    Newline,
+    Indent,
+    Dedent,
+    Async,
+    Await,
+    // TODO: add support for these
+    #[allow(dead_code)]
+    FStringStart,
+    #[allow(dead_code)]
+    FStringString,
+    #[allow(dead_code)]
+    FStringEnd,
+    EndMarker,
+}
+
+#[derive(Debug, thiserror::Error, Eq, PartialEq)]
+pub enum TokError<'t> {
+    #[error("inconsistent mixing of tabs and spaces")]
+    TabSpace,
+    #[error("too many indentation levels")]
+    TooDeep,
+    #[error("no matching outer block for dedent")]
+    Dedent,
+    #[error("unexpected characters after a line continuation")]
+    LineContinuation,
+    #[error("unexpected end of file after a line continuation")]
+    LineContinuationEof,
+    #[error("{0:?} is not a valid identifier")]
+    BadIdentifier(&'t str),
+    #[error("invalid decimal literal")]
+    BadDecimal,
+    #[error(
+        "{}{}",
+        "leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal ",
+        "integers"
+    )]
+    BadDecimalLeadingZeros,
+    #[error("invalid hexadecimal literal")]
+    BadHexadecimal,
+    #[error("invalid octal literal")]
+    BadOctal,
+    #[error("invalid digit {0:?} in octal literal")]
+    BadOctalDigit(char),
+    #[error("invalid binary literal")]
+    BadBinary,
+    #[error("invalid digit {0:?} in binary literal")]
+    BadBinaryDigit(char),
+    #[error("unterminated string literal")]
+    UnterminatedString,
+    #[error("unterminated triple-quoted string literal")]
+    UnterminatedTripleQuotedString,
+    #[error("unmatched {0:?}")]
+    UnmatchedClosingParen(char),
+    #[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?}")]
+    MismatchedClosingParen(char, char),
+    #[error("Closing parenthesis {1:?} does not match opening parenthesis {0:?} on line {2:}")]
+    MismatchedClosingParenOnLine(char, char, usize),
+    #[error("{0:?} is not a valid character in this position")]
+    BadCharacter(char),
+}
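Note: because `TokError` derives `thiserror::Error`, each variant carries a ready-made `Display` message. A test-style sketch of the assumed rendering:

    #[cfg(test)]
    mod tok_error_sketch {
        use super::*;

        #[test]
        fn renders_message() {
            let err = TokError::BadIdentifier("1abc");
            // {0:?} debug-formats the &str, so the name appears quoted
            assert_eq!(err.to_string(), "\"1abc\" is not a valid identifier");
        }
    }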
+// Clone is used for async_hacks, which needs to speculatively look-ahead one token.
+#[derive(Clone)]
+pub struct TokState<'t> {
+    /// The full program's source code (similar to `tok->str` or `tok->buf` in the CPython source
+    /// code). We don't support reading the file line-by-line from a file handle like CPython does,
+    /// so this is the whole program pre-converted to utf-8.
+    pub text_pos: TextPosition<'t>,
+    /// Start of the most recently returned token.
+    pub start_pos: TextPositionSnapshot,
+    /// True after we've encountered an error or there's no more text to process.
+    done: bool,
+    /// How many spaces a tab counts as (always 8)
+    tab_size: usize,
+    /// How many spaces a tab counts as in alt_indent_stack (always 1)
+    alt_tab_size: usize,
+    /// Stack of indentation levels where a tab is counted as 8 characters, used for tracking
+    /// dedents. Length is current indentation level. Should never have more than MAX_INDENT
+    /// entries.
+    indent_stack: Vec<usize>,
+    /// Used to check that tabs and spaces are not mixed.
+    alt_indent_stack: Vec<usize>,
+    /// Beginning of line. True if at the beginning of a new line.
+    at_bol: bool,
+    /// The number of bytes at the beginning of the line, as measured by consume_bol_whitespace.
+    /// Used by libcst to capture (and then validate and parse) the indentation.
+    pub bol_width: usize,
+    /// Set by `consume_bol_whitespace`, true if the current line is blank.
+    blank_line: bool,
+    /// Pending indents (if > 0) or dedents (if < 0). Used when multiple tokens need to be produced
+    /// at once.
+    pending_indents: i32,
+    /// Length is `() [] {}` parenthesis nesting level. Used to allow free continuations inside
+    /// them. Stack entries are to verify that closing parenthesis match opening parenthesis.
+    /// Tuple is (character, lineno).
+    paren_stack: Vec<(char, usize)>,
+    /// Whether we're in a continuation line.
+    cont_line: bool,
+
+    /// True if async/await aren't always keywords.
+    async_hacks: bool,
+    /// True if tokens are inside an 'async def' body.
+    async_def: bool,
+    /// Indentation level of the outermost 'async def'.
+    async_def_indent: usize,
+    /// True if the outermost 'async def' had at least one NEWLINE token after it.
+    async_def_nl: bool,
+
+    /// Splits f-strings into multiple tokens instead of a STRING token if true.
+    ///
+    /// CPython doesn't directly split f-strings in the tokenizer (and therefore doesn't support
+    /// this option). Instead, when the parser encounters an f-string, it recursively re-runs the
+    /// tokenizer and parser.
+    ///
+    /// Supporting this at the tokenizer-level is pretty nasty and adds a lot of complexity.
+    /// Eventually, we should probably support this at the parser-level instead.
+    split_fstring: bool,
+    fstring_stack: Vec<FStringNode>,
+
+    missing_nl_before_eof: bool,
+}
+
+pub struct TokConfig {
+    /// Used in Python 3.5 and 3.6. If enabled, async/await are sometimes keywords and sometimes
+    /// identifiers, depending on if they're being used in the context of an async function. This
+    /// breaks async comprehensions outside of async functions.
+    pub async_hacks: bool,
+    pub split_fstring: bool,
+    // Not currently supported:
+    // type_comments: bool,
+}
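Note: since `TokState` implements `Iterator` (see the impl further down), driving the state machine directly is just a loop over `Result<TokType, TokError>`. A minimal sketch, assuming the `TokConfig` fields shown above:

    fn sketch() {
        let config = TokConfig { async_hacks: false, split_fstring: true };
        let mut state = TokState::new("x = 1\n", &config);
        while let Some(tok) = state.next() {
            match tok {
                Ok(tok_type) => println!("{:?}", tok_type),
                Err(err) => { eprintln!("{}", err); break; }
            }
        }
    }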
+fn is_digit<C: Into<Option<char>>>(ch: C) -> bool {
+    matches!(ch.into(), Some('0'..='9'))
+}
+
+#[derive(Debug)]
+enum NumberState {
+    StartDigit,
+    Fraction,
+    Exponent,
+    Imaginary,
+}
+
+impl<'t> TokState<'t> {
+    pub fn new(text: &'t str, config: &TokConfig) -> Self {
+        let text_pos = TextPosition::new(text);
+        let start_pos = (&text_pos).into();
+        Self {
+            text_pos,
+            start_pos,
+            done: false,
+            tab_size: 8,
+            alt_tab_size: 1,
+            indent_stack: Vec::new(),
+            alt_indent_stack: Vec::new(),
+            at_bol: true,
+            bol_width: 0,
+            blank_line: false,
+            pending_indents: 0,
+            paren_stack: Vec::new(),
+            cont_line: false,
+            async_hacks: config.async_hacks,
+            async_def: false,
+            async_def_indent: 0,
+            async_def_nl: false,
+            split_fstring: config.split_fstring,
+            fstring_stack: Vec::new(),
+            missing_nl_before_eof: text.is_empty() || text.as_bytes()[text.len() - 1] != b'\n',
+        }
+    }
+
+    pub fn is_parenthesized(&self) -> bool {
+        !self.paren_stack.is_empty()
+    }
+
+    /// Implementation of `next()`, wrapped by next() to allow for easier error handling. Roughly
+    /// equivalent to `tok_get` in the C source code.
+    fn next_inner(&mut self) -> Result<TokType, TokError<'t>> {
+        if self.split_fstring {
+            if let Some(tos) = self.fstring_stack.last() {
+                if !tos.is_in_expr() {
+                    self.start_pos = (&self.text_pos).into();
+                    let is_in_format_spec = tos.is_in_format_spec();
+                    let is_raw_string = tos.is_raw_string;
+                    if let Some(tok) =
+                        self.maybe_consume_fstring_string(is_in_format_spec, is_raw_string)?
+                    {
+                        return Ok(tok);
+                    }
+                    if let Some(tok) = self.maybe_consume_fstring_end() {
+                        return Ok(tok);
+                    }
+                }
+            }
+        }
+
+        // This will never consume a token, but it may set blank_line and it may set
+        // pending_indents.
+        self.consume_bol_whitespace()?;
+
+        // Return pending indents/dedents
+        if let Some(t) = self.process_pending_indents() {
+            self.start_pos = (&self.text_pos).into();
+            return Ok(t);
+        }
+
+        self.maybe_close_async_def();
+
+        'again: loop {
+            // Skip spaces
+            self.text_pos.consume(&*SPACE_TAB_FORMFEED_RE);
+
+            // Skip comment, unless it's a type comment
+            if self.text_pos.peek() == Some('#') {
+                self.text_pos.consume(&*ANY_NON_NEWLINE_RE);
+                // type_comment is not supported
+            }
+
+            // Set start of current token
+            self.start_pos = (&self.text_pos).into();
+
+            return match self.text_pos.peek() {
+                // Check for EOF now
+                None => {
+                    if self.missing_nl_before_eof
+                        && self.text_pos.byte_column_number() != self.bol_width
+                        && !self.blank_line
+                    {
+                        self.at_bol = true;
+                        self.missing_nl_before_eof = false;
+                        Ok(TokType::Newline)
+                    } else {
+                        Ok(TokType::EndMarker)
+                    }
+                }
+
+                // Identifier (most frequent token!)
+ Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR) => { + self.consume_identifier_or_prefixed_string() + } + + // Newline + Some('\n') => { + self.text_pos.next(); + self.at_bol = true; + if self.split_fstring + && !self.fstring_stack.iter().all(|node| node.allow_multiline()) + { + Err(TokError::UnterminatedString) + } else if self.blank_line || !self.paren_stack.is_empty() { + // this newline doesn't count + // recurse (basically `goto nextline`) + self.next_inner() + } else { + self.cont_line = false; + if self.async_def { + self.async_def_nl = true; + } + Ok(TokType::Newline) + } + } + + // Ellipsis + Some('.') if self.text_pos.consume("...") => { + return Ok(TokType::Op); + } + + // Number starting with period + Some('.') if self.text_pos.matches(&*DECIMAL_DOT_DIGIT_RE) => { + self.consume_number(NumberState::Fraction) + } + + // Dot + Some('.') => { + self.text_pos.next(); + Ok(TokType::Op) + } + + // Number + Some('0'..='9') => self.consume_number(NumberState::StartDigit), + + // String + Some('\'') | Some('"') => self.consume_string(), + + // Line continuation + Some('\\') => { + self.text_pos.next(); + if let Some('\n') = self.text_pos.next() { + if self.text_pos.peek() == None { + Err(TokError::LineContinuationEof) + } else { + self.cont_line = true; + // Read next line + continue 'again; + } + } else { + Err(TokError::LineContinuation) + } + } + + Some(ch @ '(') | Some(ch @ '[') | Some(ch @ '{') => { + self.text_pos.next(); + if let Some(tos) = self.fstring_stack.last_mut() { + tos.open_parentheses(); + } + self.paren_stack.push((ch, self.text_pos.line_number())); + Ok(TokType::Op) + } + + Some(closing @ ')') | Some(closing @ ']') | Some(closing @ '}') => { + self.text_pos.next(); + if let Some(tos) = self.fstring_stack.last_mut() { + tos.close_parentheses(); + } + if let Some((opening, line_number)) = self.paren_stack.pop() { + match (opening, closing) { + ('(', ')') | ('[', ']') | ('{', '}') => Ok(TokType::Op), + _ => { + if line_number != self.text_pos.line_number() { + Err(TokError::MismatchedClosingParenOnLine( + opening, + closing, + line_number, + )) + } else { + Err(TokError::MismatchedClosingParen(opening, closing)) + } + } + } + } else { + Err(TokError::UnmatchedClosingParen(closing)) + } + } + + Some(':') + if self + .fstring_stack + .last() + .map(|tos| tos.parentheses_count - tos.format_spec_count == 1) + .unwrap_or(false) => + { + // N.B. This may capture the walrus operator and pass it to the formatter. + // That's intentional. PEP 572 says: "Assignment expressions inside of f-strings + // require parentheses." + // + // >>> f'{x:=10}' # Valid, passes '=10' to formatter + let tos = self + .fstring_stack + .last_mut() + .expect("fstring_stack is not empty"); + tos.format_spec_count += 1; + self.text_pos.next(); + Ok(TokType::Op) + } + + // Operator + Some(_) if self.text_pos.consume(&*OPERATOR_RE) => Ok(TokType::Op), + + // Bad character + // If nothing works, fall back to this error. CPython returns an OP in this case, + // and then just relies on the parser to generate a generic syntax error. + Some(ch) => Err(TokError::BadCharacter(ch)), + }; + } + } + + /// Consumes the whitespace (and comments) at the beginning of the line. May emit an error. Will + /// mutate `pending_indents`, so you must check `pending_indents` after calling this. 
+ fn consume_bol_whitespace(&mut self) -> Result<(), TokError<'t>> { + self.blank_line = false; + if !self.at_bol { + return Ok(()); + } + + let mut col = 0; // column where tab counts as 8 characters + let mut altcol = 0; // column where tab counts as 1 character + self.at_bol = false; + self.bol_width = 0; + + // consume space, tab, and formfeed characters + loop { + match self.text_pos.peek() { + Some(' ') => { + col += 1; + altcol += 1; + self.bol_width += 1; + self.text_pos.next(); + } + Some('\t') => { + // Increment both col and altcol using different tab sizes. Tabs snap to the + // next multiple of self.tab_size. + col = (col / self.tab_size + 1) * self.tab_size; + // altcol will later be used for detecting mixed tabs and spaces. + altcol = (altcol / self.alt_tab_size + 1) * self.alt_tab_size; + self.bol_width += 1; + self.text_pos.next(); + } + // Control-L (formfeed) for emacs users + Some('\x0c') => { + col = 0; + altcol = 0; + self.bol_width += 1; + self.text_pos.next(); + } + _ => { + break; + } + } + } + + // Lines with only whitespace and/or comments and/or a line continuation character shouldn't + // affect the indentation and are not passed to the parser as NEWLINE tokens. + self.blank_line = matches!(self.text_pos.peek(), Some('#') | Some('\n') | Some('\\')); + + if self.blank_line || !self.paren_stack.is_empty() { + return Ok(()); + } + + let prev_col = self.indent_stack.last().unwrap_or(&0); + match col.cmp(prev_col) { + Ordering::Equal => { + // No change + if altcol != *self.alt_indent_stack.last().unwrap_or(&0) { + return Err(TokError::TabSpace); + } + } + Ordering::Greater => { + // col > prev_col + // Indent -- always one + if self.indent_stack.len() + 1 >= MAX_INDENT { + return Err(TokError::TooDeep); + } + // col > prev_col, therefore altcol > prev_altcol, unless there's badly mixed tabs + // and spaces + if altcol <= *self.alt_indent_stack.last().unwrap_or(&0) { + return Err(TokError::TabSpace); + } + // only emit indents if we're not at EOF + if self.text_pos.peek().is_some() { + self.pending_indents += 1; + self.indent_stack.push(col); + self.alt_indent_stack.push(altcol); + } + } + Ordering::Less => { + // c < prev_col + // Dedent -- any number, must be consistent + while matches!(self.indent_stack.last(), Some(&ind_cols) if col < ind_cols) { + self.pending_indents -= 1; + self.indent_stack.pop(); + self.alt_indent_stack.pop(); + } + if col != *self.indent_stack.last().unwrap_or(&0) { + return Err(TokError::Dedent); + } + if altcol != *self.alt_indent_stack.last().unwrap_or(&0) { + return Err(TokError::TabSpace); + } + } + } + + Ok(()) + } + + fn process_pending_indents(&mut self) -> Option { + if self.pending_indents != 0 { + if self.pending_indents < 0 { + self.pending_indents += 1; + Some(TokType::Dedent) + } else { + self.pending_indents -= 1; + Some(TokType::Indent) + } + } else { + None + } + } + + fn maybe_close_async_def(&mut self) { + // Check if we are closing an async function + if self.async_def + && !self.blank_line + // (This is irrelevant to the rust implementation which doesn't support type_comments + // yet, but the comment is preserved for posterity) + // Due to some implementation artifacts of type comments, a TYPE_COMMENT at the start of + // a function won't set an indentation level and it will produce a NEWLINE after it. To + // avoid spuriously ending an async function due to this, wait until we have some + // non-newline char in front of us. 
+        // && self.text_pos.peek() == Some('\n')
+        && self.paren_stack.is_empty()
+        // There was a NEWLINE after ASYNC DEF, so we're past the signature.
+        && self.async_def_nl
+        // Current indentation level is less than where the async function was defined
+        && self.async_def_indent >= self.indent_stack.len()
+        {
+            self.async_def = false;
+            self.async_def_indent = 0;
+            self.async_def_nl = false;
+        }
+    }
+
+    fn consume_identifier_or_prefixed_string(&mut self) -> Result<TokType, TokError<'t>> {
+        // Process the various legal combinations of b"", r"", u"", and f"".
+        if self.text_pos.consume(&*STRING_PREFIX_RE) {
+            if let Some('"') | Some('\'') = self.text_pos.peek() {
+                // We found a string, not an identifier. Bail!
+                if self.split_fstring
+                    && self
+                        .text_pos
+                        .slice_from_start_pos(&self.start_pos)
+                        .contains(&['f', 'F'][..])
+                {
+                    return self.consume_fstring_start();
+                } else {
+                    return self.consume_string();
+                }
+            }
+        } else {
+            // the next character must be a potential identifier start, aka `[a-zA-Z_]|[^\x00-\x7f]`
+            let first_ch = self.text_pos.next();
+            debug_assert!(matches!(
+                first_ch,
+                Some('a'..='z') | Some('A'..='Z') | Some('_') | Some('\u{80}'..=MAX_CHAR)
+            ));
+        }
+        self.text_pos.consume(&*POTENTIAL_IDENTIFIER_TAIL_RE);
+        let identifier_str = self.text_pos.slice_from_start_pos(&self.start_pos);
+        if !verify_identifier(identifier_str) {
+            // TODO: async/await
+            return Err(TokError::BadIdentifier(identifier_str));
+        }
+
+        let allow_async = !self.async_hacks || self.async_def;
+        match (identifier_str, allow_async) {
+            ("async", true) => Ok(TokType::Async),
+            ("await", true) => Ok(TokType::Await),
+            ("async", false) => {
+                // The current token is 'async' and async_hacks is enabled.
+                // Look ahead one token to see if that is 'def'.
+                // This clone is expensive, but modern code doesn't need async_hacks.
+                let mut lookahead_state = self.clone();
+                if lookahead_state.next_inner() == Ok(TokType::Name)
+                    && lookahead_state
+                        .text_pos
+                        .slice_from_start_pos(&lookahead_state.start_pos)
+                        == "def"
+                {
+                    self.async_def = true;
+                    self.async_def_indent = self.indent_stack.len();
+                    Ok(TokType::Async)
+                } else {
+                    Ok(TokType::Name)
+                }
+            }
+            _ => Ok(TokType::Name),
+        }
+    }
+
+    fn consume_number(&mut self, state: NumberState) -> Result<TokType, TokError<'t>> {
+        // This is organized as a state machine. The match could also be rewritten into multiple
+        // functions, but this is closer to how the C code is written (with gotos).
+ match state { + NumberState::StartDigit => { + let start_digit_ch = self.text_pos.peek(); + debug_assert!(is_digit(start_digit_ch)); + + if start_digit_ch == Some('0') { + self.text_pos.next(); + match self.text_pos.peek() { + Some('x') | Some('X') => { + self.text_pos.next(); + if !self.text_pos.consume(&*HEXADECIMAL_TAIL_RE) + || self.text_pos.peek() == Some('_') + { + Err(TokError::BadHexadecimal) + } else { + Ok(TokType::Number) + } + } + Some('o') | Some('O') => { + self.text_pos.next(); + if !self.text_pos.consume(&*OCTAL_TAIL_RE) + || self.text_pos.peek() == Some('_') + { + return Err(TokError::BadOctal); + } + if let Some(next_ch) = self.text_pos.peek() { + if is_digit(next_ch) { + return Err(TokError::BadOctalDigit(next_ch)); + } + } + Ok(TokType::Number) + } + Some('b') | Some('B') => { + self.text_pos.next(); + if !self.text_pos.consume(&*BINARY_TAIL_RE) + || self.text_pos.peek() == Some('_') + { + return Err(TokError::BadBinary); + } + if let Some(next_ch) = self.text_pos.peek() { + if is_digit(next_ch) { + return Err(TokError::BadBinaryDigit(next_ch)); + } + } + Ok(TokType::Number) + } + _ => { + let mut nonzero = false; + // Maybe old-style octal. In any case, allow '0' as a literal + loop { + if self.text_pos.peek() == Some('_') { + self.text_pos.next(); + if !is_digit(self.text_pos.peek()) { + return Err(TokError::BadDecimal); + } + } + if self.text_pos.peek() != Some('0') { + break; + } + self.text_pos.next(); + } + if is_digit(self.text_pos.peek()) { + nonzero = true; + self.consume_decimal_tail()?; + } + if self.text_pos.peek() == Some('.') { + self.consume_number(NumberState::Fraction) + } else if let Some('e') | Some('E') = self.text_pos.peek() { + self.consume_number(NumberState::Exponent) + } else if let Some('j') | Some('J') = self.text_pos.peek() { + self.consume_number(NumberState::Imaginary) + } else if nonzero { + Err(TokError::BadDecimalLeadingZeros) + } else { + Ok(TokType::Number) + } + } + } + } else { + self.consume_decimal_tail()?; + if self.text_pos.peek() == Some('.') { + self.consume_number(NumberState::Fraction) + } else if let Some('e') | Some('E') = self.text_pos.peek() { + self.consume_number(NumberState::Exponent) + } else if let Some('j') | Some('J') = self.text_pos.peek() { + self.consume_number(NumberState::Imaginary) + } else { + Ok(TokType::Number) + } + } + } + NumberState::Fraction => { + let dot_ch = self.text_pos.next(); + debug_assert!(dot_ch == Some('.')); + + if is_digit(self.text_pos.peek()) { + self.consume_decimal_tail()?; + } + if let Some('e') | Some('E') = self.text_pos.peek() { + self.consume_number(NumberState::Exponent) + } else if let Some('j') | Some('J') = self.text_pos.peek() { + self.consume_number(NumberState::Imaginary) + } else { + Ok(TokType::Number) + } + } + NumberState::Exponent => { + let e_ch = self.text_pos.next(); + debug_assert!(matches!(e_ch, Some('e') | Some('E'))); + + if let Some('+') | Some('-') = self.text_pos.peek() { + self.text_pos.next(); + if !is_digit(self.text_pos.peek()) { + return Err(TokError::BadDecimal); + } + } else if !is_digit(self.text_pos.peek()) { + // Don't consume the 'e'. It could be part of an identifier after this number. 
+                    self.text_pos.backup_no_newline();
+                    return Ok(TokType::Number);
+                }
+                self.consume_decimal_tail()?;
+                if let Some('j') | Some('J') = self.text_pos.peek() {
+                    self.consume_number(NumberState::Imaginary)
+                } else {
+                    Ok(TokType::Number)
+                }
+            }
+            NumberState::Imaginary => {
+                let j_ch = self.text_pos.next();
+                debug_assert!(matches!(j_ch, Some('j') | Some('J')));
+
+                Ok(TokType::Number)
+            }
+        }
+    }
+
+    /// Processes a decimal tail. This is the bit after the dot or after an E in a float.
+    fn consume_decimal_tail(&mut self) -> Result<(), TokError<'t>> {
+        let result = self.text_pos.consume(&*DECIMAL_TAIL_RE);
+        // Assumption: If we've been called, the first character is an integer, so we must have a
+        // regex match
+        debug_assert!(result, "try_decimal_tail was called on a non-digit char");
+        if self.text_pos.peek() == Some('_') {
+            Err(TokError::BadDecimal)
+        } else {
+            Ok(())
+        }
+    }
+
+    fn consume_open_quote(&mut self) -> (StringQuoteChar, StringQuoteSize) {
+        let quote_char: StringQuoteChar = self
+            .text_pos
+            .peek()
+            .try_into()
+            .expect("the next character must be a quote when calling consume_open_quote");
+        let triple_quote_pattern = quote_char.triple_str();
+        let quote_size = if self.text_pos.consume(triple_quote_pattern) {
+            StringQuoteSize::Triple
+        } else {
+            self.text_pos.next(); // consume the single character instead
+            StringQuoteSize::Single
+        };
+        (quote_char, quote_size)
+    }
+
+    fn consume_string(&mut self) -> Result<TokType, TokError<'t>> {
+        // Assumption: The opening quote has not been consumed. Leading characters (b, r, f, etc)
+        // have been consumed.
+        let (quote_char, quote_size) = self.consume_open_quote();
+        let quote_raw = quote_char.into();
+
+        let mut end_quote_size: usize = 0;
+        let quote_usize: usize = quote_size.into();
+        while end_quote_size != quote_usize {
+            match (self.text_pos.next(), quote_size) {
+                (None, StringQuoteSize::Triple) => {
+                    return Err(TokError::UnterminatedTripleQuotedString);
+                }
+                (None, StringQuoteSize::Single) | (Some('\n'), StringQuoteSize::Single) => {
+                    return Err(TokError::UnterminatedString);
+                }
+                (ch @ Some('\''), _) | (ch @ Some('"'), _) if ch == Some(quote_raw) => {
+                    end_quote_size += 1;
+                }
+                (Some(ch), _) => {
+                    end_quote_size = 0;
+                    if ch == '\\' {
+                        // skip escaped char
+                        self.text_pos.next();
+                    }
+                }
+            }
+        }
+
+        Ok(TokType::String)
+    }
+
+    fn consume_fstring_start(&mut self) -> Result<TokType, TokError<'t>> {
+        let (quote_char, quote_size) = self.consume_open_quote();
+        let is_raw_string = self
+            .text_pos
+            .slice_from_start_pos(&self.start_pos)
+            .contains(&['r', 'R'][..]);
+        self.fstring_stack
+            .push(FStringNode::new(quote_char, quote_size, is_raw_string));
+        Ok(TokType::FStringStart)
+    }
+
+    fn maybe_consume_fstring_string(
+        &mut self,
+        is_in_format_spec: bool,
+        is_raw_string: bool,
+    ) -> Result<Option<TokType>, TokError<'t>> {
+        let allow_multiline = self.fstring_stack.iter().all(|node| node.allow_multiline());
+        let mut in_named_unicode: bool = false;
+        let mut ok_result = Ok(None); // value to return if we reach the end and don't error out
+        'outer: loop {
+            match (self.text_pos.peek(), allow_multiline) {
+                (None, true) => {
+                    return Err(TokError::UnterminatedTripleQuotedString);
+                }
+                (None, false) | (Some('\n'), false) => {
+                    return Err(TokError::UnterminatedString);
+                }
+                (ch @ Some('\''), _) | (ch @ Some('"'), _) => {
+                    // see if this actually terminates something in fstring_stack
+                    for node in self.fstring_stack.iter() {
+                        if ch == Some(node.quote_char.into()) {
+                            match node.quote_size {
+                                StringQuoteSize::Single => {
+                                    break 'outer;
+                                }
+                                StringQuoteSize::Triple => {
+                                    if self.text_pos.matches(node.quote_char.triple_str()) {
+                                        break 'outer;
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    self.text_pos.next();
+                }
+                (Some('\\'), _) if !is_raw_string => {
+                    self.text_pos.next();
+                    if is_in_format_spec {
+                        if let Some('{') | Some('}') = self.text_pos.peek() {
+                            // don't consume { or } because we want those to be interpreted as OP
+                            // tokens
+                        } else {
+                            // skip escaped char (e.g. \', \", or newline/line continuation)
+                            self.text_pos.next();
+                        }
+                    } else {
+                        // skip escaped char
+                        let next_ch = self.text_pos.next();
+                        // check if this is a \N sequence
+                        if let Some('N') = next_ch {
+                            // swallow the next open curly brace if it exists
+                            if let Some('{') = self.text_pos.peek() {
+                                in_named_unicode = true;
+                                self.text_pos.next();
+                            }
+                        }
+                    }
+                }
+                (Some('{'), _) => {
+                    if is_in_format_spec {
+                        // don't actually consume the {, and generate an OP for it instead
+                        break 'outer;
+                    }
+                    let consumed_double = self.text_pos.consume("{{");
+                    if !consumed_double {
+                        break 'outer;
+                    }
+                }
+                (Some('}'), _) => {
+                    if in_named_unicode {
+                        in_named_unicode = false;
+                        self.text_pos.next();
+                    } else if is_in_format_spec {
+                        // don't actually consume the }, and generate an OP for it instead
+                        break 'outer;
+                    } else if !self.text_pos.consume("}}") {
+                        return Err(TokError::UnmatchedClosingParen('}'));
+                    }
+                }
+                _ => {
+                    self.text_pos.next();
+                }
+            }
+            ok_result = Ok(Some(TokType::FStringString));
+        }
+        ok_result
+    }
+
+    fn maybe_consume_fstring_end(&mut self) -> Option<TokType> {
+        let ch = self.text_pos.peek();
+        let mut match_idx = None;
+        for (idx, node) in self.fstring_stack.iter().enumerate() {
+            if ch == Some(node.quote_char.into()) {
+                if node.quote_size == StringQuoteSize::Triple {
+                    if self.text_pos.consume(node.quote_char.triple_str()) {
+                        match_idx = Some(idx);
+                        break;
+                    }
+                } else {
+                    self.text_pos.next(); // already matched
+                    match_idx = Some(idx);
+                    break;
+                }
+            }
+        }
+        if let Some(match_idx) = match_idx {
+            self.fstring_stack.truncate(match_idx);
+            Some(TokType::FStringEnd)
+        } else {
+            None
+        }
+    }
+}
+
+impl<'t> Iterator for TokState<'t> {
+    type Item = Result<TokType, TokError<'t>>;
+
+    /// Returns the next token type.
+    fn next(&mut self) -> Option<Result<TokType, TokError<'t>>> {
+        // This implementation wraps `next_inner`, which does the actual work.
+        if self.done {
+            None
+        } else {
+            match self.next_inner() {
+                Err(err) => {
+                    self.done = true;
+                    Some(Err(err))
+                }
+                Ok(TokType::EndMarker) => {
+                    self.done = true;
+                    Some(Ok(TokType::EndMarker))
+                }
+                Ok(t) => Some(Ok(t)),
+            }
+        }
+    }
+}
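+
+// NOTE: editor's illustrative sketch, not part of the original patch. `TokState` is a
+// fused iterator over token types; a consumer drains it until `EndMarker` or the first
+// error. The `TokConfig` literal below matches the fields used by the unit tests:
+//
+//     let config = TokConfig { async_hacks: false, split_fstring: false };
+//     let mut state = TokState::new("x = 1\n", &config);
+//     while let Some(result) = state.next() {
+//         match result {
+//             Ok(TokType::EndMarker) => break, // the iterator is done after this
+//             Ok(tok_type) => {
+//                 // token text for the span we just consumed
+//                 let _text = state.text_pos.slice_from_start_pos(&state.start_pos);
+//             }
+//             Err(_err) => break, // the iterator yields nothing after an error
+//         }
+//     }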
+
+/// Returns true if the given string is a valid Python 3.x identifier. Follows [PEP 3131][].
+///
+/// [PEP 3131]: https://www.python.org/dev/peps/pep-3131/
+fn verify_identifier(name: &str) -> bool {
+    // TODO: If `name` is non-ascii, must first normalize name to NFKC.
+    // Common case: If the entire string is ascii, we can avoid the more expensive regex check,
+    // since the tokenizer already validates ascii characters before calling us.
+    name.is_ascii() || UNICODE_IDENTIFIER_RE.is_match(name)
+}
+
+#[derive(Clone)]
+pub struct Token<'a> {
+    pub r#type: TokType,
+    pub string: &'a str,
+    pub start_pos: TextPositionSnapshot,
+    pub end_pos: TextPositionSnapshot,
+    pub whitespace_before: Rc<RefCell<WhitespaceState<'a>>>,
+    pub whitespace_after: Rc<RefCell<WhitespaceState<'a>>>,
+    pub relative_indent: Option<&'a str>,
+}
+
+impl<'a> Debug for Token<'a> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        write!(
+            f,
+            "Token({:?}, {}, start={:?}, end={:?}, relative_indent={:?}, ws_before={:?}, ws_after={:?})",
+            self.r#type, self.string, self.start_pos, self.end_pos, self.relative_indent, self.whitespace_before, self.whitespace_after
+        )
+    }
+}
+
+// Dummy Eq implementation. We never compare Tokens like this
+impl<'a> PartialEq for Token<'a> {
+    fn eq(&self, _other: &Self) -> bool {
+        true
+    }
+}
+
+impl<'a> Eq for Token<'a> {}
+
+pub struct TokenIterator<'a> {
+    previous_whitespace: Option<Rc<RefCell<WhitespaceState<'a>>>>,
+    core_state: TokState<'a>,
+    absolute_indents: Vec<&'a str>,
+}
+
+impl<'a> TokenIterator<'a> {
+    pub fn new(module_text: &'a str, config: &TokConfig) -> Self {
+        Self {
+            previous_whitespace: None,
+            absolute_indents: vec![],
+            core_state: TokState::new(module_text, config),
+        }
+    }
+}
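+
+// NOTE: editor's illustrative sketch, not part of the original patch. Adjacent tokens
+// share whitespace state: one token's `whitespace_after` is the same
+// `Rc<RefCell<WhitespaceState>>` as the next token's `whitespace_before`, which is what
+// lets the Indent/Dedent fixups in the `Iterator` impl below retroactively patch an
+// already-emitted token. Assuming a `config: TokConfig` as in the unit tests:
+//
+//     let toks = TokenIterator::new("if x:\n    y\n", &config)
+//         .collect::<Result<Vec<_>, _>>()
+//         .unwrap();
+//     assert!(Rc::ptr_eq(&toks[0].whitespace_after, &toks[1].whitespace_before));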
+
+impl<'a> Iterator for TokenIterator<'a> {
+    type Item = Result<Token<'a>, TokError<'a>>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let next = self.core_state.next();
+        next.as_ref()?;
+        Some((|| {
+            let tok_type = next.unwrap()?;
+            let relative_indent = match tok_type {
+                TokType::Indent => {
+                    let end_idx = self.core_state.text_pos.byte_idx();
+                    let start_idx = end_idx - self.core_state.bol_width;
+                    let absolute_indent = &self.core_state.text_pos.text()[start_idx..end_idx];
+                    let relative_indent =
+                        if let Some(prev_absolute_indent) = self.absolute_indents.last() {
+                            if let Some(ri) = absolute_indent.strip_prefix(prev_absolute_indent) {
+                                ri
+                            } else {
+                                // TODO: return the correct exception type, improve error message
+                                return Err(TokError::Dedent);
+                            }
+                        } else {
+                            // there's no previous indent, absolute_indent is relative_indent
+                            absolute_indent
+                        };
+                    self.absolute_indents.push(absolute_indent);
+                    // HACKY: mutate and fixup the previous whitespace state
+                    if let Some(ws) = self.previous_whitespace.as_mut() {
+                        ws.borrow_mut().absolute_indent = absolute_indent;
+                    }
+                    Some(relative_indent)
+                }
+                TokType::Dedent => {
+                    self.absolute_indents.pop();
+                    // HACKY: mutate and fixup the previous whitespace state
+                    if let Some(ws) = self.previous_whitespace.as_mut() {
+                        ws.borrow_mut().absolute_indent =
+                            self.absolute_indents.last().unwrap_or(&"");
+                    }
+                    None
+                }
+                _ => None,
+            };
+            let text_pos = &self.core_state.text_pos;
+            let whitespace_before = self.previous_whitespace.clone().unwrap_or_default();
+            let whitespace_after = match tok_type {
+                TokType::Indent | TokType::Dedent | TokType::EndMarker => whitespace_before.clone(),
+                _ => Rc::new(RefCell::new(WhitespaceState {
+                    line: text_pos.line_number(),
+                    column: text_pos.char_column_number(),
+                    column_byte: text_pos.byte_column_number(),
+                    byte_offset: text_pos.byte_idx(),
+                    absolute_indent: self.absolute_indents.last().unwrap_or(&""),
+                    is_parenthesized: self.core_state.is_parenthesized(),
+                })),
+            };
+            self.previous_whitespace = Some(whitespace_after.clone());
+
+            Ok(Token {
+                r#type: tok_type,
+                string: text_pos.slice_from_start_pos(&self.core_state.start_pos),
+                start_pos: self.core_state.start_pos.clone(),
+                end_pos: text_pos.into(),
+                whitespace_after: whitespace_after.clone(),
+                whitespace_before: whitespace_before.clone(),
+                relative_indent,
+            })
+        })())
+    }
+}
diff --git a/native/libcst/src/tokenizer/core/string_types.rs b/native/libcst/src/tokenizer/core/string_types.rs
new file mode 100644
index 000000000..0d14d1e83
--- /dev/null
+++ b/native/libcst/src/tokenizer/core/string_types.rs
@@ -0,0 +1,119 @@
+// This implementation is Copyright (c) Facebook, Inc. and its affiliates.
+//
+// CPython 3.10.0a5 and the original C code this is based on is
+// Copyright (c) 2001-2021 Python Software Foundation; All Rights Reserved
+//
+// Portions of this module (f-string splitting) are based on parso's tokenize.py, which is also PSF
+// licensed.
+
+/// Helper types for string processing in the core tokenizer.
+use std::convert::TryFrom;
+
+use crate::tokenizer::text_position::TextPositionSnapshot;
+
+#[derive(Clone, Copy, Eq, PartialEq)]
+pub enum StringQuoteSize {
+    Single,
+    Triple,
+}
+
+impl From<StringQuoteSize> for usize {
+    fn from(qs: StringQuoteSize) -> Self {
+        match qs {
+            StringQuoteSize::Single => 1,
+            StringQuoteSize::Triple => 3,
+        }
+    }
+}
+
+#[derive(Clone, Copy)]
+pub enum StringQuoteChar {
+    Apostrophe,
+    DoubleQuote,
+}
+
+impl StringQuoteChar {
+    pub fn triple_str(&self) -> &'static str {
+        match self {
+            Self::Apostrophe => "'''",
+            Self::DoubleQuote => "\"\"\"",
+        }
+    }
+}
+
+impl From<StringQuoteChar> for char {
+    fn from(ch: StringQuoteChar) -> Self {
+        match ch {
+            StringQuoteChar::Apostrophe => '\'',
+            StringQuoteChar::DoubleQuote => '"',
+        }
+    }
+}
+
+#[derive(Debug, thiserror::Error)]
+#[error("{0:?} is not a valid string quote character")]
+pub struct StringQuoteCharConversionError(Option<char>);
+
+impl TryFrom<Option<char>> for StringQuoteChar {
+    type Error = StringQuoteCharConversionError;
+
+    fn try_from(ch: Option<char>) -> Result<Self, Self::Error> {
+        match ch {
+            Some('\'') => Ok(StringQuoteChar::Apostrophe),
+            Some('"') => Ok(StringQuoteChar::DoubleQuote),
+            _ => Err(StringQuoteCharConversionError(ch)),
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct FStringNode {
+    pub quote_char: StringQuoteChar,
+    pub quote_size: StringQuoteSize,
+    pub parentheses_count: usize,
+    pub string_start: Option<TextPositionSnapshot>,
+    // In the syntax there can be multiple format_spec's nested: {x:{y:3}}
+    pub format_spec_count: usize,
+    pub is_raw_string: bool,
+}
+
+impl FStringNode {
+    pub fn new(
+        quote_char: StringQuoteChar,
+        quote_size: StringQuoteSize,
+        is_raw_string: bool,
+    ) -> Self {
+        Self {
+            quote_char,
+            quote_size,
+            parentheses_count: 0,
+            string_start: None,
+            format_spec_count: 0,
+            is_raw_string,
+        }
+    }
+
+    pub fn open_parentheses(&mut self) {
+        self.parentheses_count += 1;
+    }
+
+    pub fn close_parentheses(&mut self) {
+        self.parentheses_count -= 1;
+        if self.parentheses_count == 0 {
+            // No parentheses means that the format spec is also finished.
+            self.format_spec_count = 0;
+        }
+    }
+
+    pub fn allow_multiline(&self) -> bool {
+        self.quote_size == StringQuoteSize::Triple
+    }
+
+    pub fn is_in_expr(&self) -> bool {
+        self.parentheses_count > self.format_spec_count
+    }
+
+    pub fn is_in_format_spec(&self) -> bool {
+        !self.is_in_expr() && self.format_spec_count > 0
+    }
+}
diff --git a/native/libcst/src/tokenizer/debug_utils.rs b/native/libcst/src/tokenizer/debug_utils.rs
new file mode 100644
index 000000000..8e646f8d9
--- /dev/null
+++ b/native/libcst/src/tokenizer/debug_utils.rs
@@ -0,0 +1,16 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+use std::fmt;
+
+/// An empty struct that writes "..." when using `fmt::Debug`. 
Useful for omitting fields when +/// using `fmt::Formatter::debug_struct`. +pub struct EllipsisDebug; + +impl fmt::Debug for EllipsisDebug { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("...") + } +} diff --git a/native/libcst/src/tokenizer/mod.rs b/native/libcst/src/tokenizer/mod.rs new file mode 100644 index 000000000..e900f511d --- /dev/null +++ b/native/libcst/src/tokenizer/mod.rs @@ -0,0 +1,15 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +mod core; +mod debug_utils; +mod operators; +mod text_position; +pub mod whitespace_parser; + +pub use self::core::*; + +#[cfg(test)] +mod tests; diff --git a/native/libcst/src/tokenizer/operators.rs b/native/libcst/src/tokenizer/operators.rs new file mode 100644 index 000000000..e3098cfa1 --- /dev/null +++ b/native/libcst/src/tokenizer/operators.rs @@ -0,0 +1,85 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. +// +// Part of this file is derived from the CPython documentation, which is available under the +// zero-clause BSD license. That license does not require that derivative works cite the original +// code or that we retain the original work's copyright information. +// https://docs.python.org/3/license.html#zero-clause-bsd-license-for-code-in-the-python-release-documentation + +use once_cell::sync::Lazy; +use regex::Regex; + +/// A list of strings that make up all the possible operators in a specific version of Python. +/// Derived from the [CPython's token documentation](https://docs.python.org/3/library/token.html). +pub const OPERATORS: &[&str] = &[ + "(", // LPAR + ")", // RPAR + "[", // LSQB + "]", // RSQB + ":", // COLON + ",", // COMMA + ";", // SEMI + "+", // PLUS + "-", // MINUS + "*", // STAR + "/", // SLASH + "|", // VBAR + "&", // AMPER + "<", // LESS + ">", // GREATER + "=", // EQUAL + ".", // DOT + "%", // PERCENT + "{", // LBRACE + "}", // RBRACE + "==", // EQEQUAL + "!=", // NOTEQUAL + "<=", // LESSEQUAL + ">=", // GREATEREQUAL + "~", // TILDE + "^", // CIRCUMFLEX + "<<", // LEFTSHIFT + ">>", // RIGHTSHIFT + "**", // DOUBLESTAR + "+=", // PLUSEQUAL + "-=", // MINEQUAL + "*=", // STAREQUAL + "/=", // SLASHEQUAL + "%=", // PERCENTEQUAL + "&=", // AMPEREQUAL + "|=", // VBAREQUAL + "^=", // CIRCUMFLEXEQUAL + "<<=", // LEFTSHIFTEQUAL + ">>=", // RIGHTSHIFTEQUAL + "**=", // DOUBLESTAREQUAL + "//", // DOUBLESLASH + "//=", // DOUBLESLASHEQUAL + "@", // AT + "@=", // ATEQUAL + "->", // RARROW + "...", // ELLIPSIS + ":=", // COLONEQUAL + // Not a real operator, but needed to support the split_fstring feature + "!", + // The fake operator added by PEP 401. 
Technically only valid if used with:
+    //
+    // from __future__ import barry_as_FLUFL
+    "<>",
+];
+
+pub static OPERATOR_RE: Lazy<Regex> = Lazy::new(|| {
+    // sort operators so that we try to match the longest ones first
+    let mut sorted_operators: Box<[&str]> = OPERATORS.into();
+    sorted_operators.sort_unstable_by_key(|op| usize::MAX - op.len());
+    Regex::new(&format!(
+        r"\A({})",
+        sorted_operators
+            .iter()
+            .map(|op| regex::escape(op))
+            .collect::<Vec<_>>()
+            .join("|")
+    ))
+    .expect("regex")
+});
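+
+// NOTE: editor's illustrative sketch, not part of the original patch. Sorting by
+// descending length makes the alternation prefer multi-character operators, since the
+// regex engine tries alternatives left to right:
+//
+//     assert_eq!(OPERATOR_RE.find("**=x").map(|m| m.as_str()), Some("**="));
+//     assert_eq!(OPERATOR_RE.find("->x").map(|m| m.as_str()), Some("->"));
+//     assert!(OPERATOR_RE.find("x + y").is_none()); // anchored with \A, so no mid-string match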
diff --git a/native/libcst/src/tokenizer/tests.rs b/native/libcst/src/tokenizer/tests.rs
new file mode 100644
index 000000000..d839e815a
--- /dev/null
+++ b/native/libcst/src/tokenizer/tests.rs
@@ -0,0 +1,689 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+/// Tests for the functionality in `tokenize::core`. These tests are not part of the `core` module
+/// because they're not a derivative work of CPython, and are therefore not subject to the PSF
+/// license.
+use crate::tokenizer::core::{TokConfig, TokError, TokState, TokType};
+
+fn default_config() -> TokConfig {
+    TokConfig {
+        async_hacks: false,
+        split_fstring: false,
+    }
+}
+
+fn tokenize_with_end_marker<'t>(
+    text: &'t str,
+    config: &TokConfig,
+) -> Result<Vec<(TokType, &'t str)>, TokError<'t>> {
+    let mut result = Vec::new();
+    let mut state = TokState::new(text, config);
+    while let Some(tok_type) = state.next() {
+        result.push((
+            tok_type?,
+            state.text_pos.slice_from_start_pos(&state.start_pos),
+        ));
+    }
+    Ok(result)
+}
+
+fn tokenize_all<'t>(
+    text: &'t str,
+    config: &TokConfig,
+) -> Result<Vec<(TokType, &'t str)>, TokError<'t>> {
+    let mut result = tokenize_with_end_marker(text, config)?;
+    // Remove the EndMarker, since it's on every non-error token stream.
+    assert_eq!(result.pop().expect("EndMarker").0, TokType::EndMarker);
+    // Also remove fake newline at the end
+    if let Some((TokType::Newline, "")) = result.last() {
+        result.pop();
+    }
+    Ok(result)
+}
+
+#[test]
+fn test_identifier() {
+    assert_eq!(
+        tokenize_all("test input", &default_config()),
+        Ok(vec![(TokType::Name, "test"), (TokType::Name, "input")])
+    );
+
+    assert_eq!(
+        tokenize_all("__with_underscores", &default_config()),
+        Ok(vec![(TokType::Name, "__with_underscores")])
+    );
+
+    assert_eq!(
+        tokenize_all("{ends_with_op}", &default_config()),
+        Ok(vec![
+            (TokType::Op, "{"),
+            (TokType::Name, "ends_with_op"),
+            (TokType::Op, "}")
+        ])
+    );
+
+    assert_eq!(
+        tokenize_all("\u{0100}\u{0101}\u{0102}unicode", &default_config()),
+        Ok(vec![(TokType::Name, "\u{0100}\u{0101}\u{0102}unicode")])
+    );
+}
+
+#[test]
+fn test_async_await() {
+    // normally async/await are keywords
+    assert_eq!(
+        tokenize_all("async await", &default_config()),
+        Ok(vec![(TokType::Async, "async"), (TokType::Await, "await")])
+    );
+
+    // with async_hacks, async/await are handled as identifiers by default
+    assert_eq!(
+        tokenize_all(
+            "async await",
+            &TokConfig {
+                async_hacks: true,
+                ..default_config()
+            }
+        ),
+        Ok(vec![(TokType::Name, "async"), (TokType::Name, "await")])
+    );
+
+    // with async_hacks, async/await are handled as keywords in functions
+    assert_eq!(
+        tokenize_all(
+            "async def fn():\n await foo\nawait bar",
+            &TokConfig {
+                async_hacks: true,
+                ..default_config()
+            }
+        ),
+        Ok(vec![
+            // this async is followed by a def, so it's converted to an Async
+            (TokType::Async, "async"),
+            (TokType::Name, "def"),
+            (TokType::Name, "fn"),
+            (TokType::Op, "("),
+            (TokType::Op, ")"),
+            (TokType::Op, ":"),
+            (TokType::Newline, "\n"),
+            (TokType::Indent, ""),
+            // this await is inside a function, and is converted into an Await
+            (TokType::Await, "await"),
+            (TokType::Name, "foo"),
+            (TokType::Newline, "\n"),
+            (TokType::Dedent, ""),
+            // this await is outside the function, and is turned into an identifier
+            (TokType::Name, "await"),
+            (TokType::Name, "bar")
+        ])
+    );
+}
+
+#[test]
+fn test_blankline() {
+    assert_eq!(
+        tokenize_all("\n \n\t\n\x0c\n\n", &default_config()),
+        Ok(vec![])
+    );
+}
+
+#[test]
+fn test_newline() {
+    assert_eq!(
+        tokenize_all("a\nb\rc\r\n", &default_config()),
+        Ok(vec![
+            (TokType::Name, "a"),
+            (TokType::Newline, "\n"),
+            (TokType::Name, "b"),
+            (TokType::Newline, "\r"),
+            (TokType::Name, "c"),
+            (TokType::Newline, "\r\n")
+        ])
+    );
+}
+
+#[test]
+fn test_indent_dedent() {
+    assert_eq!(
+        tokenize_all("one\n two\n sameindent\n", &default_config()),
+        Ok(vec![
+            (TokType::Name, "one"),
+            (TokType::Newline, "\n"),
+            (TokType::Indent, ""),
+            (TokType::Name, "two"),
+            (TokType::Newline, "\n"),
+            (TokType::Name, "sameindent"),
+            (TokType::Newline, "\n"),
+            (TokType::Dedent, "")
+        ])
+    );
+
+    assert_eq!(
+        tokenize_all("one\n two\n \tthree\n", &default_config()),
+        Ok(vec![
+            (TokType::Name, "one"),
+            (TokType::Newline, "\n"),
+            (TokType::Indent, ""),
+            (TokType::Name, "two"),
+            (TokType::Newline, "\n"),
+            (TokType::Indent, ""),
+            (TokType::Name, "three"),
+            (TokType::Newline, "\n"),
+            (TokType::Dedent, ""),
+            (TokType::Dedent, "")
+        ])
+    );
+
+    // indentation decreases to a new (smaller) indentation level that wasn't on the stack
+    assert_eq!(
+        tokenize_all("  one\n two", &default_config()),
+        Err(TokError::Dedent),
+    );
+
+    // TabSpace error without change in indentation
+    assert_eq!(
+        tokenize_all(" one\n\ttwo\n", &default_config()),
+        Err(TokError::TabSpace),
+    );
+
+    // 
TabSpace error with increase in indentation + assert_eq!( + tokenize_all(" one\n\t\ttwo\n", &default_config()), + Err(TokError::TabSpace), + ); + + // TabSpace error with decrease in indentation + assert_eq!( + tokenize_all(" one\n \ttwo\n\tthree\n", &default_config()), + Err(TokError::TabSpace), + ); + + // this looks like a TabSpace error, but CPython allows it, so we should too + assert!(tokenize_all(" \tone\n\t two\n", &default_config()).is_ok()); +} + +#[test] +fn test_integer_decimal() { + assert_eq!( + tokenize_all("123456789", &default_config()), + Ok(vec![(TokType::Number, "123456789")]) + ); + + assert_eq!( + tokenize_all("1_2_3", &default_config()), + Ok(vec![(TokType::Number, "1_2_3")]) + ); + + // doesn't consume trailing underscores + assert_eq!( + tokenize_all("123_", &default_config()), + Err(TokError::BadDecimal), + ); +} + +#[test] +fn test_integer_leading_zeros() { + assert_eq!( + tokenize_all("000", &default_config()), + Ok(vec![(TokType::Number, "000")]) + ); + + assert_eq!( + tokenize_all("0_0_0", &default_config()), + Ok(vec![(TokType::Number, "0_0_0")]) + ); + + assert_eq!( + tokenize_all("00123", &default_config()), + Err(TokError::BadDecimalLeadingZeros) + ); +} + +#[test] +fn test_integer_hexadecimal() { + assert_eq!( + tokenize_all("0x00Aa12Ff", &default_config()), + Ok(vec![(TokType::Number, "0x00Aa12Ff")]), + ); + + assert_eq!( + tokenize_all("0x_1_2_3", &default_config()), + Ok(vec![(TokType::Number, "0x_1_2_3")]), + ); + + assert_eq!( + tokenize_all("0x123_", &default_config()), + Err(TokError::BadHexadecimal), + ); +} + +#[test] +fn test_integer_octal() { + assert_eq!( + tokenize_all("0o001234567", &default_config()), + Ok(vec![(TokType::Number, "0o001234567")]), + ); + + assert_eq!( + tokenize_all("0o_1_2_3", &default_config()), + Ok(vec![(TokType::Number, "0o_1_2_3")]), + ); + + assert_eq!( + tokenize_all("0o123_", &default_config()), + Err(TokError::BadOctal), + ); + + assert_eq!( + tokenize_all("0o789", &default_config()), + Err(TokError::BadOctalDigit('8')), + ); +} + +#[test] +fn test_integer_binary() { + assert_eq!( + tokenize_all("0b00101011", &default_config()), + Ok(vec![(TokType::Number, "0b00101011")]), + ); + + assert_eq!( + tokenize_all("0b_0_1_0_1", &default_config()), + Ok(vec![(TokType::Number, "0b_0_1_0_1")]), + ); + + assert_eq!( + tokenize_all("0b0101_", &default_config()), + Err(TokError::BadBinary), + ); + + assert_eq!( + tokenize_all("0b0123", &default_config()), + Err(TokError::BadBinaryDigit('2')), + ); +} + +#[test] +fn test_fraction() { + // fraction starting with a dot + assert_eq!( + tokenize_all(".5", &default_config()), + Ok(vec![(TokType::Number, ".5")]) + ); + + // fraction starting with a dot using E + assert_eq!( + tokenize_all(".5e9", &default_config()), + Ok(vec![(TokType::Number, ".5e9")]) + ); + + // fraction starting with a dot using J + assert_eq!( + tokenize_all(".5j", &default_config()), + Ok(vec![(TokType::Number, ".5j")]) + ); + + // fraction starting with a zero + assert_eq!( + tokenize_all("0.5", &default_config()), + Ok(vec![(TokType::Number, "0.5")]) + ); + + // fraction starting with a zero using E + assert_eq!( + tokenize_all("0.5e9", &default_config()), + Ok(vec![(TokType::Number, "0.5e9")]) + ); + + // fraction starting with a zero using J + assert_eq!( + tokenize_all("0.5j", &default_config()), + Ok(vec![(TokType::Number, "0.5j")]) + ); + + // fraction with underscores + assert_eq!( + tokenize_all("1_0.2_5", &default_config()), + Ok(vec![(TokType::Number, "1_0.2_5")]) + ); + + // underscores after the 
fraction are an error + assert_eq!( + tokenize_all(".5_", &default_config()), + Err(TokError::BadDecimal), + ); + + // doesn't consume underscores around the dot + assert_eq!( + tokenize_all("1_.25", &default_config()), + Err(TokError::BadDecimal), + ); + + // doesn't consume underscores around the dot + assert_eq!( + tokenize_all("1._25", &default_config()), + Ok(vec![(TokType::Number, "1."), (TokType::Name, "_25")]) + ); +} + +#[test] +fn test_string() { + // empty, single quote + assert_eq!( + tokenize_all("''", &default_config()), + Ok(vec![(TokType::String, "''")]), + ); + + // empty, double quote + assert_eq!( + tokenize_all(r#""""#, &default_config()), + Ok(vec![(TokType::String, r#""""#)]), + ); + + // simple string + assert_eq!( + tokenize_all("'test'", &default_config()), + Ok(vec![(TokType::String, "'test'")]), + ); + + // mixed quotes + assert_eq!( + tokenize_all(r#""test'"#, &default_config()), + Err(TokError::UnterminatedString), + ); + + // single quoted strings can contain double quotes, double quoted strings can contain single + // quotes + assert_eq!( + tokenize_all( + r#"'she said "hey"' "but he'd ignored her""#, + &default_config() + ), + Ok(vec![ + (TokType::String, r#"'she said "hey"'"#), + (TokType::String, r#""but he'd ignored her""#) + ]), + ); + + // escape characters + assert_eq!( + tokenize_all("'a\\b\\c\\d\\e\\'\\f\\g'", &default_config()), + Ok(vec![(TokType::String, "'a\\b\\c\\d\\e\\'\\f\\g'"),]), + ); + + // newline in the middle of a string causes an unterminated string + assert_eq!( + tokenize_all("'first\nsecond'", &default_config()), + Err(TokError::UnterminatedString), + ); + + // newlines can be escaped and are preserved in the output + assert_eq!( + tokenize_all("'first\\\nsecond\\\r\nthird\\\r'", &default_config()), + Ok(vec![(TokType::String, "'first\\\nsecond\\\r\nthird\\\r'"),]), + ); +} + +#[test] +fn test_string_triple_quoted() { + // empty, single quote + assert_eq!( + tokenize_all("''''''", &default_config()), + Ok(vec![(TokType::String, "''''''")]), + ); + + // empty, double quote + assert_eq!( + tokenize_all(r#""""""""#, &default_config()), + Ok(vec![(TokType::String, r#""""""""#)]), + ); + + // simple string with newlines + assert_eq!( + tokenize_all("'''\nmulti\rline\r\n'''", &default_config()), + Ok(vec![(TokType::String, "'''\nmulti\rline\r\n'''")]), + ); + + // unterminated string + assert_eq!( + tokenize_all( + "'''hey'there's''quotes'here, but not '' three'", + &default_config() + ), + Err(TokError::UnterminatedTripleQuotedString), + ); +} + +#[test] +fn test_string_prefix() { + // works with double-quoted string + assert_eq!( + tokenize_all(r#"b"""#, &default_config()), + Ok(vec![(TokType::String, r#"b"""#)]), + ); + + // works with triple-quoted string + assert_eq!( + tokenize_all("b'''test'''", &default_config()), + Ok(vec![(TokType::String, "b'''test'''")]), + ); + + // prefix can be capitalized + assert_eq!( + tokenize_all("B'' R'' U'' F''", &default_config()), + Ok(vec![ + (TokType::String, "B''"), + (TokType::String, "R''"), + (TokType::String, "U''"), + (TokType::String, "F''"), + ]), + ); + + // valid prefixes + assert_eq!( + tokenize_all("b'' r'' u'' f'' br'' fr'' rb'' rf''", &default_config()), + Ok(vec![ + (TokType::String, "b''"), + (TokType::String, "r''"), + (TokType::String, "u''"), + (TokType::String, "f''"), + (TokType::String, "br''"), + (TokType::String, "fr''"), + (TokType::String, "rb''"), + (TokType::String, "rf''"), + ]), + ); + + // invalid prefixes + assert_eq!( + tokenize_all("bb'' rr'' uu'' ff'' ur'' ub'' 
uf'' fb''", &default_config()), + Ok(vec![ + (TokType::Name, "bb"), + (TokType::String, "''"), + (TokType::Name, "rr"), + (TokType::String, "''"), + (TokType::Name, "uu"), + (TokType::String, "''"), + (TokType::Name, "ff"), + (TokType::String, "''"), + (TokType::Name, "ur"), + (TokType::String, "''"), + (TokType::Name, "ub"), + (TokType::String, "''"), + (TokType::Name, "uf"), + (TokType::String, "''"), + (TokType::Name, "fb"), + (TokType::String, "''"), + ]), + ); +} + +#[test] +fn test_split_fstring() { + let config = TokConfig { + split_fstring: true, + ..default_config() + }; + + assert_eq!( + tokenize_all("f''", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::FStringEnd, "'"), + ]), + ); + + assert_eq!( + tokenize_all("f'{value}'", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::Op, "{"), + (TokType::Name, "value"), + (TokType::Op, "}"), + (TokType::FStringEnd, "'"), + ]), + ); + + assert_eq!( + tokenize_all("f'{{just a string}}'", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::FStringString, r"{{just a string}}"), + (TokType::FStringEnd, "'"), + ]), + ); + + assert_eq!( + tokenize_all(r"f'\N{Latin Small Letter A}'", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::FStringString, r"\N{Latin Small Letter A}"), + (TokType::FStringEnd, "'"), + ]), + ); + + // format specifier + assert_eq!( + tokenize_all("f'result: {value:{width}.{precision}}'", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::FStringString, "result: "), + (TokType::Op, "{"), + (TokType::Name, "value"), + (TokType::Op, ":"), + (TokType::Op, "{"), + (TokType::Name, "width"), + (TokType::Op, "}"), + (TokType::FStringString, "."), + (TokType::Op, "{"), + (TokType::Name, "precision"), + (TokType::Op, "}"), + (TokType::Op, "}"), + (TokType::FStringEnd, "'"), + ]), + ); + + // the walrus operator isn't valid unless parenthesized + assert_eq!( + tokenize_all("f'{a := b}'", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::Op, "{"), + (TokType::Name, "a"), + (TokType::Op, ":"), + (TokType::FStringString, "= b"), + (TokType::Op, "}"), + (TokType::FStringEnd, "'"), + ]), + ); + + // once parenthesized, this is recognized as the walrus operator + assert_eq!( + tokenize_all("f'{(a := b)}'", &config), + Ok(vec![ + (TokType::FStringStart, "f'"), + (TokType::Op, "{"), + (TokType::Op, "("), + (TokType::Name, "a"), + (TokType::Op, ":="), + (TokType::Name, "b"), + (TokType::Op, ")"), + (TokType::Op, "}"), + (TokType::FStringEnd, "'"), + ]), + ); +} + +#[test] +fn test_operator() { + assert_eq!( + tokenize_all("= == * ** **= -> . .. 
...", &default_config()), + Ok(vec![ + (TokType::Op, "="), + (TokType::Op, "=="), + (TokType::Op, "*"), + (TokType::Op, "**"), + (TokType::Op, "**="), + (TokType::Op, "->"), + (TokType::Op, "."), + (TokType::Op, "."), + (TokType::Op, "."), + (TokType::Op, "...") + ]), + ); +} + +#[test] +fn test_fake_newline() { + assert_eq!( + tokenize_with_end_marker("foo", &default_config()), + Ok(vec![ + (TokType::Name, "foo"), + (TokType::Newline, ""), + (TokType::EndMarker, "") + ]) + ); +} + +#[test] +fn test_no_fake_newline_for_empty_input() { + assert_eq!( + tokenize_with_end_marker("", &default_config()), + Ok(vec![(TokType::EndMarker, "")]) + ); +} + +#[test] +fn test_no_fake_newline_for_only_whitespaces() { + assert_eq!( + tokenize_with_end_marker(" ", &default_config()), + Ok(vec![(TokType::EndMarker, "")]) + ); +} + +#[test] +fn test_add_dedents_after_fake_newline() { + assert_eq!( + tokenize_with_end_marker("if 1:\n if 2:\n foo", &default_config()), + Ok(vec![ + (TokType::Name, "if"), + (TokType::Number, "1"), + (TokType::Op, ":"), + (TokType::Newline, "\n"), + (TokType::Indent, ""), + (TokType::Name, "if"), + (TokType::Number, "2"), + (TokType::Op, ":"), + (TokType::Newline, "\n"), + (TokType::Indent, ""), + (TokType::Name, "foo"), + (TokType::Newline, ""), + (TokType::Dedent, ""), + (TokType::Dedent, ""), + (TokType::EndMarker, "") + ]) + ); +} diff --git a/native/libcst/src/tokenizer/text_position/char_width.rs b/native/libcst/src/tokenizer/text_position/char_width.rs new file mode 100644 index 000000000..84f364052 --- /dev/null +++ b/native/libcst/src/tokenizer/text_position/char_width.rs @@ -0,0 +1,329 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +use std::str::Chars; + +#[derive(Debug, Eq, PartialEq)] +pub struct CharWidth { + pub byte_width: usize, + pub char_width: usize, + pub character: char, +} + +/// Iterates over characters (unicode codepoints) normalizing `'\r'` and `"\r\n"` to `'\n'`. Also +/// gives the width of each character, but `'\r\n'` is counted as 2 bytes and 2 characters instead +/// of one even after being normalized to '\n'. +#[derive(Clone)] +pub struct NewlineNormalizedCharWidths<'t> { + iter: Chars<'t>, + text: &'t str, + idx: usize, +} + +impl<'t> NewlineNormalizedCharWidths<'t> { + pub fn new(text: &'t str) -> Self { + Self { + text, + iter: text.chars(), + idx: 0, + } + } + + pub fn previous(&mut self) -> Option<::Item> { + // This function is called infrequently. 
+
+    pub fn previous(&mut self) -> Option<<Self as Iterator>::Item> {
+        // This function is called infrequently.
+        let mut back_iter = self.text[..self.idx].chars();
+        let result = match back_iter.next_back() {
+            // Unlikely: \n, normalization *may* be needed
+            Some('\n') => {
+                // Peek at the previous character to see if this is part of a `\r\n` sequence
+                match back_iter.next_back() {
+                    Some('\r') => Some(CharWidth {
+                        byte_width: '\r'.len_utf8() + '\n'.len_utf8(),
+                        char_width: 2,
+                        character: '\n',
+                    }),
+                    _ => Some(CharWidth {
+                        byte_width: '\n'.len_utf8(),
+                        char_width: 1,
+                        character: '\n',
+                    }),
+                }
+            }
+            // Unlikely: \r, normalization is needed
+            Some('\r') => Some(CharWidth {
+                byte_width: '\n'.len_utf8(),
+                char_width: 1,
+                character: '\n',
+            }),
+            // Common case: Not \r or \n, so no normalization is needed
+            Some(ch) => Some(CharWidth {
+                byte_width: ch.len_utf8(),
+                char_width: 1,
+                character: ch,
+            }),
+            // Unlikely: EOF
+            None => None,
+        };
+        if let Some(r) = &result {
+            self.idx -= r.byte_width;
+            self.iter = self.text[self.idx..].chars();
+        }
+        result
+    }
+
+    pub fn peek_character(&self) -> Option<char> {
+        // This function is called very frequently.
+        //
+        // We're not using peekable or caching here, since this should be cheap enough on its own,
+        // though benchmarking might prove otherwise.
+        match self.iter.clone().next() {
+            Some('\r') => Some('\n'),
+            ch => ch,
+        }
+    }
+}
+
+impl<'t> Iterator for NewlineNormalizedCharWidths<'t> {
+    type Item = CharWidth;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // This function is called very frequently.
+        let result = match self.iter.next() {
+            // Unlikely: \r, normalization is needed
+            Some('\r') => {
+                // Peek at the next character to see if it's '\n'.
+                let mut speculative = self.iter.clone();
+                match speculative.next() {
+                    Some('\n') => {
+                        self.iter = speculative;
+                        Some(CharWidth {
+                            byte_width: '\r'.len_utf8() + '\n'.len_utf8(),
+                            char_width: 2,
+                            character: '\n',
+                        })
+                    }
+                    _ => Some(CharWidth {
+                        byte_width: '\r'.len_utf8(),
+                        char_width: 1,
+                        character: '\n',
+                    }),
+                }
+            }
+            // Common case: Not \r, so no normalization is needed
+            Some(ch) => Some(CharWidth {
+                byte_width: ch.len_utf8(),
+                char_width: 1,
+                character: ch,
+            }),
+            // Unlikely: EOF
+            None => None,
+        };
+        if let Some(r) = &result {
+            self.idx += r.byte_width;
+        }
+        result
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_ascii_no_newlines() {
+        let mut cw = NewlineNormalizedCharWidths::new("in");
+
+        // go forward
+        assert_eq!(cw.peek_character(), Some('i'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 'i'
+            })
+        );
+        assert_eq!(cw.peek_character(), Some('n'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 'n'
+            })
+        );
+
+        // end of text
+        assert_eq!(cw.peek_character(), None);
+        assert_eq!(cw.next(), None);
+
+        // go backwards
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 'n'
+            })
+        );
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 'i'
+            })
+        );
+
+        // beginning of text
+        assert_eq!(cw.previous(), None);
+
+        // try going forward again
+        assert_eq!(cw.peek_character(), Some('i'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 'i'
+            })
+        );
+    }
+
+    #[test]
+    fn test_unicode_no_newlines() {
+        // "test" with an accented 'e'
+        let mut cw = NewlineNormalizedCharWidths::new("t\u{00e9}st");
+
+        // go forward
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 't'
+            })
+        );
+        assert_eq!(cw.peek_character(), Some('\u{00e9}'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 2,
+                char_width: 1,
+                character: '\u{00e9}'
+            })
+        );
+        assert_eq!(cw.peek_character(), Some('s'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 's'
+            })
+        );
+
+        // go backwards
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 's'
+            })
+        );
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 2,
+                char_width: 1,
+                character: '\u{00e9}'
+            })
+        );
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: 't'
+            })
+        );
+    }
+
+    #[test]
+    fn test_newlines() {
+        let mut cw = NewlineNormalizedCharWidths::new("\n\r\r\n");
+
+        // go forward
+        assert_eq!(cw.peek_character(), Some('\n'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: '\n'
+            })
+        );
+        assert_eq!(cw.peek_character(), Some('\n'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: '\n'
+            })
+        );
+        assert_eq!(cw.peek_character(), Some('\n'));
+        assert_eq!(
+            cw.next(),
+            Some(CharWidth {
+                byte_width: 2,
+                char_width: 2,
+                character: '\n'
+            })
+        );
+
+        // end of text
+        assert_eq!(cw.peek_character(), None);
+        assert_eq!(cw.next(), None);
+
+        // go backwards
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 2,
+                char_width: 2,
+                character: '\n'
+            })
+        );
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: '\n'
+            })
+        );
+        assert_eq!(
+            cw.previous(),
+            Some(CharWidth {
+                byte_width: 1,
+                char_width: 1,
+                character: '\n'
+            })
+        );
+
+        // beginning of text
+        assert_eq!(cw.previous(), None);
+    }
+
+    #[test]
+    fn test_empty() {
+        let mut cw = NewlineNormalizedCharWidths::new("");
+        assert_eq!(cw.peek_character(), None);
+        assert_eq!(cw.next(), None);
+        assert_eq!(cw.previous(), None);
+    }
+}
diff --git a/native/libcst/src/tokenizer/text_position/mod.rs b/native/libcst/src/tokenizer/text_position/mod.rs
new file mode 100644
index 000000000..b69054849
--- /dev/null
+++ b/native/libcst/src/tokenizer/text_position/mod.rs
@@ -0,0 +1,353 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+mod char_width;
+
+use once_cell::sync::Lazy;
+use regex::Regex;
+use std::fmt;
+
+use crate::tokenizer::debug_utils::EllipsisDebug;
+use char_width::NewlineNormalizedCharWidths;
+
+static CR_OR_LF_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[\r\n]").expect("regex"));
+
+pub trait TextPattern {
+    fn match_len(&self, text: &str) -> Option<usize>;
+}
+
+impl TextPattern for &Regex {
+    // make sure to anchor your regex with \A
+    fn match_len(&self, text: &str) -> Option<usize> {
+        self.find(text).map(|m| m.end())
+    }
+}
+
+impl TextPattern for &str {
+    fn match_len(&self, text: &str) -> Option<usize> {
+        if text.starts_with(self) {
+            Some(self.len())
+        } else {
+            None
+        }
+    }
+}
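+
+// NOTE: editor's illustrative sketch, not part of the original patch; `WORD_RE` is a
+// hypothetical pattern. Both pattern forms match only at the start of the remaining
+// text, which is why regex patterns must be anchored with \A:
+//
+//     static WORD_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A[a-z]+").expect("regex"));
+//     assert_eq!((&*WORD_RE).match_len("abc def"), Some(3));
+//     assert_eq!("abc".match_len("abc def"), Some(3));
+//     assert_eq!("def".match_len("abc def"), None);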
+
+// This is Clone, since that's needed to support async_hacks, but you probably don't usually want to
+// clone. Use TextPositionSnapshot instead.
+#[derive(Clone)]
+pub struct TextPosition<'t> {
+    text: &'t str,
+    char_widths: NewlineNormalizedCharWidths<'t>,
+    inner_byte_idx: usize,
+    inner_char_column_number: usize,
+    inner_byte_column_number: usize,
+    inner_line_number: usize,
+}
+
+/// A lightweight immutable version of TextPosition that's slightly
+/// cheaper to construct/store. Used for storing the start position of tokens.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub struct TextPositionSnapshot {
+    pub inner_byte_idx: usize,
+    pub inner_char_column_number: usize,
+    pub inner_line_number: usize,
+}
+
+impl TextPositionSnapshot {
+    pub fn byte_idx(&self) -> usize {
+        self.inner_byte_idx
+    }
+
+    pub fn char_column_number(&self) -> usize {
+        self.inner_char_column_number
+    }
+
+    pub fn line_number(&self) -> usize {
+        self.inner_line_number
+    }
+}
+
+impl<'t> TextPosition<'t> {
+    pub fn new(text: &'t str) -> Self {
+        Self {
+            text,
+            char_widths: NewlineNormalizedCharWidths::new(text),
+            inner_byte_idx: 0,
+            inner_char_column_number: 0,
+            inner_byte_column_number: 0,
+            inner_line_number: 1,
+        }
+    }
+
+    /// Peeks at the next character. Similar to `std::iter::Peekable`, but doesn't modify our
+    /// internal position counters like wrapping this in `Peekable` would.
+    pub fn peek(&mut self) -> Option<<Self as Iterator>::Item> {
+        self.char_widths.peek_character()
+    }
+
+    /// Matches, but does not consume TextPattern.
+    ///
+    /// Caution: This does not normalize `'\r'` characters, like `peek()` and `next()` do.
+    pub fn matches<P: TextPattern>(&self, pattern: P) -> bool {
+        let rest_of_text = &self.text[self.inner_byte_idx..];
+        let match_len = pattern.match_len(rest_of_text);
+        match match_len {
+            Some(match_len) => {
+                assert!(
+                    !CR_OR_LF_RE.is_match(&rest_of_text[..match_len]),
+                    "matches pattern must not match a newline",
+                );
+                true
+            }
+            None => false,
+        }
+    }
+
+    /// Moves the iterator back one character. Panics if a newline is encountered or if we try to
+    /// back up past the beginning of the text.
+    pub fn backup_no_newline(&mut self) {
+        if let Some(cw) = self.char_widths.previous() {
+            // If we tried to back up across a newline, we'd have to recompute char_column_number,
+            // which would be expensive, so it's unsupported.
+            self.inner_char_column_number = self
+                .inner_char_column_number
+                .checked_sub(1)
+                .expect("cannot back up past the beginning of a line.");
+            self.inner_byte_idx -= cw.byte_width;
+        } else {
+            panic!("Tried to backup past the beginning of the text.")
+        }
+    }
+
+    /// Tries to consume the given TextPattern, moving the TextPosition forward. Returns false if
+    /// no match was found. Does not support newlines.
+    ///
+    /// Panics if a newline is consumed as part of the pattern.
+    pub fn consume<P: TextPattern>(&mut self, pattern: P) -> bool {
+        let rest_of_text = &self.text[self.inner_byte_idx..];
+        if let Some(len) = pattern.match_len(rest_of_text) {
+            let new_byte_idx = self.inner_byte_idx + len;
+            // Call next() a bunch of times to advance the character counters. There's no way to
+            // shortcut this because we don't know how many characters are in a slice of bytes,
+            // though we could use a faster algorithm that inspects multiple characters at once
+            // (e.g. SIMD).
+            while self.inner_byte_idx < new_byte_idx {
+                // We can't support newline normalization in this API without copying the string, so
+                // rather than exposing that (potentially dangerous) behavior, panic if it happens.
+ assert!( + self.next() != Some('\n'), + "consume pattern must not match a newline", + ); + } + // this shouldn't be possible for the provided implementations of TextPattern + debug_assert!( + self.inner_byte_idx == new_byte_idx, + "pattern ended on a non-character boundary", + ); + true + } else { + false + } + } + + pub fn text(&self) -> &'t str { + self.text + } + + pub fn slice_from_start_pos(&self, start_pos: &TextPositionSnapshot) -> &'t str { + &self.text[start_pos.byte_idx()..self.byte_idx()] + } + + /// Returns the number of bytes we've traversed. This is useful for Rust code that needs to + /// slice the input source code, since Rust slices operate on bytes and not unicode codepoints. + pub fn byte_idx(&self) -> usize { + self.inner_byte_idx + } + + /// Returns the column number in terms of number of characters (unicode codepoints) past the + /// beginning of the line. Zero-indexed. + pub fn char_column_number(&self) -> usize { + self.inner_char_column_number + } + + pub fn byte_column_number(&self) -> usize { + self.inner_byte_column_number + } + + /// Returns the one-indexed line number. + pub fn line_number(&self) -> usize { + self.inner_line_number + } +} + +impl Iterator for TextPosition<'_> { + type Item = char; + + /// Gets the next character. This has the side-effect of advancing the internal position + /// counters. + fn next(&mut self) -> Option { + if let Some(cw) = self.char_widths.next() { + self.inner_byte_idx += cw.byte_width; + match cw.character { + '\n' => { + self.inner_line_number += 1; + self.inner_char_column_number = 0; + self.inner_byte_column_number = 0; + } + _ => { + self.inner_char_column_number += cw.char_width; + self.inner_byte_column_number += cw.byte_width; + } + } + Some(cw.character) + } else { + None + } + } +} + +impl fmt::Debug for TextPosition<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("TextPosition") + .field("text", &EllipsisDebug) + .field("char_widths", &EllipsisDebug) + .field("inner_byte_idx", &self.inner_byte_idx) + .field("inner_char_column_number", &self.inner_char_column_number) + .field("inner_line_number", &self.inner_line_number) + .finish() + } +} + +impl From<&TextPosition<'_>> for TextPositionSnapshot { + fn from(tp: &TextPosition) -> Self { + Self { + inner_byte_idx: tp.inner_byte_idx, + inner_char_column_number: tp.inner_char_column_number, + inner_line_number: tp.inner_line_number, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty() { + let mut pos = TextPosition::new(""); + assert_eq!(pos.byte_idx(), 0); + assert_eq!(pos.char_column_number(), 0); + assert_eq!(pos.line_number(), 1); + assert_eq!(pos.peek(), None); + assert!(!pos.consume(&Regex::new(r"\Awon't match").unwrap())); + assert!(pos.consume(&Regex::new(r"\A").unwrap())); + assert_eq!(pos.next(), None); + // call next() again to verify that it's fused + assert_eq!(pos.next(), None); + } + + #[test] + fn test_ascii() { + let mut pos = TextPosition::new("abcdefg"); + + assert_eq!(pos.peek(), Some('a')); + assert_eq!(pos.next(), Some('a')); + assert_eq!(pos.byte_idx(), 1); + assert_eq!(pos.char_column_number(), 1); + assert_eq!(pos.line_number(), 1); + + // consume a few characters with a regex + assert!(!pos.consume(&Regex::new(r"\Awon't match").unwrap())); + assert!(pos.consume(&Regex::new(r"\Abcd").unwrap())); + assert_eq!(pos.byte_idx(), 4); + assert_eq!(pos.char_column_number(), 4); + assert_eq!(pos.line_number(), 1); + + // consume the rest of the text + assert_eq!(pos.next(), 
Some('e'));
+        assert_eq!(pos.next(), Some('f'));
+        assert_eq!(pos.next(), Some('g'));
+        assert_eq!(pos.next(), None);
+        assert_eq!(pos.byte_idx(), 7);
+        assert_eq!(pos.char_column_number(), 7);
+        assert_eq!(pos.line_number(), 1);
+    }
+
+    #[test]
+    fn test_unicode() {
+        let mut pos = TextPosition::new("\u{00e9}abc");
+
+        assert_eq!(pos.peek(), Some('\u{00e9}'));
+        assert_eq!(pos.next(), Some('\u{00e9}'));
+    }
+
+    #[test]
+    fn test_newline_lf() {
+        let mut pos = TextPosition::new("ab\nde");
+
+        assert_eq!(pos.next(), Some('a'));
+        assert_eq!(pos.next(), Some('b'));
+        assert_eq!(pos.line_number(), 1);
+        assert_eq!(pos.char_column_number(), 2);
+
+        assert_eq!(pos.next(), Some('\n'));
+        assert_eq!(pos.line_number(), 2);
+        assert_eq!(pos.char_column_number(), 0);
+
+        assert_eq!(pos.next(), Some('d'));
+        assert_eq!(pos.next(), Some('e'));
+        assert_eq!(pos.next(), None);
+        assert_eq!(pos.line_number(), 2);
+        assert_eq!(pos.char_column_number(), 2);
+
+        assert_eq!(pos.byte_idx(), 5);
+    }
+
+    #[test]
+    fn test_newline_cr() {
+        let mut pos = TextPosition::new("ab\rde");
+
+        assert_eq!(pos.next(), Some('a'));
+        assert_eq!(pos.next(), Some('b'));
+        assert_eq!(pos.line_number(), 1);
+        assert_eq!(pos.char_column_number(), 2);
+
+        assert_eq!(pos.next(), Some('\n'));
+        assert_eq!(pos.line_number(), 2);
+        assert_eq!(pos.char_column_number(), 0);
+
+        assert_eq!(pos.next(), Some('d'));
+        assert_eq!(pos.next(), Some('e'));
+        assert_eq!(pos.next(), None);
+        assert_eq!(pos.line_number(), 2);
+        assert_eq!(pos.char_column_number(), 2);
+
+        assert_eq!(pos.byte_idx(), 5);
+    }
+
+    #[test]
+    fn test_newline_cr_lf() {
+        let mut pos = TextPosition::new("ab\r\nde");
+
+        assert_eq!(pos.next(), Some('a'));
+        assert_eq!(pos.next(), Some('b'));
+        assert_eq!(pos.line_number(), 1);
+        assert_eq!(pos.char_column_number(), 2);
+
+        assert_eq!(pos.next(), Some('\n'));
+        assert_eq!(pos.line_number(), 2);
+        assert_eq!(pos.char_column_number(), 0);
+
+        assert_eq!(pos.next(), Some('d'));
+        assert_eq!(pos.next(), Some('e'));
+        assert_eq!(pos.next(), None);
+        assert_eq!(pos.line_number(), 2);
+        assert_eq!(pos.char_column_number(), 2);
+
+        assert_eq!(pos.byte_idx(), 6);
+    }
+}
diff --git a/native/libcst/src/tokenizer/whitespace_parser.rs b/native/libcst/src/tokenizer/whitespace_parser.rs
new file mode 100644
index 000000000..e624bc3df
--- /dev/null
+++ b/native/libcst/src/tokenizer/whitespace_parser.rs
@@ -0,0 +1,401 @@
+use crate::nodes::{
+    Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
+    SimpleWhitespace, TrailingWhitespace,
+};
+use once_cell::sync::Lazy;
+use regex::Regex;
+use thiserror::Error;
+
+use crate::Token;
+
+use super::TokType;
+
+static SIMPLE_WHITESPACE_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"\A([ \f\t]|\\(\r\n?|\n))*").expect("regex"));
+static NEWLINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A(\r\n?|\n)").expect("regex"));
+static COMMENT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"\A#[^\r\n]*").expect("regex"));
+
+#[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)]
+#[derive(Error, Debug, PartialEq, Eq)]
+pub enum WhitespaceError {
+    #[error("WTF")]
+    WTF,
+    #[error("Internal error while parsing whitespace: {0}")]
+    InternalError(String),
+    #[error("Failed to parse mandatory trailing whitespace")]
+    TrailingWhitespaceError,
+}
+
+type Result<T> = std::result::Result<T, WhitespaceError>;
+
+#[derive(Debug, PartialEq, Eq, Clone)]
+pub struct State<'a> {
+    pub line: usize,   // one-indexed (to match parso's behavior)
+    pub column: usize, // zero-indexed (to match parso's behavior)
+    pub column_byte: usize,
+    pub absolute_indent: &'a str,
+    pub is_parenthesized: bool,
+    pub byte_offset: usize,
+}
+
+impl<'a> Default for State<'a> {
+    fn default() -> Self {
+        Self {
+            line: 1,
+            column: 0,
+            column_byte: 0,
+            absolute_indent: "",
+            is_parenthesized: false,
+            byte_offset: 0,
+        }
+    }
+}
+
+// TODO
+pub struct Config<'a> {
+    pub input: &'a str,
+    pub lines: Vec<&'a str>,
+    pub default_newline: &'a str,
+    pub default_indent: &'a str,
+}
+
+impl<'a> Config<'a> {
+    pub fn new(input: &'a str, tokens: &[Token<'a>]) -> Self {
+        let mut default_indent = "    ";
+        for tok in tokens {
+            if tok.r#type == TokType::Indent {
+                default_indent = tok.relative_indent.unwrap();
+                break;
+            }
+        }
+        let default_newline = Regex::new(r"\r\n?|\n")
+            .expect("regex")
+            .find(input)
+            .map(|m| m.as_str())
+            .unwrap_or("\n");
+
+        Self {
+            input,
+            lines: input.split_inclusive(default_newline).collect(),
+            default_newline,
+            default_indent,
+        }
+    }
+
+    pub fn has_trailing_newline(&self) -> bool {
+        self.input.ends_with('\n')
+            && !self.input.ends_with("\\\n")
+            && !self.input.ends_with("\\\r\n")
+    }
+
+    fn get_line(&self, line_number: usize) -> Result<&'a str> {
+        let err_fn = || {
+            WhitespaceError::InternalError(format!(
+                "tried to get line {} which is out of range",
+                line_number
+            ))
+        };
+        self.lines
+            .get(line_number.checked_sub(1).ok_or_else(err_fn)?)
+            .map(|l| &l[..])
+            .ok_or_else(err_fn)
+    }
+
+    fn get_line_after_column(&self, line_number: usize, column_index: usize) -> Result<&'a str> {
+        self.get_line(line_number)?
+            .get(column_index..)
+            .ok_or_else(|| {
+                WhitespaceError::InternalError(format!(
+                    "Column index {} out of range for line {}",
+                    column_index, line_number
+                ))
+            })
+    }
+}
+
+#[derive(Debug)]
+enum ParsedEmptyLine<'a> {
+    NoIndent,
+    Line(EmptyLine<'a>),
+}
+
+fn parse_empty_line<'a>(
+    config: &Config<'a>,
+    state: &mut State,
+    override_absolute_indent: Option<&'a str>,
+) -> Result<ParsedEmptyLine<'a>> {
+    let mut speculative_state = state.clone();
+    if let Ok(indent) = parse_indent(config, &mut speculative_state, override_absolute_indent) {
+        let whitespace = parse_simple_whitespace(config, &mut speculative_state)?;
+        let comment = parse_comment(config, &mut speculative_state)?;
+        if let Some(newline) = parse_newline(config, &mut speculative_state)? {
+            *state = speculative_state;
+            return Ok(ParsedEmptyLine::Line(EmptyLine {
+                indent,
+                whitespace,
+                comment,
+                newline,
+            }));
+        }
+    }
+    Ok(ParsedEmptyLine::NoIndent)
+}
+
+fn _parse_empty_lines<'a>(
+    config: &Config<'a>,
+    state: &mut State<'a>,
+    override_absolute_indent: Option<&'a str>,
+) -> Result<Vec<(State<'a>, EmptyLine<'a>)>> {
+    let mut lines = vec![];
+    loop {
+        let last_state = state.clone();
+        let parsed_line = parse_empty_line(config, state, override_absolute_indent)?;
+        if *state == last_state {
+            break;
+        }
+        match parsed_line {
+            ParsedEmptyLine::NoIndent => break,
+            ParsedEmptyLine::Line(l) => lines.push((state.clone(), l)),
+        }
+    }
+    Ok(lines)
+}
+
+pub fn parse_empty_lines<'a>(
+    config: &Config<'a>,
+    state: &mut State<'a>,
+    override_absolute_indent: Option<&'a str>,
+) -> Result<Vec<EmptyLine<'a>>> {
+    // If override_absolute_indent is Some, then we need to parse all lines up to and including the
+    // last line that is indented at our level. These all belong to the footer and not to the next
+    // line's leading_lines.
+    //
+    // We don't know what the last line with indent=True is, and there could be indent=False lines
+    // interspersed with indent=True lines, so we need to speculatively parse all possible empty
+    // lines, and then unwind to find the last empty line with indent=True.
+    let mut speculative_state = state.clone();
+    let mut lines = _parse_empty_lines(config, &mut speculative_state, override_absolute_indent)?;
+
+    if override_absolute_indent.is_some() {
+        // Remove elements from the end until we find an indented line.
+        while let Some((_, empty_line)) = lines.last() {
+            if empty_line.indent {
+                break;
+            }
+            lines.pop();
+        }
+    }
+
+    if let Some((final_state, _)) = lines.last() {
+        // update the state to match the last line that we captured
+        *state = final_state.clone();
+    }
+
+    Ok(lines.into_iter().map(|(_, e)| e).collect())
+}
+
+pub fn parse_comment<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Comment<'a>>> {
+    if let Some(comment_match) =
+        COMMENT_RE.find(config.get_line_after_column(state.line, state.column_byte)?)
+    {
+        let comment_str = comment_match.as_str();
+        advance_this_line(
+            config,
+            state,
+            comment_str.chars().count(),
+            comment_str.len(),
+        )?;
+        return Ok(Some(Comment(comment_str)));
+    }
+    Ok(None)
+}
+
+pub fn parse_newline<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Newline<'a>>> {
+    if let Some(newline_match) =
+        NEWLINE_RE.find(config.get_line_after_column(state.line, state.column_byte)?)
+    {
+        let newline_str = newline_match.as_str();
+        advance_this_line(
+            config,
+            state,
+            newline_str.chars().count(),
+            newline_str.len(),
+        )?;
+        if state.column_byte != config.get_line(state.line)?.len() {
+            return Err(WhitespaceError::InternalError(format!(
+                "Found newline at ({}, {}) but it's not EOL",
+                state.line, state.column
+            )));
+        }
+        if state.line < config.lines.len() {
+            advance_to_next_line(config, state)?;
+        }
+        return Ok(Some(Newline(
+            if newline_str == config.default_newline {
+                None
+            } else {
+                Some(newline_str)
+            },
+            Fakeness::Real,
+        )));
+    }
+
+    // If we're at the end of the file but not on BOL, that means this is the fake
+    // newline inserted by the tokenizer.
+    if state.byte_offset == config.input.len() && state.column_byte != 0 {
+        return Ok(Some(Newline(None, Fakeness::Fake)));
+    }
+    Ok(None)
+}
+
+pub fn parse_optional_trailing_whitespace<'a>(
+    config: &Config<'a>,
+    state: &mut State,
+) -> Result<Option<TrailingWhitespace<'a>>> {
+    let mut speculative_state = state.clone();
+    let whitespace = parse_simple_whitespace(config, &mut speculative_state)?;
+    let comment = parse_comment(config, &mut speculative_state)?;
+    if let Some(newline) = parse_newline(config, &mut speculative_state)? {
+        *state = speculative_state;
+        Ok(Some(TrailingWhitespace {
+            whitespace,
+            comment,
+            newline,
+        }))
+    } else {
+        Ok(None)
+    }
+}
+
+pub fn parse_trailing_whitespace<'a>(
+    config: &Config<'a>,
+    state: &mut State,
+) -> Result<TrailingWhitespace<'a>> {
+    match parse_optional_trailing_whitespace(config, state)?
{
+        Some(ws) => Ok(ws),
+        _ => Err(WhitespaceError::TrailingWhitespaceError),
+    }
+}
+
+fn parse_indent<'a>(
+    config: &Config<'a>,
+    state: &mut State,
+    override_absolute_indent: Option<&'a str>,
+) -> Result<bool> {
+    let absolute_indent = override_absolute_indent.unwrap_or(state.absolute_indent);
+    if state.column_byte != 0 {
+        if state.column_byte == config.get_line(state.line)?.len()
+            && state.line == config.lines.len()
+        {
+            Ok(false)
+        } else {
+            Err(WhitespaceError::InternalError(
+                "Column should be 0 when parsing an indent".to_string(),
+            ))
+        }
+    } else {
+        Ok(
+            if config
+                .get_line_after_column(state.line, state.column_byte)?
+                .starts_with(absolute_indent)
+            {
+                state.column_byte += absolute_indent.len();
+                state.column += absolute_indent.chars().count();
+                state.byte_offset += absolute_indent.len();
+                true
+            } else {
+                false
+            },
+        )
+    }
+}
+
+fn advance_to_next_line<'a>(config: &Config<'a>, state: &mut State) -> Result<()> {
+    let cur_line = config.get_line(state.line)?;
+    state.byte_offset += cur_line.len() - state.column_byte;
+    state.column = 0;
+    state.column_byte = 0;
+    state.line += 1;
+    Ok(())
+}
+
+fn advance_this_line<'a>(
+    config: &Config<'a>,
+    state: &mut State,
+    char_count: usize,
+    offset: usize,
+) -> Result<()> {
+    let cur_line = config.get_line(state.line)?;
+    if cur_line.len() < state.column_byte + offset {
+        return Err(WhitespaceError::InternalError(format!(
+            "Tried to advance past line {}'s end",
+            state.line
+        )));
+    }
+    state.column += char_count;
+    state.column_byte += offset;
+    state.byte_offset += offset;
+    Ok(())
+}
+
+pub fn parse_simple_whitespace<'a>(
+    config: &Config<'a>,
+    state: &mut State,
+) -> Result<SimpleWhitespace<'a>> {
+    let capture_ws = |line, col| -> Result<&'a str> {
+        let x = config.get_line_after_column(line, col);
+        let x = x?;
+        Ok(SIMPLE_WHITESPACE_RE
+            .find(x)
+            .expect("SIMPLE_WHITESPACE_RE supports 0-length matches, so it must always match")
+            .as_str())
+    };
+    let start_offset = state.byte_offset;
+    let mut prev_line: &str;
+    loop {
+        prev_line = capture_ws(state.line, state.column_byte)?;
+        if !prev_line.contains('\\') {
+            break;
+        }
+        advance_to_next_line(config, state)?;
+    }
+    advance_this_line(config, state, prev_line.chars().count(), prev_line.len())?;
+
+    Ok(SimpleWhitespace(
+        &config.input[start_offset..state.byte_offset],
+    ))
+}
+
+pub fn parse_parenthesizable_whitespace<'a>(
+    config: &Config<'a>,
+    state: &mut State<'a>,
+) -> Result<ParenthesizableWhitespace<'a>> {
+    if state.is_parenthesized {
+        if let Some(ws) = parse_parenthesized_whitespace(config, state)? {
+            return Ok(ParenthesizableWhitespace::ParenthesizedWhitespace(ws));
+        }
+    }
+    parse_simple_whitespace(config, state).map(ParenthesizableWhitespace::SimpleWhitespace)
+}
+
+pub fn parse_parenthesized_whitespace<'a>(
+    config: &Config<'a>,
+    state: &mut State<'a>,
+) -> Result<Option<ParenthesizedWhitespace<'a>>> {
+    if let Some(first_line) = parse_optional_trailing_whitespace(config, state)? {
+        let empty_lines = _parse_empty_lines(config, state, None)?
+            .into_iter()
+            .map(|(_, line)| line)
+            .collect();
+        let indent = parse_indent(config, state, None)?;
+        let last_line = parse_simple_whitespace(config, state)?;
+        Ok(Some(ParenthesizedWhitespace {
+            first_line,
+            empty_lines,
+            indent,
+            last_line,
+        }))
+    } else {
+        Ok(None)
+    }
+}
diff --git a/native/libcst/tests/fixtures/class_craziness.py b/native/libcst/tests/fixtures/class_craziness.py
new file mode 100644
index 000000000..67afc6496
--- /dev/null
+++ b/native/libcst/tests/fixtures/class_craziness.py
@@ -0,0 +1,28 @@
+class Foo: ...
+
+class Bar :
+    ...
+ +class Old ( ) : + gold : int + + +class OO ( Foo ) : ... + +class OOP ( Foo , Bar, ) : pass + +class OOPS ( + Foo , + +) : + pass + +class OOPSI ( Foo, * Bar , metaclass = + foo , +): pass + +class OOPSIE ( list , *args, kw = arg , ** kwargs ) : + what : does_this_even = mean + + def __init__(self) -> None: + self.foo: Bar = Bar() diff --git a/native/libcst/tests/fixtures/comments.py b/native/libcst/tests/fixtures/comments.py new file mode 100644 index 000000000..80830d21f --- /dev/null +++ b/native/libcst/tests/fixtures/comments.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# fmt: on +# Some license here. +# +# Has many lines. Many, many lines. +# Many, many, many lines. +"""Module docstring. + +Possibly also many, many lines. +""" + +import os.path +import sys + +import a +from b.c.d.e import X # some noqa comment + +try: + import fast +except ImportError: + import slow as fast + + +# Some comment before a function. +y = 1 +( + # some strings + y # type: ignore +) + + +def function(default=None): + """Docstring comes first. + + Possibly many lines. + """ + # FIXME: Some comment about why this function is crap but still in production. + import inner_imports + + if inner_imports.are_evil(): + # Explains why we have this if. + # In great detail indeed. + x = X() + return x.method1() # type: ignore + + + # This return is also commented for some reason. + return default + + +# Explains why we use global state. +GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)} + + +# Another comment! +# This time two lines. + + +class Foo: + """Docstring for class Foo. Example from Sphinx docs.""" + + #: Doc comment for class attribute Foo.bar. + #: It can have multiple lines. + bar = 1 + + flox = 1.5 #: Doc comment for Foo.flox. One line only. + + baz = 2 + """Docstring for class attribute Foo.baz.""" + + def __init__(self): + #: Doc comment for instance attribute qux. + self.qux = 3 + + self.spam = 4 + """Docstring for instance attribute spam.""" + + +#'
+#' This is pweave!
+ + +@fast(really=True) +async def wat(): + # This comment, for some reason \ + # contains a trailing backslash. + async with X.open_async() as x: # Some more comments + result = await x.method1() + # Comment after ending a block. + if result: + print("A OK", file=sys.stdout) + # Comment between things. + print() + + +if True: # Hanging comments + # because why not + pass + +# Some closing comments. +# Maybe Vim or Emacs directives for formatting. +# Who knows. diff --git a/native/libcst/tests/fixtures/comparisons.py b/native/libcst/tests/fixtures/comparisons.py new file mode 100644 index 000000000..126ea15e0 --- /dev/null +++ b/native/libcst/tests/fixtures/comparisons.py @@ -0,0 +1,21 @@ +if not 1: pass +if 1 and 1: pass +if 1 or 1: pass +if not not not 1: pass +if not 1 and 1 and 1: pass +if 1 and 1 or 1 and 1 and 1 or not 1 and 1: pass + +if 1: pass +#x = (1 == 1) +if 1 == 1: pass +if 1 != 1: pass +if 1 < 1: pass +if 1 > 1: pass +if 1 <= 1: pass +if 1 >= 1: pass +if x is x: pass +#if x is not x: pass +#if 1 in (): pass +#if 1 not in (): pass +if 1 < 1 > 1 == 1 >= 1 <= 1 != 1 in 1 in x is x is x: pass +#if 1 < 1 > 1 == 1 >= 1 <= 1 != 1 in 1 not in x is x is not x: pass diff --git a/native/libcst/tests/fixtures/decorated_function_without_body.py b/native/libcst/tests/fixtures/decorated_function_without_body.py new file mode 100644 index 000000000..d7c96e02f --- /dev/null +++ b/native/libcst/tests/fixtures/decorated_function_without_body.py @@ -0,0 +1,3 @@ +@hello +@bello +def f () : ... \ No newline at end of file diff --git a/native/libcst/tests/fixtures/dysfunctional_del.py b/native/libcst/tests/fixtures/dysfunctional_del.py new file mode 100644 index 000000000..a3fa4575a --- /dev/null +++ b/native/libcst/tests/fixtures/dysfunctional_del.py @@ -0,0 +1,14 @@ +# dysfunctional_del.py + +del a + +del a[1] + +del a.b.c +del ( a, b , c ) +del [ a, b , c ] + +del a , b, c + + +del a[1] , b [ 2] \ No newline at end of file diff --git a/native/libcst/tests/fixtures/expr.py b/native/libcst/tests/fixtures/expr.py new file mode 100644 index 000000000..c1c4e9b7f --- /dev/null +++ b/native/libcst/tests/fixtures/expr.py @@ -0,0 +1,375 @@ +... 
+"some_string" +b"\\xa3" +Name +None +True +False +1 +1.0 +1j +True or False +True or False or None +True and False +True and False and None +(Name1 and Name2) or Name3 +Name1 and Name2 or Name3 +Name1 or (Name2 and Name3) +Name1 or Name2 and Name3 +(Name1 and Name2) or (Name3 and Name4) +Name1 and Name2 or Name3 and Name4 +Name1 or (Name2 and Name3) or Name4 +Name1 or Name2 and Name3 or Name4 +v1 << 2 +1 >> v2 +1 % finished +1 + v2 - v3 * 4 ^ 5 ** v6 / 7 // 8 +((1 + v2) - (v3 * 4)) ^ (((5 ** v6) / 7) // 8) +not great +~great ++value +-1 +~int and not v1 ^ 123 + v2 | True +(~int) and (not ((v1 ^ (123 + v2)) | True)) ++(really ** -(confusing ** ~(operator ** -precedence))) +flags & ~ select.EPOLLIN and waiters.write_task is not None +lambda arg: None +lambda arg : None +lambda a=True: a +lambda a=True : a +lambda a, b, c=True: a +lambda a, b, c=True, *, d=(1 << v2), e='str': a +lambda a, b, c=True, *vararg, d=(v1 << 2), e='str', **kwargs: a + b +lambda a, b, c=True, *vararg, d=(v1 << 2), e='str', **kwargs : a + b +manylambdas = lambda x=lambda y=lambda z=1: z: y(): x() +foo = (lambda port_id, ignore_missing: {"port1": port1_resource, "port2": port2_resource}[port_id]) +1 if True else 2 +str or None if True else str or bytes or None +(str or None) if True else (str or bytes or None) +str or None if (1 if True else 2) else str or bytes or None +(str or None) if (1 if True else 2) else (str or bytes or None) +((super_long_variable_name or None) if (1 if super_long_test_name else 2) else (str or bytes or None)) +{'2.7': dead, '3.7': (long_live or die_hard)} +{'2.7': dead, '3.7': (long_live or die_hard), **{'3.6': verygood}} +{**a, **b, **c} +{"2.7", "3.6", "3.7", "3.8", "3.9"} +{"2.7", "3.6", "3.7", "3.8", "3.9",} +{"2.7", "3.6", "3.7", "3.8", "3.9", ("4.0" if gilectomy else "3.10")} +({"a": "b"}, (True or False), (+value), "string", b"bytes") or None +() +(1,) +(1, 2) +(1, 2, 3) +[] +[ ] +[ 1 , ] +[1, 2, 3, 4, 5, 6, 7, 8, 9, (10 or A), (11 or B), (12 or C)] +[ + 1, + 2, + 3, +] +[*a] +[*range(10)] +[ + *a, + 4, + 5, +] +[ + 4, + *a, + 5, +] +[ + this_is_a_very_long_variable_which_will_force_a_delimiter_split, + element, + another, + *more, +] +{ } +{ 1 , } +{ 1 : 2 , } +{i for i in (1, 2, 3)} +{(i ** 2) for i in (1, 2, 3)} +{(i ** 2) for i, _ in ((1, "a"), (2, "b"), (3, "c"))} +{((i ** 2) + j) for i in (1, 2, 3) for j in (1, 2, 3)} +[i for i in (1, 2, 3)] +[(i ** 2) for i in (1, 2, 3)] +[(i ** 2) for i, _ in ((1, "a"), (2, "b"), (3, "c"))] +[((i ** 2) + j) for i in (1, 2, 3) for j in (1, 2, 3)] +{i: 0 for i in (1, 2, 3)} +{i: j for i, j in ((1, "a"), (2, "b"), (3, "c"))} +{a: b * 2 for a, b in dictionary.items()} +{a: b * -2 for a, b in dictionary.items()} +{ + k: v + for k, v in this_is_a_very_long_variable_which_will_cause_a_trailing_comma_which_breaks_the_comprehension +} +Python3 > Python2 > COBOL +Life is Life +call() +call(arg) +call(kwarg="hey") +call(arg, kwarg="hey") +call(arg, another, kwarg="hey", **kwargs) +call( + this_is_a_very_long_variable_which_will_force_a_delimiter_split, + arg, + another, + kwarg="hey", + **kwargs, +) # note: no trailing comma pre-3.6 +call(*gidgets[:2]) +call(a, *gidgets[:2]) +call(**screen_kwargs) +call(b, **screen_kwargs) +call()()()()()() +call(**self.screen_kwargs) +call(b, **self.screen_kwargs) +call(a=a, *args) +call(a=a, *args,) +call(a=a, **kwargs) +call(a=a, **kwargs,) +lukasz.langa.pl +call.me(maybe) +1 .real +1.0 .real +....__class__ +list[str] +dict[str, int] +tuple[str, ...] 
+tuple[str, int, float, dict[str, int]] +tuple[ + str, + int, + float, + dict[str, int], +] +very_long_variable_name_filters: t.List[ + t.Tuple[str, t.Union[str, t.List[t.Optional[str]]]], +] +xxxx_xxxxx_xxxx_xxx: Callable[..., List[SomeClass]] = classmethod( # type: ignore + sync(async_xxxx_xxx_xxxx_xxxxx_xxxx_xxx.__func__) +) +xxxx_xxx_xxxx_xxxxx_xxxx_xxx: Callable[..., List[SomeClass]] = classmethod( # type: ignore + sync(async_xxxx_xxx_xxxx_xxxxx_xxxx_xxx.__func__) +) +xxxx_xxx_xxxx_xxxxx_xxxx_xxx: Callable[..., List[SomeClass]] = classmethod( + sync(async_xxxx_xxx_xxxx_xxxxx_xxxx_xxx.__func__) +) # type: ignore +(str or None) if (sys.version_info[0] > (3,)) else (str or bytes or None) +{"2.7": dead, "3.7": long_live or die_hard} +{"2.7", "3.6", "3.7", "3.8", "3.9", "4.0" if gilectomy else "3.10"} +[1, 2, 3, 4, 5, 6, 7, 8, 9, 10 or A, 11 or B, 12 or C] +(SomeName) +SomeName +(Good, Bad, Ugly) +(i for i in (1, 2, 3)) +((i ** 2) for i in (1, 2, 3)) +((i ** 2) for i, _ in ((1, "a"), (2, "b"), (3, "c"))) +(((i ** 2) + j) for i in (1, 2, 3) for j in (1, 2, 3)) +(*starred,) +{ + "id": "1", + "type": "type", + "started_at": now(), + "ended_at": now() + timedelta(days=10), + "priority": 1, + "import_session_id": 1, + **kwargs, +} +a = (1,) +b = (1,) +c = 1 +d = (1,) + a + (2,) +e = (1,).count(1) +f = 1, *range(10) +g = 1, *"ten" +what_is_up_with_those_new_coord_names = (coord_names + set(vars_to_create)) + set( + vars_to_remove +) +what_is_up_with_those_new_coord_names = (coord_names | set(vars_to_create)) - set( + vars_to_remove +) +result = ( + session.query(models.Customer.id) + .filter( + models.Customer.account_id == account_id, models.Customer.email == email_address + ) + .order_by(models.Customer.id.asc()) + .all() +) +result = ( + session.query(models.Customer.id) + .filter( + models.Customer.account_id == account_id, models.Customer.email == email_address + ) + .order_by( + models.Customer.id.asc(), + ) + .all() +) +Ø = set() +authors.łukasz.say_thanks() +authors.lukasz.say_thanks() +mapping = { + A: 0.25 * (10.0 / 12), + B: 0.1 * (10.0 / 12), + C: 0.1 * (10.0 / 12), + D: 0.1 * (10.0 / 12), +} +[ + a + for + [ + a , + ] + in + [ + [ 1 ] + ] +] + +def gen(): + if 1: + if 2: + if 3: + if not is_value_of_type( + subkey, + type_args[0], + # key type is always invariant + invariant_check=True, + ): + return False + yield from outside_of_generator + a = yield + b = yield + c = yield + + +async def f(): + await some.complicated[0].call(with_args=(True or (1 is not 1))) + +lambda : None + +print(*[] or [1]) +print(**{1: 3} if False else {x: x for x in range(3)}) +print(*lambda x: x) +assert not Test, "Short message" +assert this is ComplexTest and not requirements.fit_in_a_single_line( + force=False +), "Short message" +assert parens is TooMany +for (x,) in (1,), (2,), (3,): + ... +for y in (): + ... +for z in (i for i in (1, 2, 3)): + ... +for i in call(): + ... +for j in 1 + (2 + 3): + ... +else: + ... +while this and that: + ... +while this and that: + ... +else: + ... 
+for ( + addr_family, + addr_type, + addr_proto, + addr_canonname, + addr_sockaddr, +) in socket.getaddrinfo("google.com", "http"): + pass +a = ( + aaaa.bbbb.cccc.dddd.eeee.ffff.gggg.hhhh.iiii.jjjj.kkkk.llll.mmmm.nnnn.oooo.pppp + in qqqq.rrrr.ssss.tttt.uuuu.vvvv.xxxx.yyyy.zzzz +) +a = ( + aaaa.bbbb.cccc.dddd.eeee.ffff.gggg.hhhh.iiii.jjjj.kkkk.llll.mmmm.nnnn.oooo.pppp + not in qqqq.rrrr.ssss.tttt.uuuu.vvvv.xxxx.yyyy.zzzz +) +a = ( + aaaa.bbbb.cccc.dddd.eeee.ffff.gggg.hhhh.iiii.jjjj.kkkk.llll.mmmm.nnnn.oooo.pppp + is qqqq.rrrr.ssss.tttt.uuuu.vvvv.xxxx.yyyy.zzzz +) +a = ( + aaaa.bbbb.cccc.dddd.eeee.ffff.gggg.hhhh.iiii.jjjj.kkkk.llll.mmmm.nnnn.oooo.pppp + is not qqqq.rrrr.ssss.tttt.uuuu.vvvv.xxxx.yyyy.zzzz +) +if ( + threading.current_thread() != threading.main_thread() + and threading.current_thread() != threading.main_thread() + or signal.getsignal(signal.SIGINT) != signal.default_int_handler +): + return True +if ( + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + | aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +): + return True +if ( + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + & aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +): + return True +if ( + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +): + return True +if ( + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + - aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +): + return True +if ( + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + * aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +): + return True +if ( + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + / aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +): + return True +if ( + ~aaaa.a + aaaa.b - aaaa.c * aaaa.d / aaaa.e + | aaaa.f & aaaa.g % aaaa.h ^ aaaa.i << aaaa.k >> aaaa.l ** aaaa.m // aaaa.n +): + return True +if ( + ~aaaaaaaa.a + aaaaaaaa.b - aaaaaaaa.c @ aaaaaaaa.d / aaaaaaaa.e + | aaaaaaaa.f & aaaaaaaa.g % aaaaaaaa.h + ^ aaaaaaaa.i << aaaaaaaa.k >> aaaaaaaa.l ** aaaaaaaa.m // aaaaaaaa.n +): + return True +if ( + ~aaaaaaaaaaaaaaaa.a + + aaaaaaaaaaaaaaaa.b + - aaaaaaaaaaaaaaaa.c * aaaaaaaaaaaaaaaa.d @ aaaaaaaaaaaaaaaa.e + | aaaaaaaaaaaaaaaa.f & aaaaaaaaaaaaaaaa.g % aaaaaaaaaaaaaaaa.h + ^ aaaaaaaaaaaaaaaa.i + << aaaaaaaaaaaaaaaa.k + >> aaaaaaaaaaaaaaaa.l ** aaaaaaaaaaaaaaaa.m // aaaaaaaaaaaaaaaa.n +): + return True +aaaaaaaaaaaaaaaa + aaaaaaaaaaaaaaaa - aaaaaaaaaaaaaaaa * ( + aaaaaaaaaaaaaaaa + aaaaaaaaaaaaaaaa +) / (aaaaaaaaaaaaaaaa + aaaaaaaaaaaaaaaa + aaaaaaaaaaaaaaaa) +aaaaaaaaaaaaaaaa + aaaaaaaaaaaaaaaa +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa >> aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa << aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +bbbb >> bbbb * bbbb +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ^ bbbb.a & aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ^ aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa + +a += B +a[x] @= foo().bar +this.is_not >>= a.monad +last_call() +# standalone comment at ENDMARKER diff --git a/native/libcst/tests/fixtures/expr_statement.py b/native/libcst/tests/fixtures/expr_statement.py new file mode 100644 index 000000000..4ef73f081 --- /dev/null +++ b/native/libcst/tests/fixtures/expr_statement.py @@ -0,0 +1,11 @@ +1 +1, 2, 3 +x = 1 +x = 1, 2, 3 +x = y = z = 1, 2, 3 
+x, y, z = 1, 2, 3 +abc = a, b, c = x, y, z = xyz = 1, 2, (3, 4) + +( ( ( ... ) ) ) + +a , = b \ No newline at end of file diff --git a/native/libcst/tests/fixtures/fun_with_func_defs.py b/native/libcst/tests/fixtures/fun_with_func_defs.py new file mode 100644 index 000000000..1a78d687d --- /dev/null +++ b/native/libcst/tests/fixtures/fun_with_func_defs.py @@ -0,0 +1,146 @@ +def f(a, /,): pass +def f(a, /, c, d, e): pass +def f(a, /, c, *, d, e): pass +def f(a, /, c, *, d, e, **kwargs): pass +def f(a=1, /,): pass +def f(a=1, /, b=2, c=4): pass +def f(a=1, /, b=2, *, c=4): pass +def f(a=1, /, b=2, *, c): pass +def f(a=1, /, b=2, *, c=4, **kwargs): pass +def f(a=1, /, b=2, *, c, **kwargs,): pass + + +def g( + a, + /, +): + pass + + +def f(a, /, c, d, e): + pass + + +def f(a, /, c, *, d, e): + pass + + +def f( + a, + /, + c, + *, + d, + e, + **kwargs, +): + pass + + +def f( + a=1, + /, +): + pass + + +def f(a=1, /, b=2, c=4): + pass + + +def f(a=1, /, b=2, *, c=4): + pass + + +def f(a=1, /, b=2, *, c): + pass + + +def f( + a=1, + /, + b=2, + *, + c=4, + **kwargs, +): + pass + + +def f( + a=1, + /, + b=2, + *, + c, + **kwargs, +): + pass + + +async def foo ( + bar : Baz , +) -> zooooooooom : ... + + +async def foo(bar : Baz = 0 ) : ... + +async def foo() -> Bar: ... + +async def outer( + foo +) -> Bar : + def inner(lol: Lol) -> None: + async def core (): + await lol + def second(inner): + pass + +def stars ( + yes : bool = True , + / , + noes : List[bool] = [ * falses ], + * all : The[Rest], + but : Wait[Theres[More]] , + ** it : ends[now] , + +) -> ret: + pass + +def stars ( + yes : bool = True , + / , + noes : List[bool] = [ * falses ], + * all : The[Rest], + but : Wait[Theres[More]] , + ** it : ends[now[without_a_comma]] + +) -> ret : + pass + + +def foo(bar: (yield)) -> (yield): something: (yield another) + +def foo( bar: (yield)) -> (yield) : + something: (yield another) + return 3 # no + return # yes + + +def f(): + for (yield 1)[1] in [1]: + pass + + +@decorators +# foo +@woohoo +def f(): + pass + +@getattr(None, '', lambda a: lambda b: a(b+1)) +def f(): ... + + +@a(now_this = lol) +def f(): ... diff --git a/native/libcst/tests/fixtures/global_nonlocal.py b/native/libcst/tests/fixtures/global_nonlocal.py new file mode 100644 index 000000000..a9839aef9 --- /dev/null +++ b/native/libcst/tests/fixtures/global_nonlocal.py @@ -0,0 +1,4 @@ +global a +global b , c, d +nonlocal a +nonlocal a , b \ No newline at end of file diff --git a/native/libcst/tests/fixtures/import.py b/native/libcst/tests/fixtures/import.py new file mode 100644 index 000000000..571e3640c --- /dev/null +++ b/native/libcst/tests/fixtures/import.py @@ -0,0 +1,19 @@ +# 'import' dotted_as_names +import sys +import time, sys +# 'from' dotted_name 'import' ('*' | '(' import_as_names ')' | import_as_names) +from time import time +from time import (time) +from sys import path, argv +from sys import (path, argv) +from sys import (path, argv,) +from sys import * + + +from a import (b, ) +from . import a +from .a import b +from ... import a +from ...a import b +from .... import a +from ...... 
import a \ No newline at end of file diff --git a/native/libcst/tests/fixtures/indents_but_no_eol_before_eof.py b/native/libcst/tests/fixtures/indents_but_no_eol_before_eof.py new file mode 100644 index 000000000..22fd13095 --- /dev/null +++ b/native/libcst/tests/fixtures/indents_but_no_eol_before_eof.py @@ -0,0 +1,4 @@ +if 1: + if 2: + if 3: + pass \ No newline at end of file diff --git a/native/libcst/tests/fixtures/just_a_comment_without_nl.py b/native/libcst/tests/fixtures/just_a_comment_without_nl.py new file mode 100644 index 000000000..56e1a2f86 --- /dev/null +++ b/native/libcst/tests/fixtures/just_a_comment_without_nl.py @@ -0,0 +1 @@ +# just a comment without a newline \ No newline at end of file diff --git a/native/libcst/tests/fixtures/raise.py b/native/libcst/tests/fixtures/raise.py new file mode 100644 index 000000000..286138627 --- /dev/null +++ b/native/libcst/tests/fixtures/raise.py @@ -0,0 +1,4 @@ +raise +raise foo +raise foo from bar +raise lol() from f() + 1 \ No newline at end of file diff --git a/native/libcst/tests/fixtures/smol_statements.py b/native/libcst/tests/fixtures/smol_statements.py new file mode 100644 index 000000000..93687bceb --- /dev/null +++ b/native/libcst/tests/fixtures/smol_statements.py @@ -0,0 +1,4 @@ +def f(): + pass ; break ; continue ; return ; return foo + + assert foo , bar ; a += 2 \ No newline at end of file diff --git a/native/libcst/tests/fixtures/spacious_spaces.py b/native/libcst/tests/fixtures/spacious_spaces.py new file mode 100644 index 000000000..5c979eee1 --- /dev/null +++ b/native/libcst/tests/fixtures/spacious_spaces.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/native/libcst/tests/fixtures/suicidal_slices.py b/native/libcst/tests/fixtures/suicidal_slices.py new file mode 100644 index 000000000..8d9566e83 --- /dev/null +++ b/native/libcst/tests/fixtures/suicidal_slices.py @@ -0,0 +1,28 @@ +slice[0] +slice[0:1] +slice[0:1:2] +slice[:] +slice[:-1] +slice[1:] +slice[::-1] +slice[d :: d + 1] +slice[:c, c - 1] +numpy[:, 0:1] +numpy[:, :-1] +numpy[0, :] +numpy[:, i] +numpy[0, :2] +numpy[:N, 0] +numpy[:2, :4] +numpy[2:4, 1:5] +numpy[4:, 2:] +numpy[:, (0, 1, 2, 5)] +numpy[0, [0]] +numpy[:, [i]] +numpy[1 : c + 1, c] +numpy[-(c + 1) :, d] +numpy[:, l[-2]] +numpy[:, ::-1] +numpy[np.newaxis, :] + +( spaces [:: , a : , a : a : a , ] ) \ No newline at end of file diff --git a/native/libcst/tests/fixtures/super_strings.py b/native/libcst/tests/fixtures/super_strings.py new file mode 100644 index 000000000..76b1b2ddd --- /dev/null +++ b/native/libcst/tests/fixtures/super_strings.py @@ -0,0 +1,28 @@ +_ = "" +_ = '' +_ = """""" +_ = '''''' + +_ = 'a' "string" 'that' r"is" 'concatenated ' + +b"string " +b"and non f" rb'string' + +( + "parenthesized" + "concatenated" + """triple + quoted + """ + +) + +_ = f"string" + +f"string" "bonanza" f'starts' r"""here""" + +_ = f"something {{**not** an expression}} {but(this._is)} {{and this isn't.}} end" + +_(f"ok { expr = !r: aosidjhoi } end") + +print(f"{self.ERASE_CURRENT_LINE}{self._human_seconds(elapsed_time)} {percent:.{self.pretty_precision}f}% complete, {self.estimate_completion(elapsed_time, finished, left)} estimated for {left} files to go...") diff --git a/native/libcst/tests/fixtures/terrible_tries.py b/native/libcst/tests/fixtures/terrible_tries.py new file mode 100644 index 000000000..91d6831ee --- /dev/null +++ b/native/libcst/tests/fixtures/terrible_tries.py @@ -0,0 +1,71 @@ +#foo. 
+ +try : + bar() + +finally : + pass + + +try : + pass + + # foo + +except lol as LOL : + + pass + +except : + + # foo + + pass + +else : + + pass + +finally : + + foo + +try: + pass +except: + pass +finally: + pass + + +try: + + # 1 + + try: + + # 2 + + pass + + # 3 + + # 4 + + finally: + + # 5 + + pass + + # 6 + + # 7 + +except foo: + + #8 + + pass + + #9 diff --git a/native/libcst/tests/fixtures/trailing_comment_without_nl.py b/native/libcst/tests/fixtures/trailing_comment_without_nl.py new file mode 100644 index 000000000..0c58f18d3 --- /dev/null +++ b/native/libcst/tests/fixtures/trailing_comment_without_nl.py @@ -0,0 +1,4 @@ + + + +# hehehe >:) \ No newline at end of file diff --git a/native/libcst/tests/fixtures/tuple_shenanigans.py b/native/libcst/tests/fixtures/tuple_shenanigans.py new file mode 100644 index 000000000..f31c64523 --- /dev/null +++ b/native/libcst/tests/fixtures/tuple_shenanigans.py @@ -0,0 +1,28 @@ +(1, 2) +(1, 2, 3) + +# alright here we go. + +() +( # evil >:) + # evil >:( +) # ... +(1,) +( * 1 , * 2 ,) +*_ = (l,) +() = x +( ) = ( x, ) +(x) = (x) +( x , ) = x +( x , *y , * z , ) = l +( x , *y , * z , ) = ( x , *y , * z , ) = ( x , *y , * z , x ) +( + x , # :) + bar, * + baz + , +) =\ +( + (let, *s, ( ) ) , + nest , them , ( * t , * u , * p , l , * e , s , ) +) \ No newline at end of file diff --git a/native/libcst/tests/fixtures/vast_emptiness.py b/native/libcst/tests/fixtures/vast_emptiness.py new file mode 100644 index 000000000..e69de29bb diff --git a/native/libcst/tests/fixtures/with_wickedness.py b/native/libcst/tests/fixtures/with_wickedness.py new file mode 100644 index 000000000..7cb5c67d3 --- /dev/null +++ b/native/libcst/tests/fixtures/with_wickedness.py @@ -0,0 +1,13 @@ +# with_wickedness + +with foo : ... + +async def f(): + async with foo as bar: + + with bar: + pass + + async with foo(1+1) as bar , 1 as (a, b, ) , 2 as [a, b] , 3 as a[b] : + pass + diff --git a/native/libcst/tests/fixtures/wonky_walrus.py b/native/libcst/tests/fixtures/wonky_walrus.py new file mode 100644 index 000000000..d0916ab80 --- /dev/null +++ b/native/libcst/tests/fixtures/wonky_walrus.py @@ -0,0 +1,13 @@ +( foo := 5 ) + +any((lastNum := num) == 1 for num in [1, 2, 3]) + +[(lastNum := num) == 1 for num in [1, 2, 3]] + +while f := x(): + pass + +if f := x(): pass + +f(y:=1) +f(x, y := 1 ) \ No newline at end of file diff --git a/native/libcst/tests/parser_roundtrip.rs b/native/libcst/tests/parser_roundtrip.rs new file mode 100644 index 000000000..2b8c79dd8 --- /dev/null +++ b/native/libcst/tests/parser_roundtrip.rs @@ -0,0 +1,50 @@ +use difference::assert_diff; +use itertools::Itertools; +use libcst_native::{parse_module, prettify_error, Codegen}; +use std::{ + iter::once, + path::{Component, PathBuf}, +}; + +fn all_fixtures() -> impl Iterator { + let mut path = PathBuf::from(file!()); + path.pop(); + path = path + .components() + .skip(1) + .chain(once(Component::Normal("fixtures".as_ref()))) + .collect(); + + path.read_dir().expect("read_dir").into_iter().map(|file| { + let path = file.unwrap().path(); + let contents = std::fs::read_to_string(&path).expect("reading file"); + (path, contents) + }) +} + +#[test] +fn roundtrip_fixtures() { + for (path, input) in all_fixtures() { + let input = if let Some(stripped) = input.strip_prefix('\u{feff}') { + stripped + } else { + &input + }; + let m = match parse_module(input, None) { + Ok(m) => m, + Err(e) => panic!("{}", prettify_error(e, format!("{:#?}", path).as_ref())), + }; + let mut state = Default::default(); + m.codegen(&mut 
state); + let generated = state.to_string(); + if generated != input { + let got = visualize(&generated); + let expected = visualize(input); + assert_diff!(expected.as_ref(), got.as_ref(), "", 0); + } + } +} + +fn visualize(s: &str) -> String { + s.replace(' ', "▩").lines().join("↩\n") +} diff --git a/native/libcst_derive/Cargo.toml b/native/libcst_derive/Cargo.toml new file mode 100644 index 000000000..95bf4d2de --- /dev/null +++ b/native/libcst_derive/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "libcst_derive" +version = "0.1.0" +edition = "2018" + +[lib] +proc-macro = true + +[dependencies] +syn = "1.0" +quote = "1.0" diff --git a/native/libcst_derive/src/codegen.rs b/native/libcst_derive/src/codegen.rs new file mode 100644 index 000000000..208d5104f --- /dev/null +++ b/native/libcst_derive/src/codegen.rs @@ -0,0 +1,63 @@ +use proc_macro::TokenStream; +use quote::{quote, quote_spanned}; +use syn::{self, spanned::Spanned, Data, DataEnum, DeriveInput, Fields, FieldsUnnamed}; + +pub(crate) fn impl_codegen(ast: &DeriveInput) -> TokenStream { + match &ast.data { + Data::Enum(e) => impl_enum(ast, e), + Data::Struct(s) => quote_spanned! { + s.struct_token.span() => + compile_error!("Struct type is not supported") + } + .into(), + Data::Union(u) => quote_spanned! { + u.union_token.span() => + compile_error!("Union type is not supported") + } + .into(), + } +} + +fn impl_enum(ast: &DeriveInput, e: &DataEnum) -> TokenStream { + let mut varnames = vec![]; + for var in e.variants.iter() { + match &var.fields { + Fields::Named(n) => { + return quote_spanned! { + n.span() => + compile_error!("Named enum fields not supported") + } + .into() + } + f @ Fields::Unit => { + return quote_spanned! { + f.span() => + compile_error!("Empty enum variants not supported") + } + .into() + } + Fields::Unnamed(FieldsUnnamed { unnamed, .. }) => { + if unnamed.len() > 1 { + return quote_spanned! { + unnamed.span() => + compile_error!("Multiple unnamed fields not supported") + } + .into(); + } + varnames.push(&var.ident); + } + } + } + let ident = &ast.ident; + let generics = &ast.generics; + let gen = quote! { + impl<'a> Codegen<'a> for #ident #generics { + fn codegen(&self, state: &mut CodegenState<'a>) { + match self { + #(Self::#varnames(x) => x.codegen(state),)* + } + } + } + }; + gen.into() +} diff --git a/native/libcst_derive/src/inflate.rs b/native/libcst_derive/src/inflate.rs new file mode 100644 index 000000000..6223a2565 --- /dev/null +++ b/native/libcst_derive/src/inflate.rs @@ -0,0 +1,63 @@ +use proc_macro::TokenStream; +use quote::{quote, quote_spanned}; +use syn::{self, spanned::Spanned, Data, DataEnum, DeriveInput, Fields, FieldsUnnamed}; + +pub(crate) fn impl_inflate(ast: &DeriveInput) -> TokenStream { + match &ast.data { + Data::Enum(e) => impl_inflate_enum(ast, e), + Data::Struct(s) => quote_spanned! { + s.struct_token.span() => + compile_error!("Struct type is not supported") + } + .into(), + Data::Union(u) => quote_spanned! { + u.union_token.span() => + compile_error!("Union type is not supported") + } + .into(), + } +} + +fn impl_inflate_enum(ast: &DeriveInput, e: &DataEnum) -> TokenStream { + let mut varnames = vec![]; + for var in e.variants.iter() { + match &var.fields { + Fields::Named(n) => { + return quote_spanned! { + n.span() => + compile_error!("Named enum fields not supported") + } + .into() + } + f @ Fields::Unit => { + return quote_spanned! { + f.span() => + compile_error!("Empty enum variants not supported") + } + .into() + } + Fields::Unnamed(FieldsUnnamed { unnamed, .. 
}) => {
+                if unnamed.len() > 1 {
+                    return quote_spanned! {
+                        unnamed.span() =>
+                        compile_error!("Multiple unnamed fields not supported")
+                    }
+                    .into();
+                }
+                varnames.push(&var.ident);
+            }
+        }
+    }
+    let ident = &ast.ident;
+    let generics = &ast.generics;
+    let gen = quote! {
+        impl<'a> Inflate<'a> for #ident #generics {
+            fn inflate(mut self, config: &crate::tokenizer::whitespace_parser::Config<'a>) -> std::result::Result<Self, crate::tokenizer::whitespace_parser::WhitespaceError> {
+                match self {
+                    #(Self::#varnames(x) => Ok(Self::#varnames(x.inflate(config)?)),)*
+                }
+            }
+        }
+    };
+    gen.into()
+}
diff --git a/native/libcst_derive/src/into_py.rs b/native/libcst_derive/src/into_py.rs
new file mode 100644
index 000000000..26da5fca6
--- /dev/null
+++ b/native/libcst_derive/src/into_py.rs
@@ -0,0 +1,177 @@
+use proc_macro::TokenStream;
+use quote::{format_ident, quote, quote_spanned, ToTokens};
+use syn::{
+    spanned::Spanned, Attribute, Data, DataEnum, DataStruct, DeriveInput, Fields, FieldsNamed,
+    FieldsUnnamed, Type, TypePath, Visibility,
+};
+
+pub(crate) fn impl_into_py(ast: &DeriveInput) -> TokenStream {
+    match &ast.data {
+        Data::Enum(e) => impl_into_py_enum(ast, e),
+        Data::Struct(s) => impl_into_py_struct(ast, s),
+        Data::Union(u) => quote_spanned! {
+            u.union_token.span() =>
+            compile_error!("Union type is not supported")
+        }
+        .into(),
+    }
+}
+
+fn impl_into_py_enum(ast: &DeriveInput, e: &DataEnum) -> TokenStream {
+    let mut toks = vec![];
+    for var in e.variants.iter() {
+        let varname = &var.ident;
+        match &var.fields {
+            Fields::Named(n) => {
+                let mut fieldnames = vec![];
+                for field in n.named.iter() {
+                    if has_attr(&field.attrs, "skip_py") {
+                        continue;
+                    }
+                    fieldnames.push(field.ident.as_ref().unwrap());
+                }
+                let kwargs_toks = fields_to_kwargs(&var.fields, true);
+                toks.push(quote! {
+                    Self::#varname { #(#fieldnames,)* .. } => {
+                        let libcst = pyo3::types::PyModule::import(py, "libcst").expect("libcst couldn't be imported");
+                        let kwargs = #kwargs_toks ;
+                        libcst
+                            .getattr(stringify!(#varname))
+                            .expect(stringify!(no #varname found in libcst))
+                            .call((), Some(kwargs))
+                            .expect(stringify!(conversion failed for #varname))
+                            .into()
+                    }
+                })
+            }
+            f @ Fields::Unit => {
+                return quote_spanned! {
+                    f.span() =>
+                    compile_error!("Empty enum variants not supported")
+                }
+                .into()
+            }
+            Fields::Unnamed(_) => {
+                toks.push(quote! {
+                    Self::#varname(x, ..) => x.into_py(py),
+                });
+            }
+        }
+    }
+    let ident = &ast.ident;
+    let generics = &ast.generics;
+    let gen = quote! {
+        use pyo3::types::IntoPyDict as _;
+        #[automatically_derived]
+        impl#generics pyo3::conversion::IntoPy<pyo3::PyObject> for #ident #generics {
+            fn into_py(self, py: pyo3::Python) -> pyo3::PyObject {
+                match self {
+                    #(#toks)*
+                }
+            }
+        }
+    };
+    gen.into()
+}
+
+fn impl_into_py_struct(ast: &DeriveInput, e: &DataStruct) -> TokenStream {
+    let kwargs_toks = fields_to_kwargs(&e.fields, false);
+    let ident = &ast.ident;
+    let generics = &ast.generics;
+    let gen = quote! {
+        use pyo3::types::IntoPyDict as _;
+        #[automatically_derived]
+        impl#generics pyo3::conversion::IntoPy<pyo3::PyObject> for #ident #generics {
+            fn into_py(self, py: pyo3::Python) -> pyo3::PyObject {
+                let libcst = pyo3::types::PyModule::import(py, "libcst").expect("libcst couldn't be imported");
+                let kwargs = #kwargs_toks ;
+                libcst
+                    .getattr(stringify!(#ident))
+                    .expect(stringify!(no #ident found in libcst))
+                    .call((), Some(kwargs))
+                    .expect(stringify!(conversion failed for #ident))
+                    .into()
+            }
+        }
+    };
+    gen.into()
+}
+
+fn fields_to_kwargs(fields: &Fields, is_enum: bool) -> quote::__private::TokenStream {
+    let mut empty_kwargs = false;
+    let mut py_varnames = vec![];
+    let mut rust_varnames = vec![];
+    let mut optional_py_varnames = vec![];
+    let mut optional_rust_varnames = vec![];
+    match &fields {
+        Fields::Named(FieldsNamed { named, .. }) => {
+            for field in named.iter() {
+                if has_attr(&field.attrs, "skip_py") {
+                    continue;
+                }
+                if let Some(ident) = field.ident.as_ref() {
+                    let include = if let Visibility::Public(_) = field.vis {
+                        true
+                    } else {
+                        is_enum
+                    };
+                    if include {
+                        let pyname = format_ident!("{}", ident);
+                        let rustname = if is_enum {
+                            ident.to_token_stream()
+                        } else {
+                            quote! { self.#ident }
+                        };
+                        if !has_attr(&field.attrs, "no_py_default") {
+                            if let Type::Path(TypePath { path, .. }) = &field.ty {
+                                if let Some(first) = path.segments.first() {
+                                    if first.ident == "Option" {
+                                        optional_py_varnames.push(pyname);
+                                        optional_rust_varnames.push(rustname);
+                                        continue;
+                                    }
+                                }
+                            }
+                        }
+                        py_varnames.push(pyname);
+                        rust_varnames.push(rustname);
+                    }
+                }
+            }
+            empty_kwargs = py_varnames.is_empty() && optional_py_varnames.is_empty();
+        }
+        Fields::Unnamed(FieldsUnnamed { unnamed, .. }) => {
+            if unnamed.first().is_some() {
+                py_varnames.push(format_ident!("value"));
+                rust_varnames.push(quote! { self.0 });
+            } else {
+                empty_kwargs = true;
+            }
+        }
+        Fields::Unit => {
+            empty_kwargs = true;
+        }
+    };
+    let kwargs_pairs = quote! {
+        #(Some((stringify!(#py_varnames), #rust_varnames.into_py(py))),)*
+    };
+    let optional_pairs = quote! {
+        #(#optional_rust_varnames.map(|x| (stringify!(#optional_py_varnames), x.into_py(py))),)*
+    };
+    if empty_kwargs {
+        quote! { pyo3::types::PyDict::new(py) }
+    } else {
+        quote! {
+            [ #kwargs_pairs #optional_pairs ]
+                .iter()
+                .filter(|x| x.is_some())
+                .map(|x| x.as_ref().unwrap())
+                .collect::<Vec<_>>()
+                .into_py_dict(py)
+        }
+    }
+}
+
+fn has_attr(attrs: &[Attribute], name: &'static str) -> bool {
+    attrs.iter().any(|attr| attr.path.is_ident(name))
+}
diff --git a/native/libcst_derive/src/lib.rs b/native/libcst_derive/src/lib.rs
new file mode 100644
index 000000000..4a687becf
--- /dev/null
+++ b/native/libcst_derive/src/lib.rs
@@ -0,0 +1,31 @@
+mod inflate;
+use inflate::impl_inflate;
+mod parenthesized_node;
+use parenthesized_node::impl_parenthesized_node;
+mod codegen;
+use codegen::impl_codegen;
+mod into_py;
+use into_py::impl_into_py;
+
+use proc_macro::TokenStream;
+
+#[proc_macro_derive(Inflate)]
+pub fn inflate_derive(input: TokenStream) -> TokenStream {
+    let ast = syn::parse(input).unwrap();
+    impl_inflate(&ast)
+}
+
+#[proc_macro_derive(ParenthesizedNode)]
+pub fn parenthesized_node_derive(input: TokenStream) -> TokenStream {
+    impl_parenthesized_node(&syn::parse(input).unwrap())
+}
+
+#[proc_macro_derive(Codegen)]
+pub fn parenthesized_node_codegen(input: TokenStream) -> TokenStream {
+    impl_codegen(&syn::parse(input).unwrap())
+}
+
+#[proc_macro_derive(IntoPy, attributes(skip_py, no_py_default))]
+pub fn into_py(input: TokenStream) -> TokenStream {
+    impl_into_py(&syn::parse(input).unwrap())
+}
diff --git a/native/libcst_derive/src/parenthesized_node.rs b/native/libcst_derive/src/parenthesized_node.rs
new file mode 100644
index 000000000..52d2aab45
--- /dev/null
+++ b/native/libcst_derive/src/parenthesized_node.rs
@@ -0,0 +1,93 @@
+use proc_macro::TokenStream;
+use quote::{quote, quote_spanned};
+use syn::{spanned::Spanned, Data, DataEnum, DeriveInput, Fields, FieldsUnnamed};
+
+pub(crate) fn impl_parenthesized_node(ast: &DeriveInput) -> TokenStream {
+    match &ast.data {
+        Data::Enum(e) => impl_enum(ast, e),
+        Data::Struct(_) => impl_struct(ast),
+        Data::Union(u) => quote_spanned! {
+            u.union_token.span() =>
+            compile_error!("Union type is not supported")
+        }
+        .into(),
+    }
+}
+
+fn impl_struct(ast: &DeriveInput) -> TokenStream {
+    let ident = &ast.ident;
+    let generics = &ast.generics;
+    let gen = quote! {
+        impl<'a> ParenthesizedNode<'a> for #ident #generics {
+            fn lpar(&self) -> &Vec<LeftParen<'a>> {
+                &self.lpar
+            }
+            fn rpar(&self) -> &Vec<RightParen<'a>> {
+                &self.rpar
+            }
+            fn with_parens(self, left: LeftParen<'a>, right: RightParen<'a>) -> Self {
+                let mut lpar = self.lpar;
+                let mut rpar = self.rpar;
+                lpar.insert(0, left);
+                rpar.push(right);
+                #[allow(clippy::needless_update)]
+                Self { lpar, rpar, ..self }
+            }
+        }
+    };
+    gen.into()
+}
+
+fn impl_enum(ast: &DeriveInput, e: &DataEnum) -> TokenStream {
+    let mut varnames = vec![];
+    for var in e.variants.iter() {
+        match &var.fields {
+            Fields::Named(n) => {
+                return quote_spanned! {
+                    n.span() =>
+                    compile_error!("Named enum fields not supported")
+                }
+                .into()
+            }
+            f @ Fields::Unit => {
+                return quote_spanned! {
+                    f.span() =>
+                    compile_error!("Empty enum variants not supported")
+                }
+                .into()
+            }
+            Fields::Unnamed(FieldsUnnamed { unnamed, .. }) => {
+                if unnamed.len() > 1 {
+                    return quote_spanned! {
+                        unnamed.span() =>
+                        compile_error!("Multiple unnamed fields not supported")
+                    }
+                    .into();
+                }
+                varnames.push(&var.ident);
+            }
+        }
+    }
+    let ident = &ast.ident;
+    let generics = &ast.generics;
+    let gen = quote! {
+        impl<'a> ParenthesizedNode<'a> for #ident #generics {
+            fn lpar(&self) -> &Vec<LeftParen<'a>> {
+                match self {
+                    #(Self::#varnames(x) => x.lpar(),)*
+                }
+            }
+            fn rpar(&self) -> &Vec<RightParen<'a>> {
+                match self {
+                    #(Self::#varnames(x) => x.rpar(),)*
+                }
+            }
+            fn with_parens(self, left: LeftParen<'a>, right: RightParen<'a>) -> Self {
+                match self {
+                    #(Self::#varnames(x) => Self::#varnames(x.with_parens(left, right)),)*
+                }
+            }
+        }
+    };
+    gen.into()
+}
diff --git a/native/roundtrip.sh b/native/roundtrip.sh
new file mode 100755
index 000000000..3f732143c
--- /dev/null
+++ b/native/roundtrip.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+PARSE=$(dirname $0)/target/release/parse
+
+exec diff -u "$1" <($PARSE < "$1")
diff --git a/pyproject.toml b/pyproject.toml
index c9a93f4de..3f372901c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,2 +1,9 @@
 [tool.black]
 target-version = ["py36"]
+exclude = "native/.*"
+
+[tool.ufmt]
+excludes = ["native/", "stubs/"]
+
+[build-system]
+requires = ["setuptools", "wheel", "setuptools-rust"]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 5bf3217bd..9223d3a35 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,11 +6,13 @@ git+https://github.com/jimmylai/sphinx.git@slots_type_annotation
 hypothesis>=4.36.0
 hypothesmith>=0.0.4
 jupyter>=1.0.0
+maturin>=0.8.3,<0.9
 nbsphinx>=0.4.2
 prompt-toolkit>=2.0.9
 pyre-check==0.9.3
 setuptools_scm>=6.0.1
 sphinx-rtd-theme>=0.4.3
 tox>=3.18.1
-ufmt==1.2
+ufmt==1.3
 usort==0.6.3
+setuptools-rust>=0.12.1
diff --git a/setup.py b/setup.py
index 593acd7b0..4e35ca687 100644
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,8 @@ from os import path, environ
 
 import setuptools
 
+from setuptools_rust import Binding, RustExtension
+
 
 # Grab the readme so that our package stays in sync with github.
 this_directory: str = path.abspath(path.dirname(__file__))
@@ -50,6 +52,13 @@ def no_local_scheme(version: str) -> str:
             if "=" in dep
         ],
     },
+    rust_extensions=[
+        RustExtension(
+            "libcst.native",
+            path="native/libcst/Cargo.toml",
+            binding=Binding.PyO3,
+        )
+    ],
     classifiers=[
         "License :: OSI Approved :: MIT License",
         "Topic :: Software Development :: Libraries",
diff --git a/stubs/libcst_native/parser_config.pyi b/stubs/libcst_native/parser_config.pyi
new file mode 100644
index 000000000..0165df79e
--- /dev/null
+++ b/stubs/libcst_native/parser_config.pyi
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, FrozenSet, Mapping, Sequence
+
+from libcst._parser.parso.utils import PythonVersionInfo
+
+class BaseWhitespaceParserConfig:
+    def __new__(
+        cls,
+        *,
+        lines: Sequence[str],
+        default_newline: str,
+    ) -> BaseWhitespaceParserConfig: ...
+
+    lines: Sequence[str]
+    default_newline: str
+
+
+class ParserConfig(BaseWhitespaceParserConfig):
+    def __new__(
+        cls,
+        *,
+        lines: Sequence[str],
+        encoding: str,
+        default_indent: str,
+        default_newline: str,
+        has_trailing_newline: bool,
+        version: PythonVersionInfo,
+        future_imports: FrozenSet[str],
+    ) -> BaseWhitespaceParserConfig: ...
+
+    # lines is inherited
+    encoding: str
+    default_indent: str
+    # default_newline is inherited
+    has_trailing_newline: bool
+    version: PythonVersionInfo
+    future_imports: FrozenSet[str]
+
+
+def parser_config_asdict(config: ParserConfig) -> Mapping[str, Any]: ...
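For orientation, a minimal sketch (not part of the diff) of constructing the native ParserConfig described by the stub above. It assumes the extension is importable as `libcst_native`, as the stubs/libcst_native/ layout suggests, and every literal value is purely illustrative:

    # Hedged sketch: all arguments are keyword-only per parser_config.pyi above.
    from libcst._parser.parso.utils import PythonVersionInfo
    from libcst_native.parser_config import ParserConfig, parser_config_asdict

    config = ParserConfig(
        lines=["pass\n", ""],             # the source, pre-split into lines
        encoding="utf-8",
        default_indent="    ",
        default_newline="\n",
        has_trailing_newline=True,
        version=PythonVersionInfo(3, 8),  # a (major, minor) named tuple from parso
        future_imports=frozenset(),
    )
    assert parser_config_asdict(config)["encoding"] == "utf-8"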
diff --git a/stubs/libcst_native/token_type.pyi b/stubs/libcst_native/token_type.pyi new file mode 100644 index 000000000..a0dd81790 --- /dev/null +++ b/stubs/libcst_native/token_type.pyi @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +class TokenType: + name: str + contains_syntax: bool + +STRING: TokenType = ... +NAME: TokenType = ... +NUMBER: TokenType = ... +OP: TokenType = ... +NEWLINE: TokenType = ... +INDENT: TokenType = ... +DEDENT: TokenType = ... +ASYNC: TokenType = ... +AWAIT: TokenType = ... +FSTRING_START: TokenType = ... +FSTRING_STRING: TokenType = ... +FSTRING_END: TokenType = ... +ENDMARKER: TokenType = ... +# unused dummy tokens for backwards compat with the parso tokenizer +ERRORTOKEN: TokenType = ... +ERROR_DEDENT: TokenType = ... diff --git a/stubs/libcst_native/tokenize.pyi b/stubs/libcst_native/tokenize.pyi new file mode 100644 index 000000000..bbcbeab01 --- /dev/null +++ b/stubs/libcst_native/tokenize.pyi @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Iterator, Optional, Tuple + +from libcst_native import token_type, whitespace_state + +class Token: + def __new__( + cls, + type: token_type.TokenType, + string: str, + start_pos: Tuple[int, int], + end_pos: Tuple[int, int], + whitespace_before: whitespace_state.WhitespaceState, + whitespace_after: whitespace_state.WhitespaceState, + relative_indent: Optional[str], + ) -> Token: ... + type: token_type.TokenType + string: str + start_pos: Tuple[int, int] + end_pos: Tuple[int, int] + whitespace_before: whitespace_state.WhitespaceState + whitespace_after: whitespace_state.WhitespaceState + relative_indent: Optional[str] + +def tokenize(text: str) -> Iterator[Token]: ... diff --git a/stubs/libcst_native/whitespace_parser.pyi b/stubs/libcst_native/whitespace_parser.pyi new file mode 100644 index 000000000..291f6dc6f --- /dev/null +++ b/stubs/libcst_native/whitespace_parser.pyi @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Sequence, Union + +from libcst._nodes.whitespace import ( + EmptyLine, + Newline, + ParenthesizedWhitespace, + SimpleWhitespace, + TrailingWhitespace, +) +from libcst._parser.types.config import BaseWhitespaceParserConfig as Config +from libcst._parser.types.whitespace_state import WhitespaceState as State + +def parse_simple_whitespace(config: Config, state: State) -> SimpleWhitespace: ... +def parse_empty_lines( + config: Config, + state: State, + *, + override_absolute_indent: Optional[str] = None, +) -> Sequence[EmptyLine]: ... +def parse_trailing_whitespace(config: Config, state: State) -> TrailingWhitespace: ... +def parse_parenthesizable_whitespace( + config: Config, state: State +) -> Union[SimpleWhitespace, ParenthesizedWhitespace]: ... diff --git a/stubs/libcst_native/whitespace_state.pyi b/stubs/libcst_native/whitespace_state.pyi new file mode 100644 index 000000000..da43bd542 --- /dev/null +++ b/stubs/libcst_native/whitespace_state.pyi @@ -0,0 +1,15 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +class WhitespaceState: + def __new__( + cls, line: int, column: int, absolute_indent: str, is_parenthesized: bool + ) -> WhitespaceState: ... + + line: int # one-indexed (to match parso's behavior) + column: int # zero-indexed (to match parso's behavior) + # What to look for when executing `_parse_indent`. + absolute_indent: str + is_parenthesized: bool
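Read together, the tokenize and whitespace stubs describe how the native tokenizer is driven from Python: tokenize() yields Token objects whose whitespace_before/whitespace_after fields carry the WhitespaceState values that the whitespace_parser functions later consume. A rough usage sketch, under the same libcst_native importability assumption as above; the input string is illustrative only:

    # Hedged sketch: field names follow tokenize.pyi.
    from libcst_native import tokenize

    for tok in tokenize.tokenize("x = 1  # neat\n"):
        # start_pos/end_pos are (line, column) pairs; per whitespace_state.pyi,
        # lines are one-indexed and columns zero-indexed, matching parso.
        print(tok.type.name, repr(tok.string), tok.start_pos, tok.end_pos)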