Skip to content

Commit

Permalink
Implement a Python PEG parser in Rust (#566)
Browse files Browse the repository at this point in the history
This massive PR implements an alternative Python parser that will allow LibCST to parse Python 3.10's new grammar features. The parser is implemented in Rust, but it's turned off by default; it is selected through the `LIBCST_PARSER_TYPE` environment variable, which must be set to `native` to enable it. The PR also enables new CI steps that test just the Rust parser, as well as steps that produce binary wheels for a variety of CPython versions and platforms.

Note: this PR aims to be roughly feature-equivalent to the main branch, so it doesn't include new 3.10 syntax features. That will be addressed as a follow-up PR.

The new parser is implemented in the `native/` directory, and is organized into two Rust crates: `libcst_derive` contains some macros to facilitate various features of CST nodes, and `libcst` contains the `parser` itself (including the Python grammar), a `tokenizer` implementation by @bgw, and a very basic representation of CST `nodes`. Parsing is done by:
1. **tokenizing** the input UTF-8 string (bytes are not supported at the Rust layer; they are converted to UTF-8 strings by the Python wrapper)
2. running the **PEG parser** on the tokenized input, which also captures certain anchor tokens in the resulting syntax tree
3. using the anchor tokens to **inflate** the syntax tree into a proper CST

Co-authored-by: Benjamin Woodruff <github@benjam.info>
  • Loading branch information
zsol and bgw authored Dec 21, 2021
1 parent 9d611f9 commit c02de9b
Show file tree
Hide file tree
Showing 120 changed files with 17,117 additions and 477 deletions.
11 changes: 11 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[target.x86_64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]

[target.aarch64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]
6 changes: 5 additions & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
root = true

[*.{py,pyi,toml,md}]
[*.{py,pyi,rs,toml,md}]
charset = "utf-8"
end_of_line = lf
indent_size = 4
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 88

[*.rs]
# https://github.com/rust-dev-tools/fmt-rfcs/blob/master/guide/guide.md
max_line_length = 100
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ exclude =
.pyre,
__pycache__,
.tox,
native,

max-complexity = 12

3 changes: 3 additions & 0 deletions .github/workflows/.pyre_configuration.template
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
{
"exclude": [
".*\/native\/.*"
],
"source_directories": [
"."
],
Expand Down
95 changes: 89 additions & 6 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ jobs:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: [3.6, 3.7, 3.8, 3.9]
parser: [pure, native]
steps:
- uses: actions/checkout@v1
- uses: actions/setup-python@v2
Expand All @@ -48,8 +49,14 @@ jobs:
- name: Validate Dependencies
if: steps.cache.outputs.cache-hit != 'true'
run: exit 1
- if: ${{ matrix.parser == 'native' }}
uses: actions-rs/toolchain@v1
with:
toolchain: stable
- run: >-
echo LIBCST_PARSER_TYPE=${{ matrix.parser }} >> $GITHUB_ENV
- name: Run Tests
run: python -m unittest
run: python setup.py test

# Run linters
lint:
Expand Down Expand Up @@ -166,10 +173,37 @@ jobs:

# Build python package
build:
name: Build wheels on ${{ matrix.os }}/${{ matrix.vers }}
needs: setup
runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
include:
- vers: i686
os: ubuntu-20.04
# aarch64 seems to be stuck
# - vers: aarch64
# os: ubuntu-20.04
- vers: auto64
os: ubuntu-20.04
- vers: arm64
os: macos-10.15
- vers: auto64
os: macos-10.15
- vers: auto64
os: windows-2019
env:
SCCACHE_VERSION: 0.2.13
CIBW_BEFORE_ALL_LINUX: "curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain stable -y"
CIBW_BEFORE_ALL_MACOS: "rustup target add aarch64-apple-darwin x86_64-apple-darwin"
CIBW_BEFORE_ALL_WINDOWS: "rustup target add x86_64-pc-windows-msvc i686-pc-windows-msvc"
CIBW_ENVIRONMENT: 'PATH="$PATH:$HOME/.cargo/bin"'
CIBW_SKIP: "cp27-* cp34-* cp35-* pp* *-win32 *-win_arm64 *-musllinux_*"
CIBW_ARCHS: ${{ matrix.vers }}
CIBW_BUILD_VERBOSITY: 1
steps:
- uses: actions/checkout@v1
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
Expand All @@ -185,17 +219,66 @@ jobs:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
run: >-
echo LIBCST_NO_LOCAL_SCHEME=1 >> $GITHUB_ENV
- name: Build a binary wheel and a source tarball
- name: Build wheels
uses: pypa/cibuildwheel@v2.3.1
- uses: actions/upload-artifact@v2
with:
path: wheelhouse/*.whl
- name: Build a source tarball
run: >-
python -m
build
--sdist
--wheel
--outdir dist/
--outdir wheelhouse/
- if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
name: Publish distribution 📦 to Test PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
repository_url: https://test.pypi.org/legacy/
packages_dir: wheelhouse/

# Test rust parts
native:
name: Rust unit tests
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
components: rustfmt, clippy
- uses: actions/setup-python@v2
with:
python-version: 3.9
- name: test
uses: actions-rs/cargo@v1
with:
command: test
args: --manifest-path=native/Cargo.toml
- name: clippy
uses: actions-rs/clippy-check@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
args: --manifest-path=native/Cargo.toml --all-features

rustfmt:
name: Rustfmt
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- run: rustup component add rustfmt
- uses: actions-rs/cargo@v1
with:
command: fmt
args: --all --manifest-path=native/Cargo.toml -- --check
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ libcst/_version.py
.hypothesis/
.pyre_configuration
.python-version
target/
6 changes: 4 additions & 2 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
All contributions towards LibCST are MIT licensed.

Some Python files have been taken from the standard library and are therefore
Some Python files have been derived from the standard library and are therefore
PSF licensed. Modifications on these files are dual licensed (both MIT and
PSF). These files are:

- libcst/_parser/base_parser.py
- libcst/_parser/parso/utils.py
- libcst/_parser/parso/pgen2/generator.py
- libcst/_parser/parso/pgen2/grammar_parser.py
- libcst/_parser/parso/python/token.py
- libcst/_parser/parso/python/py_token.py
- libcst/_parser/parso/python/tokenize.py
- libcst/_parser/parso/tests/test_fstring.py
- libcst/_parser/parso/tests/test_tokenize.py
- libcst/_parser/parso/tests/test_utils.py
- libcst_native/src/tokenize/core/mod.rs
- libcst_native/src/tokenize/core/string_types.rs

Some Python files have been taken from dataclasses and are therefore Apache
licensed. Modifications on these files are licensed under Apache 2.0 license.
Expand Down
3 changes: 3 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
include README.rst LICENSE CODE_OF_CONDUCT.md CONTRIBUTING.md requirements.txt requirements-dev.txt docs/source/*.rst libcst/py.typed

include native/Cargo.toml
recursive-include native *
3 changes: 3 additions & 0 deletions libcst/_nodes/tests/test_atom.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import libcst as cst
from libcst import parse_expression
from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as
from libcst._parser.entrypoints import is_native
from libcst.metadata import CodeRange
from libcst.testing.utils import data_provider

Expand Down Expand Up @@ -1120,6 +1121,8 @@ def test_invalid(self, **kwargs: Any) -> None:
)
)
def test_versions(self, **kwargs: Any) -> None:
if is_native() and not kwargs.get("expect_success", True):
self.skipTest("parse errors are disabled for native parser")
self.assert_parses(**kwargs)


Expand Down
3 changes: 3 additions & 0 deletions libcst/_nodes/tests/test_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import libcst as cst
from libcst import parse_expression
from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as
from libcst._parser.entrypoints import is_native
from libcst.metadata import CodeRange
from libcst.testing.utils import data_provider

Expand Down Expand Up @@ -187,4 +188,6 @@ def test_invalid(self, **kwargs: Any) -> None:
)
)
def test_versions(self, **kwargs: Any) -> None:
if is_native() and not kwargs.get("expect_success", True):
self.skipTest("parse errors are disabled for native parser")
self.assert_parses(**kwargs)
3 changes: 3 additions & 0 deletions libcst/_nodes/tests/test_funcdef.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import libcst as cst
from libcst import parse_statement
from libcst._nodes.tests.base import CSTNodeTest, DummyIndentedBlock, parse_statement_as
from libcst._parser.entrypoints import is_native
from libcst.metadata import CodeRange
from libcst.testing.utils import data_provider

Expand Down Expand Up @@ -2041,4 +2042,6 @@ def test_valid_38(self, node: cst.CSTNode, code: str) -> None:
)
)
def test_versions(self, **kwargs: Any) -> None:
if is_native() and not kwargs.get("expect_success", True):
self.skipTest("parse errors are disabled for native parser")
self.assert_parses(**kwargs)
4 changes: 3 additions & 1 deletion libcst/_nodes/tests/test_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,8 +617,10 @@ class ImportFromParseTest(CSTNodeTest):
),
cst.ImportAlias(cst.Name("baz"), comma=cst.Comma()),
),
lpar=cst.LeftParen(),
rpar=cst.RightParen(),
),
"code": "from foo import bar, baz,",
"code": "from foo import (bar, baz,)",
},
# Star import statement
{
Expand Down
3 changes: 3 additions & 0 deletions libcst/_nodes/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import libcst as cst
from libcst import parse_expression, parse_statement
from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as
from libcst._parser.entrypoints import is_native
from libcst.metadata import CodeRange
from libcst.testing.utils import data_provider

Expand Down Expand Up @@ -126,4 +127,6 @@ def test_invalid(
)
)
def test_versions(self, **kwargs: Any) -> None:
if is_native() and not kwargs.get("expect_success", True):
self.skipTest("parse errors are disabled for native parser")
self.assert_parses(**kwargs)
3 changes: 3 additions & 0 deletions libcst/_nodes/tests/test_matrix_multiply.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
parse_expression_as,
parse_statement_as,
)
from libcst._parser.entrypoints import is_native
from libcst.testing.utils import data_provider


Expand Down Expand Up @@ -69,4 +70,6 @@ def test_valid(self, **kwargs: Any) -> None:
)
)
def test_versions(self, **kwargs: Any) -> None:
if is_native() and not kwargs.get("expect_success", True):
self.skipTest("parse errors are disabled for native parser")
self.assert_parses(**kwargs)
8 changes: 7 additions & 1 deletion libcst/_nodes/tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import libcst as cst
from libcst import parse_module, parse_statement
from libcst._nodes.tests.base import CSTNodeTest
from libcst._parser.entrypoints import is_native
from libcst.metadata import CodeRange, MetadataWrapper, PositionProvider
from libcst.testing.utils import data_provider

Expand Down Expand Up @@ -83,6 +84,7 @@ def test_code_for_node(
"empty_program_with_newline": {
"code": "\n",
"expected": cst.Module([], has_trailing_newline=True),
"enabled_for_native": False,
},
"empty_program_with_comments": {
"code": "# some comment\n",
Expand Down Expand Up @@ -112,7 +114,11 @@ def test_code_for_node(
},
}
)
def test_parser(self, *, code: str, expected: cst.Module) -> None:
def test_parser(
self, *, code: str, expected: cst.Module, enabled_for_native: bool = True
) -> None:
if is_native() and not enabled_for_native:
self.skipTest("Disabled for native parser")
self.assertEqual(parse_module(code), expected)

@data_provider(
Expand Down
3 changes: 3 additions & 0 deletions libcst/_nodes/tests/test_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import libcst as cst
from libcst import parse_expression
from libcst._nodes.tests.base import CSTNodeTest, parse_expression_as
from libcst._parser.entrypoints import is_native
from libcst.testing.utils import data_provider


Expand Down Expand Up @@ -133,4 +134,6 @@ def test_invalid(
)
)
def test_versions(self, **kwargs: Any) -> None:
if is_native() and not kwargs.get("expect_success", True):
self.skipTest("parse errors are disabled for native parser")
self.assert_parses(**kwargs)
Loading

0 comments on commit c02de9b

Please sign in to comment.