From 410f1579598d887c919a62661a7c77b49d70b5e4 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Tue, 24 Oct 2023 20:13:57 -0500 Subject: [PATCH 1/4] perf(parser): use memchr for lexing comments We now use memchr on its SIMD-supported targets to improve the performance of lexing comments. --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 16 ++++++++++++++++ crates/ruff_python_parser/src/lexer/cursor.rs | 17 +++++++++++++++++ 4 files changed, 35 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index e4e5a1a51c02c..b5daeb5aba9b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2381,6 +2381,7 @@ dependencies = [ "itertools 0.11.0", "lalrpop", "lalrpop-util", + "memchr", "ruff_python_ast", "ruff_text_size", "rustc-hash", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index a5d4208623392..46a97eabd43c5 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -22,6 +22,7 @@ bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { version = "0.20.0", default-features = false } +memchr = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } rustc-hash = { workspace = true } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 448a3e7b34681..d66dbb9534fdf 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -407,6 +407,22 @@ impl<'source> Lexer<'source> { #[cfg(debug_assertions)] debug_assert_eq!(self.cursor.previous(), '#'); + #[cfg(any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "wasm32" + ))] + { + let bytes = self.cursor.rest().as_bytes(); + let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); + self.cursor.skip_bytes(offset); + } + + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "wasm32" + )))] self.cursor.eat_while(|c| !matches!(c, '\n' | '\r')); Tok::Comment(self.token_text().to_string()) diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index c026c88e9b7fb..91c7d30c53b05 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -127,4 +127,21 @@ impl<'a> Cursor<'a> { self.bump(); } } + + /// Skips the next `count` bytes. + /// + /// ## Panics + /// - If `count` is larger than the remaining bytes in the input stream. + /// - If `count` indexes into a multi-byte character. + pub(super) fn skip_bytes(&mut self, count: usize) { + #[cfg(debug_assertions)] + { + self.prev_char = self.chars.as_str()[..count] + .chars() + .next_back() + .unwrap_or('\0'); + } + + self.chars = self.chars.as_str()[count..].chars(); + } } From 5981046bc10b9070715bed2061a27073e5d2c042 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Tue, 24 Oct 2023 23:35:16 -0500 Subject: [PATCH 2/4] always use memchr --- crates/ruff_python_parser/src/lexer.rs | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index d66dbb9534fdf..b4f3436d5aeab 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -407,23 +407,9 @@ impl<'source> Lexer<'source> { #[cfg(debug_assertions)] debug_assert_eq!(self.cursor.previous(), '#'); - #[cfg(any( - target_arch = "x86_64", - target_arch = "aarch64", - target_arch = "wasm32" - ))] - { - let bytes = self.cursor.rest().as_bytes(); - let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); - self.cursor.skip_bytes(offset); - } - - #[cfg(not(any( - target_arch = "x86_64", - target_arch = "aarch64", - target_arch = "wasm32" - )))] - self.cursor.eat_while(|c| !matches!(c, '\n' | '\r')); + let bytes = self.cursor.rest().as_bytes(); + let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); + self.cursor.skip_bytes(offset); Tok::Comment(self.token_text().to_string()) } From 0578954544a5ab23d5d8ae39fd899b67c1120d02 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Wed, 25 Oct 2023 09:56:06 -0500 Subject: [PATCH 3/4] perf(parser): try using compact_str for comments This might improve performance here because we do not compare comments to values that often. Therefore, the memory locality might improve performance in certain cases as well. --- Cargo.lock | 23 +++++++++++++++++++++++ Cargo.toml | 8 +++++++- crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 2 +- crates/ruff_python_parser/src/token.rs | 3 ++- 5 files changed, 34 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5daeb5aba9b5..9a446e7e9c259 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,6 +246,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.0.83" @@ -444,6 +453,19 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "compact_str" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "ryu", + "static_assertions", +] + [[package]] name = "configparser" version = "3.0.2" @@ -2376,6 +2398,7 @@ version = "0.0.0" dependencies = [ "anyhow", "bitflags 2.4.0", + "compact_str", "insta", "is-macro", "itertools 0.11.0", diff --git a/Cargo.toml b/Cargo.toml index 80630fc0452f5..dbefd7c4f6217 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ bitflags = { version = "2.3.1" } chrono = { version = "0.4.31", default-features = false, features = ["clock"] } clap = { version = "4.4.6", features = ["derive"] } colored = { version = "2.0.0" } +compact_str = { version = "0.7.1" } filetime = { version = "0.2.20" } glob = { version = "0.3.1" } globset = { version = "0.4.10" } @@ -52,7 +53,12 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } unicode-ident = { version = "1.0.12" } unicode_names2 = { version = "1.2.0" } unicode-width = { version = "0.1.11" } -uuid = { version = "1.4.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] } +uuid = { version = "1.4.1", features = [ + "v4", + "fast-rng", + "macro-diagnostics", + "js", +] } wsl = { version = "0.1.0" } [profile.release] diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 46a97eabd43c5..8fb400333a8d5 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -18,6 +18,7 @@ ruff_python_ast = { path = "../ruff_python_ast" } ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } +compact_str = { workspace = true } bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index b4f3436d5aeab..06fdaf799f62f 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -411,7 +411,7 @@ impl<'source> Lexer<'source> { let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); self.cursor.skip_bytes(offset); - Tok::Comment(self.token_text().to_string()) + Tok::Comment(self.token_text().into()) } /// Lex a single IPython escape command. diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index ac441395fffc2..604907505caf5 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -6,6 +6,7 @@ //! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h; use crate::Mode; +use compact_str::CompactString; use ruff_python_ast::{Int, IpyEscapeKind}; use ruff_text_size::TextSize; use std::fmt; @@ -66,7 +67,7 @@ pub enum Tok { kind: IpyEscapeKind, }, /// Token value for a comment. These are filtered out of the token stream prior to parsing. - Comment(String), + Comment(CompactString), /// Token value for a newline. Newline, /// Token value for a newline that is not a logical line break. These are filtered out of From 0d3717f683d11f75f831bbabd73f3803f1d89749 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Wed, 25 Oct 2023 10:16:10 -0500 Subject: [PATCH 4/4] Revert "perf(parser): try using compact_str for comments" Using compact_str does not seem to be faster for us :( --- Cargo.lock | 23 ----------------------- Cargo.toml | 8 +------- crates/ruff_python_parser/Cargo.toml | 1 - crates/ruff_python_parser/src/lexer.rs | 2 +- crates/ruff_python_parser/src/token.rs | 3 +-- 5 files changed, 3 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9a446e7e9c259..b5daeb5aba9b5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -246,15 +246,6 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" -[[package]] -name = "castaway" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a17ed5635fc8536268e5d4de1e22e81ac34419e5f052d4d51f4e01dcc263fcc" -dependencies = [ - "rustversion", -] - [[package]] name = "cc" version = "1.0.83" @@ -453,19 +444,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "compact_str" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f86b9c4c00838774a6d902ef931eff7470720c51d90c2e32cfe15dc304737b3f" -dependencies = [ - "castaway", - "cfg-if", - "itoa", - "ryu", - "static_assertions", -] - [[package]] name = "configparser" version = "3.0.2" @@ -2398,7 +2376,6 @@ version = "0.0.0" dependencies = [ "anyhow", "bitflags 2.4.0", - "compact_str", "insta", "is-macro", "itertools 0.11.0", diff --git a/Cargo.toml b/Cargo.toml index dbefd7c4f6217..80630fc0452f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,6 @@ bitflags = { version = "2.3.1" } chrono = { version = "0.4.31", default-features = false, features = ["clock"] } clap = { version = "4.4.6", features = ["derive"] } colored = { version = "2.0.0" } -compact_str = { version = "0.7.1" } filetime = { version = "0.2.20" } glob = { version = "0.3.1" } globset = { version = "0.4.10" } @@ -53,12 +52,7 @@ tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } unicode-ident = { version = "1.0.12" } unicode_names2 = { version = "1.2.0" } unicode-width = { version = "0.1.11" } -uuid = { version = "1.4.1", features = [ - "v4", - "fast-rng", - "macro-diagnostics", - "js", -] } +uuid = { version = "1.4.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] } wsl = { version = "0.1.0" } [profile.release] diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 8fb400333a8d5..46a97eabd43c5 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -18,7 +18,6 @@ ruff_python_ast = { path = "../ruff_python_ast" } ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } -compact_str = { workspace = true } bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 06fdaf799f62f..b4f3436d5aeab 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -411,7 +411,7 @@ impl<'source> Lexer<'source> { let offset = memchr::memchr2(b'\n', b'\r', bytes).unwrap_or(bytes.len()); self.cursor.skip_bytes(offset); - Tok::Comment(self.token_text().into()) + Tok::Comment(self.token_text().to_string()) } /// Lex a single IPython escape command. diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index 604907505caf5..ac441395fffc2 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -6,7 +6,6 @@ //! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h; use crate::Mode; -use compact_str::CompactString; use ruff_python_ast::{Int, IpyEscapeKind}; use ruff_text_size::TextSize; use std::fmt; @@ -67,7 +66,7 @@ pub enum Tok { kind: IpyEscapeKind, }, /// Token value for a comment. These are filtered out of the token stream prior to parsing. - Comment(CompactString), + Comment(String), /// Token value for a newline. Newline, /// Token value for a newline that is not a logical line break. These are filtered out of