diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index ee6cf94e0501c..035110f78ed67 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -10,94 +10,6 @@ pub struct Locator<'a> { index: OnceCell, } -pub enum Index { - Ascii(Vec), - Utf8(Vec>), -} - -/// Compute the starting byte index of each line in ASCII source code. -fn index_ascii(contents: &str) -> Vec { - let mut index = Vec::with_capacity(48); - index.push(0); - let bytes = contents.as_bytes(); - for (i, byte) in bytes.iter().enumerate() { - if *byte == b'\n' { - index.push(i + 1); - } - } - index -} - -/// Compute the starting byte index of each character in UTF-8 source code. -fn index_utf8(contents: &str) -> Vec> { - let mut index = Vec::with_capacity(48); - let mut current_row = Vec::with_capacity(48); - let mut current_byte_offset = 0; - let mut previous_char = '\0'; - for char in contents.chars() { - // Skip BOM. - if previous_char == '\0' && char == '\u{feff}' { - current_byte_offset += char.len_utf8(); - continue; - } - - current_row.push(current_byte_offset); - if char == '\n' { - if previous_char == '\r' { - current_row.pop(); - } - index.push(current_row); - current_row = Vec::with_capacity(48); - } - current_byte_offset += char.len_utf8(); - previous_char = char; - } - index.push(current_row); - index -} - -/// Compute the starting byte index of each line in source code. -pub fn index(contents: &str) -> Index { - if contents.is_ascii() { - Index::Ascii(index_ascii(contents)) - } else { - Index::Utf8(index_utf8(contents)) - } -} - -/// Truncate a [`Location`] to a byte offset in ASCII source code. -fn truncate_ascii(location: Location, index: &[usize], contents: &str) -> usize { - if location.row() - 1 == index.len() && location.column() == 0 - || (!index.is_empty() - && location.row() - 1 == index.len() - 1 - && index[location.row() - 1] + location.column() >= contents.len()) - { - contents.len() - } else { - index[location.row() - 1] + location.column() - } -} - -/// Truncate a [`Location`] to a byte offset in UTF-8 source code. -fn truncate_utf8(location: Location, index: &[Vec], contents: &str) -> usize { - if (location.row() - 1 == index.len() && location.column() == 0) - || (location.row() - 1 == index.len() - 1 - && location.column() == index[location.row() - 1].len()) - { - contents.len() - } else { - index[location.row() - 1][location.column()] - } -} - -/// Truncate a [`Location`] to a byte offset in source code. -fn truncate(location: Location, index: &Index, contents: &str) -> usize { - match index { - Index::Ascii(index) => truncate_ascii(location, index, contents), - Index::Utf8(index) => truncate_utf8(location, index, contents), - } -} - impl<'a> Locator<'a> { pub const fn new(contents: &'a str) -> Self { Self { @@ -107,20 +19,20 @@ impl<'a> Locator<'a> { } fn get_or_init_index(&self) -> &Index { - self.index.get_or_init(|| index(self.contents)) + self.index.get_or_init(|| Index::from(self.contents)) } /// Take the source code up to the given [`Location`]. pub fn take(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); - let offset = truncate(location, index, self.contents); + let offset = index.byte_offset(location, self.contents); &self.contents[..offset] } /// Take the source code after the given [`Location`]. pub fn skip(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); - let offset = truncate(location, index, self.contents); + let offset = index.byte_offset(location, self.contents); &self.contents[offset..] } @@ -128,15 +40,15 @@ impl<'a> Locator<'a> { pub fn slice>(&self, range: R) -> &'a str { let index = self.get_or_init_index(); let range = range.into(); - let start = truncate(range.location, index, self.contents); - let end = truncate(range.end_location, index, self.contents); + let start = index.byte_offset(range.location, self.contents); + let end = index.byte_offset(range.end_location, self.contents); &self.contents[start..end] } /// Return the byte offset of the given [`Location`]. pub fn offset(&self, location: Location) -> usize { let index = self.get_or_init_index(); - truncate(location, index, self.contents) + index.byte_offset(location, self.contents) } /// Return the underlying source code. @@ -153,116 +65,313 @@ impl<'a> Locator<'a> { } } +/// Index for fast [`Location`] to byte offset conversions. +#[derive(Debug, Clone)] +enum Index { + /// Optimized index for an ASCII only document + Ascii(AsciiIndex), + + /// Index for UTF8 documents + Utf8(Utf8Index), +} + +impl Index { + /// Truncate a [`Location`] to a byte offset in source code. + fn byte_offset(&self, location: Location, contents: &str) -> usize { + match self { + Index::Ascii(ascii) => ascii.byte_offset(location, contents), + Index::Utf8(utf8) => utf8.byte_offset(location, contents), + } + } +} + +impl From<&str> for Index { + fn from(contents: &str) -> Self { + assert!(u32::try_from(contents.len()).is_ok()); + + let mut line_start_offsets: Vec = Vec::with_capacity(48); + line_start_offsets.push(0); + + // SAFE because of length assertion above + #[allow(clippy::cast_possible_truncation)] + for (i, byte) in contents.bytes().enumerate() { + if !byte.is_ascii() { + return Self::Utf8(continue_utf8_index(&contents[i..], i, line_start_offsets)); + } + + match byte { + // Only track one line break for `\r\n`. + b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue, + b'\n' | b'\r' => { + line_start_offsets.push((i + 1) as u32); + } + _ => {} + } + } + + Self::Ascii(AsciiIndex::new(line_start_offsets)) + } +} + +// SAFE because of length assertion in `Index::from(&str)` +#[allow(clippy::cast_possible_truncation)] +fn continue_utf8_index( + non_ascii_part: &str, + offset: usize, + line_start_offsets: Vec, +) -> Utf8Index { + let mut lines = line_start_offsets; + + for (position, char) in non_ascii_part.char_indices() { + match char { + // Only track `\n` for `\r\n` + '\r' if non_ascii_part.as_bytes().get(position + 1) == Some(&b'\n') => continue, + '\r' | '\n' => { + let absolute_offset = offset + position + 1; + lines.push(absolute_offset as u32); + } + _ => {} + } + } + + Utf8Index::new(lines) +} + +/// Index for fast [`Location`] to byte offset conversions for ASCII documents. +/// +/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`] +/// by retrieving the line offset from its index and adding the column. +#[derive(Debug, Clone, Eq, PartialEq)] +struct AsciiIndex { + line_start_byte_offsets: Vec, +} + +impl AsciiIndex { + fn new(line_start_positions: Vec) -> Self { + Self { + line_start_byte_offsets: line_start_positions, + } + } + + /// Truncate a [`Location`] to a byte offset in ASCII source code. + fn byte_offset(&self, location: Location, contents: &str) -> usize { + let index = &self.line_start_byte_offsets; + + // If start-of-line position after last line + if location.row() - 1 == index.len() && location.column() == 0 { + contents.len() + } else { + let byte_offset = index[location.row() - 1] as usize + location.column(); + byte_offset.min(contents.len()) + } + } +} + +/// Index for fast [`Location`] to byte offset conversions for UTF8 documents. +/// +/// The index stores the byte offset of every line. The column offset is lazily computed by +/// adding the line start offset and then iterating to the `nth` character. +#[derive(Debug, Clone, PartialEq)] +struct Utf8Index { + line_start_byte_offsets: Vec, +} + +impl Utf8Index { + fn new(line_byte_positions: Vec) -> Self { + Self { + line_start_byte_offsets: line_byte_positions, + } + } + + /// Truncate a [`Location`] to a byte offset in UTF-8 source code. + fn byte_offset(&self, location: Location, contents: &str) -> usize { + let index = &self.line_start_byte_offsets; + + if location.row() - 1 == index.len() && location.column() == 0 { + contents.len() + } else { + // Casting is safe because the length of utf8 characters is always between 1-4 + #[allow(clippy::cast_possible_truncation)] + let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') { + '\u{feff}'.len_utf8() as u32 + } else { + index[location.row() - 1] + }; + + let rest = &contents[line_start as usize..]; + + let column_offset = match rest.char_indices().nth(location.column()) { + Some((offset, _)) => offset, + None => contents.len(), + }; + + let offset = line_start as usize + column_offset; + offset.min(contents.len()) + } + } +} + #[cfg(test)] mod tests { + use crate::source_code::locator::{AsciiIndex, Index, Utf8Index}; use rustpython_parser::ast::Location; - use super::{index_ascii, index_utf8, truncate_ascii, truncate_utf8}; + fn index_ascii(content: &str) -> AsciiIndex { + match Index::from(content) { + Index::Ascii(ascii) => ascii, + Index::Utf8(_) => { + panic!("Expected ASCII index") + } + } + } + + fn index_utf8(content: &str) -> Utf8Index { + match Index::from(content) { + Index::Utf8(utf8) => utf8, + Index::Ascii(_) => { + panic!("Expected UTF8 index") + } + } + } #[test] fn ascii_index() { let contents = ""; let index = index_ascii(contents); - assert_eq!(index, [0]); + assert_eq!(index, AsciiIndex::new(vec![0])); let contents = "x = 1"; let index = index_ascii(contents); - assert_eq!(index, [0]); + assert_eq!(index, AsciiIndex::new(vec![0])); let contents = "x = 1\n"; let index = index_ascii(contents); - assert_eq!(index, [0, 6]); - - let contents = "x = 1\r\n"; - let index = index_ascii(contents); - assert_eq!(index, [0, 7]); + assert_eq!(index, AsciiIndex::new(vec![0, 6])); let contents = "x = 1\ny = 2\nz = x + y\n"; let index = index_ascii(contents); - assert_eq!(index, [0, 6, 12, 22]); + assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22])); } #[test] - fn ascii_truncate() { + fn ascii_byte_offset() { let contents = "x = 1\ny = 2"; let index = index_ascii(contents); // First row. - let loc = truncate_ascii(Location::new(1, 0), &index, contents); + let loc = index.byte_offset(Location::new(1, 0), contents); assert_eq!(loc, 0); // Second row. - let loc = truncate_ascii(Location::new(2, 0), &index, contents); + let loc = index.byte_offset(Location::new(2, 0), contents); assert_eq!(loc, 6); // One-past-the-end. - let loc = truncate_ascii(Location::new(3, 0), &index, contents); + let loc = index.byte_offset(Location::new(3, 0), contents); assert_eq!(loc, 11); } + #[test] + fn ascii_carriage_return() { + let contents = "x = 4\ry = 3"; + let index = index_ascii(contents); + assert_eq!(index, AsciiIndex::new(vec![0, 6])); + + assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7); + } + + #[test] + fn ascii_carriage_return_newline() { + let contents = "x = 4\r\ny = 3"; + let index = index_ascii(contents); + assert_eq!(index, AsciiIndex::new(vec![0, 7])); + + assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8); + } + + impl Utf8Index { + fn line_count(&self) -> usize { + self.line_start_byte_offsets.len() + } + } + #[test] fn utf8_index() { - let contents = ""; + let contents = "x = '🫣'"; let index = index_utf8(contents); - assert_eq!(index.len(), 1); - assert_eq!(index[0], Vec::::new()); + assert_eq!(index.line_count(), 1); + assert_eq!(index, Utf8Index::new(vec![0])); - let contents = "x = 1"; + let contents = "x = '🫣'\n"; let index = index_utf8(contents); - assert_eq!(index.len(), 1); - assert_eq!(index[0], [0, 1, 2, 3, 4]); + assert_eq!(index.line_count(), 2); + assert_eq!(index, Utf8Index::new(vec![0, 11])); - let contents = "x = 1\n"; + let contents = "x = '🫣'\ny = 2\nz = x + y\n"; let index = index_utf8(contents); - assert_eq!(index.len(), 2); - assert_eq!(index[0], [0, 1, 2, 3, 4, 5]); - assert_eq!(index[1], Vec::::new()); + assert_eq!(index.line_count(), 4); + assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27])); - let contents = "x = 1\r\n"; + let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\""; let index = index_utf8(contents); - assert_eq!(index.len(), 2); - assert_eq!(index[0], [0, 1, 2, 3, 4, 5]); - assert_eq!(index[1], Vec::::new()); + assert_eq!(index.line_count(), 3); + assert_eq!(index, Utf8Index::new(vec![0, 7, 18])); + } - let contents = "x = 1\ny = 2\nz = x + y\n"; + #[test] + fn utf8_carriage_return() { + let contents = "x = '🫣'\ry = 3"; let index = index_utf8(contents); - assert_eq!(index.len(), 4); - assert_eq!(index[0], [0, 1, 2, 3, 4, 5]); - assert_eq!(index[1], [6, 7, 8, 9, 10, 11]); - assert_eq!(index[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]); - assert_eq!(index[3], Vec::::new()); + assert_eq!(index.line_count(), 2); + assert_eq!(index, Utf8Index::new(vec![0, 11])); + + // Second ' + assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12); + } - let contents = "# \u{4e9c}\nclass Foo:\n \"\"\".\"\"\""; + #[test] + fn utf8_carriage_return_newline() { + let contents = "x = '🫣'\r\ny = 3"; let index = index_utf8(contents); - assert_eq!(index.len(), 3); - assert_eq!(index[0], [0, 1, 2, 5]); - assert_eq!(index[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); - assert_eq!(index[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]); + assert_eq!(index.line_count(), 2); + assert_eq!(index, Utf8Index::new(vec![0, 12])); + + // Second ' + assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13); } #[test] - fn utf8_truncate() { + fn utf8_byte_offset() { let contents = "x = '☃'\ny = 2"; let index = index_utf8(contents); + assert_eq!(index, Utf8Index::new(vec![0, 10])); // First row. - let loc = truncate_utf8(Location::new(1, 0), &index, contents); + let loc = index.byte_offset(Location::new(1, 0), contents); assert_eq!(loc, 0); - let loc = truncate_utf8(Location::new(1, 5), &index, contents); + let loc = index.byte_offset(Location::new(1, 5), contents); assert_eq!(loc, 5); assert_eq!(&contents[loc..], "☃'\ny = 2"); - let loc = truncate_utf8(Location::new(1, 6), &index, contents); + let loc = index.byte_offset(Location::new(1, 6), contents); assert_eq!(loc, 8); assert_eq!(&contents[loc..], "'\ny = 2"); // Second row. - let loc = truncate_utf8(Location::new(2, 0), &index, contents); + let loc = index.byte_offset(Location::new(2, 0), contents); assert_eq!(loc, 10); // One-past-the-end. - let loc = truncate_utf8(Location::new(3, 0), &index, contents); + let loc = index.byte_offset(Location::new(3, 0), contents); assert_eq!(loc, 15); } }