From 2107defdd7a630174ae51505aff1d3d98d3be4f9 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Fri, 10 Mar 2023 13:08:07 +0100 Subject: [PATCH 1/5] Two Vec UTF8 index --- .../src/source_code/locator.rs | 357 +++++++++++------- 1 file changed, 220 insertions(+), 137 deletions(-) diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index ee6cf94e0501c..c2bbeaa7d55df 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -10,94 +10,6 @@ pub struct Locator<'a> { index: OnceCell, } -pub enum Index { - Ascii(Vec), - Utf8(Vec>), -} - -/// Compute the starting byte index of each line in ASCII source code. -fn index_ascii(contents: &str) -> Vec { - let mut index = Vec::with_capacity(48); - index.push(0); - let bytes = contents.as_bytes(); - for (i, byte) in bytes.iter().enumerate() { - if *byte == b'\n' { - index.push(i + 1); - } - } - index -} - -/// Compute the starting byte index of each character in UTF-8 source code. -fn index_utf8(contents: &str) -> Vec> { - let mut index = Vec::with_capacity(48); - let mut current_row = Vec::with_capacity(48); - let mut current_byte_offset = 0; - let mut previous_char = '\0'; - for char in contents.chars() { - // Skip BOM. - if previous_char == '\0' && char == '\u{feff}' { - current_byte_offset += char.len_utf8(); - continue; - } - - current_row.push(current_byte_offset); - if char == '\n' { - if previous_char == '\r' { - current_row.pop(); - } - index.push(current_row); - current_row = Vec::with_capacity(48); - } - current_byte_offset += char.len_utf8(); - previous_char = char; - } - index.push(current_row); - index -} - -/// Compute the starting byte index of each line in source code. -pub fn index(contents: &str) -> Index { - if contents.is_ascii() { - Index::Ascii(index_ascii(contents)) - } else { - Index::Utf8(index_utf8(contents)) - } -} - -/// Truncate a [`Location`] to a byte offset in ASCII source code. -fn truncate_ascii(location: Location, index: &[usize], contents: &str) -> usize { - if location.row() - 1 == index.len() && location.column() == 0 - || (!index.is_empty() - && location.row() - 1 == index.len() - 1 - && index[location.row() - 1] + location.column() >= contents.len()) - { - contents.len() - } else { - index[location.row() - 1] + location.column() - } -} - -/// Truncate a [`Location`] to a byte offset in UTF-8 source code. -fn truncate_utf8(location: Location, index: &[Vec], contents: &str) -> usize { - if (location.row() - 1 == index.len() && location.column() == 0) - || (location.row() - 1 == index.len() - 1 - && location.column() == index[location.row() - 1].len()) - { - contents.len() - } else { - index[location.row() - 1][location.column()] - } -} - -/// Truncate a [`Location`] to a byte offset in source code. -fn truncate(location: Location, index: &Index, contents: &str) -> usize { - match index { - Index::Ascii(index) => truncate_ascii(location, index, contents), - Index::Utf8(index) => truncate_utf8(location, index, contents), - } -} - impl<'a> Locator<'a> { pub const fn new(contents: &'a str) -> Self { Self { @@ -107,20 +19,20 @@ impl<'a> Locator<'a> { } fn get_or_init_index(&self) -> &Index { - self.index.get_or_init(|| index(self.contents)) + self.index.get_or_init(|| Index::from_str(self.contents)) } /// Take the source code up to the given [`Location`]. pub fn take(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); - let offset = truncate(location, index, self.contents); + let offset = index.byte_offset(location, self.contents); &self.contents[..offset] } /// Take the source code after the given [`Location`]. pub fn skip(&self, location: Location) -> &'a str { let index = self.get_or_init_index(); - let offset = truncate(location, index, self.contents); + let offset = index.byte_offset(location, self.contents); &self.contents[offset..] } @@ -128,15 +40,15 @@ impl<'a> Locator<'a> { pub fn slice>(&self, range: R) -> &'a str { let index = self.get_or_init_index(); let range = range.into(); - let start = truncate(range.location, index, self.contents); - let end = truncate(range.end_location, index, self.contents); + let start = index.byte_offset(range.location, self.contents); + let end = index.byte_offset(range.end_location, self.contents); &self.contents[start..end] } /// Return the byte offset of the given [`Location`]. pub fn offset(&self, location: Location) -> usize { let index = self.get_or_init_index(); - truncate(location, index, self.contents) + index.byte_offset(location, self.contents) } /// Return the underlying source code. @@ -153,33 +65,186 @@ impl<'a> Locator<'a> { } } +#[derive(Debug, Clone)] +enum Index { + Ascii(AsciiIndex), + Utf8(Utf8Index), +} + +impl Index { + /// Truncate a [`Location`] to a byte offset in source code. + fn byte_offset(&self, location: Location, contents: &str) -> usize { + match self { + Index::Ascii(ascii) => ascii.byte_offset(location, contents), + Index::Utf8(utf8) => utf8.byte_offset(location, contents), + } + } + + fn from_str(content: &str) -> Self { + let mut line_start_offsets: Vec = Vec::with_capacity(48); + line_start_offsets.push(0); + + for (i, byte) in content.bytes().enumerate() { + if !byte.is_ascii() { + return Index::Utf8(continue_non_ascii_content( + &content[i..], + i as u32, + line_start_offsets, + )); + } + if byte == b'\n' { + line_start_offsets.push((i + 1) as u32); + } + + continue; + } + + Self::Ascii(AsciiIndex::new(line_start_offsets)) + } +} + +impl From<&str> for Index { + fn from(value: &str) -> Self { + Self::from_str(value) + } +} + +#[derive(Debug, Clone, Eq, PartialEq)] +struct AsciiIndex { + line_start_byte_offsets: Vec, +} + +impl AsciiIndex { + fn new(line_start_positions: Vec) -> Self { + Self { + line_start_byte_offsets: line_start_positions, + } + } + + /// Truncate a [`Location`] to a byte offset in ASCII source code. + fn byte_offset(&self, location: Location, contents: &str) -> usize { + let index = &self.line_start_byte_offsets; + + // If start-of-line position after last line + if location.row() - 1 == index.len() && location.column() == 0 { + contents.len() + } else { + let byte_offset = index[location.row() - 1] as usize + location.column(); + byte_offset.min(contents.len()) + } + } +} + +fn continue_non_ascii_content(non_ascii: &str, mut offset: u32, mut lines: Vec) -> Utf8Index { + // Chars up to this point map 1:1 to byte offsets. + let mut chars_to_byte_offsets = Vec::new(); + chars_to_byte_offsets.extend((0..offset).map(|i| i as u32)); + let mut char_index = offset; + + // SKIP BOM + let contents = if offset == 0 && non_ascii.starts_with('\u{feff}') { + offset += '\u{feff}'.len_utf8() as u32; + &non_ascii[offset as usize..] + } else { + non_ascii + }; + + let mut after_carriage_return = false; + + for char in contents.chars() { + match char { + // Normalize `\r\n` to `\n` + '\n' if after_carriage_return => continue, + '\r' | '\n' => { + lines.push(char_index as u32 + 1); + } + _ => {} + } + + chars_to_byte_offsets.push(offset); + after_carriage_return = char == '\r'; + offset += char.len_utf8() as u32; + char_index += 1; + } + + Utf8Index::new(lines, chars_to_byte_offsets) +} + +#[derive(Debug, Clone, PartialEq)] +struct Utf8Index { + /// The index is the line number in the document. The value the character at which the the line starts + lines_to_characters: Vec, + + /// The index is the nth character in the document, the value the absolute byte offset. + character_to_byte_offsets: Vec, +} + +impl Utf8Index { + fn new(lines: Vec, characters: Vec) -> Self { + Self { + lines_to_characters: lines, + character_to_byte_offsets: characters, + } + } + + /// Truncate a [`Location`] to a byte offset in UTF-8 source code. + fn byte_offset(&self, location: Location, contents: &str) -> usize { + if location.row() - 1 == self.lines_to_characters.len() && location.column() == 0 { + contents.len() + } else { + let line_start = self.lines_to_characters[location.row() - 1]; + + match self + .character_to_byte_offsets + .get(line_start as usize + location.column()) + { + Some(offset) => *offset as usize, + None => contents.len(), + } + } + } +} + #[cfg(test)] mod tests { + use crate::source_code::locator::{AsciiIndex, Index, Utf8Index}; use rustpython_parser::ast::Location; - use super::{index_ascii, index_utf8, truncate_ascii, truncate_utf8}; + fn index_ascii(content: &str) -> AsciiIndex { + match Index::from_str(content) { + Index::Ascii(ascii) => ascii, + Index::Utf8(_) => panic!("Expected ASCII index"), + } + } + + fn index_utf8(content: &str) -> Utf8Index { + match Index::from_str(content) { + Index::Utf8(utf8) => utf8, + Index::Ascii(_) => panic!("Expected UTF8 Index"), + } + } #[test] fn ascii_index() { let contents = ""; let index = index_ascii(contents); - assert_eq!(index, [0]); + assert_eq!(index, AsciiIndex::new(vec![0])); let contents = "x = 1"; let index = index_ascii(contents); - assert_eq!(index, [0]); + assert_eq!(index, AsciiIndex::new(vec![0])); let contents = "x = 1\n"; let index = index_ascii(contents); - assert_eq!(index, [0, 6]); + assert_eq!(index, AsciiIndex::new(vec![0, 6])); let contents = "x = 1\r\n"; let index = index_ascii(contents); - assert_eq!(index, [0, 7]); + assert_eq!(index, AsciiIndex::new(vec![0, 7])); let contents = "x = 1\ny = 2\nz = x + y\n"; let index = index_ascii(contents); - assert_eq!(index, [0, 6, 12, 22]); + assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22])); } #[test] @@ -188,81 +253,99 @@ mod tests { let index = index_ascii(contents); // First row. - let loc = truncate_ascii(Location::new(1, 0), &index, contents); + let loc = index.byte_offset(Location::new(1, 0), contents); assert_eq!(loc, 0); // Second row. - let loc = truncate_ascii(Location::new(2, 0), &index, contents); + let loc = index.byte_offset(Location::new(2, 0), contents); assert_eq!(loc, 6); // One-past-the-end. - let loc = truncate_ascii(Location::new(3, 0), &index, contents); + let loc = index.byte_offset(Location::new(3, 0), contents); assert_eq!(loc, 11); } + impl Utf8Index { + fn line_count(&self) -> usize { + self.lines_to_characters.len() + } + } + #[test] fn utf8_index() { - let contents = ""; - let index = index_utf8(contents); - assert_eq!(index.len(), 1); - assert_eq!(index[0], Vec::::new()); - - let contents = "x = 1"; + let contents = "x = '🫣'"; let index = index_utf8(contents); - assert_eq!(index.len(), 1); - assert_eq!(index[0], [0, 1, 2, 3, 4]); + assert_eq!(index.line_count(), 1); + assert_eq!(index, Utf8Index::new(vec![0], vec![0, 1, 2, 3, 4, 5, 9])); - let contents = "x = 1\n"; + let contents = "x = '🫣'\n"; let index = index_utf8(contents); - assert_eq!(index.len(), 2); - assert_eq!(index[0], [0, 1, 2, 3, 4, 5]); - assert_eq!(index[1], Vec::::new()); + assert_eq!(index.line_count(), 2); + assert_eq!( + index, + Utf8Index::new(vec![0, 8], vec![0, 1, 2, 3, 4, 5, 9, 10]) + ); - let contents = "x = 1\r\n"; + let contents = "x = '🫣'\r\n"; let index = index_utf8(contents); - assert_eq!(index.len(), 2); - assert_eq!(index[0], [0, 1, 2, 3, 4, 5]); - assert_eq!(index[1], Vec::::new()); + assert_eq!(index.line_count(), 2); + assert_eq!( + index, + Utf8Index::new(vec![0, 8], vec![0, 1, 2, 3, 4, 5, 9, 10]) + ); - let contents = "x = 1\ny = 2\nz = x + y\n"; + let contents = "x = '🫣'\ny = 2\nz = x + y\n"; let index = index_utf8(contents); - assert_eq!(index.len(), 4); - assert_eq!(index[0], [0, 1, 2, 3, 4, 5]); - assert_eq!(index[1], [6, 7, 8, 9, 10, 11]); - assert_eq!(index[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]); - assert_eq!(index[3], Vec::::new()); - - let contents = "# \u{4e9c}\nclass Foo:\n \"\"\".\"\"\""; + assert_eq!(index.line_count(), 4); + assert_eq!( + index, + Utf8Index::new( + vec![0, 8, 14, 24], + vec![ + 0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26 + ] + ) + ); + + let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\""; let index = index_utf8(contents); - assert_eq!(index.len(), 3); - assert_eq!(index[0], [0, 1, 2, 5]); - assert_eq!(index[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); - assert_eq!(index[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]); + assert_eq!(index.line_count(), 3); + assert_eq!( + index, + Utf8Index::new( + vec![0, 4, 15], + vec![ + 0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, + ] + ) + ); } #[test] - fn utf8_truncate() { + fn utf8_byte_offset() { let contents = "x = '☃'\ny = 2"; let index = index_utf8(contents); // First row. - let loc = truncate_utf8(Location::new(1, 0), &index, contents); + let loc = index.byte_offset(Location::new(1, 0), contents); assert_eq!(loc, 0); - let loc = truncate_utf8(Location::new(1, 5), &index, contents); + let loc = index.byte_offset(Location::new(1, 5), contents); assert_eq!(loc, 5); assert_eq!(&contents[loc..], "☃'\ny = 2"); - let loc = truncate_utf8(Location::new(1, 6), &index, contents); + let loc = index.byte_offset(Location::new(1, 6), contents); assert_eq!(loc, 8); assert_eq!(&contents[loc..], "'\ny = 2"); // Second row. - let loc = truncate_utf8(Location::new(2, 0), &index, contents); + let loc = index.byte_offset(Location::new(2, 0), contents); assert_eq!(loc, 10); // One-past-the-end. - let loc = truncate_utf8(Location::new(3, 0), &index, contents); + let loc = index.byte_offset(Location::new(3, 0), contents); assert_eq!(loc, 15); } } From 6150a4468c7a4b263332503c44166785fa271cfc Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Fri, 10 Mar 2023 14:58:11 +0100 Subject: [PATCH 2/5] Documentation --- .../src/source_code/locator.rs | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index c2bbeaa7d55df..7f3eb280d68cc 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -65,9 +65,12 @@ impl<'a> Locator<'a> { } } +/// Index for fast [Location] to byte offset conversions. #[derive(Debug, Clone)] enum Index { + /// Optimized index for an ASCII only document Ascii(AsciiIndex), + /// Index for UTF8 documents Utf8(Utf8Index), } @@ -80,7 +83,12 @@ impl Index { } } + /// Builds the index for `content` + // Not an issue because of manual string length check + #[allow(clippy::cast_possible_truncation)] fn from_str(content: &str) -> Self { + assert!(u32::try_from(content.len()).is_ok()); + let mut line_start_offsets: Vec = Vec::with_capacity(48); line_start_offsets.push(0); @@ -109,6 +117,10 @@ impl From<&str> for Index { } } +/// Index for fast [Location] to byte offset conversions for ASCII documents. +/// +/// The index stores the byte offsets for every line. It computes the byte offset for a [Location] +/// by retrieving the line offset from its index and adding the column. #[derive(Debug, Clone, Eq, PartialEq)] struct AsciiIndex { line_start_byte_offsets: Vec, @@ -135,10 +147,12 @@ impl AsciiIndex { } } +// Not an issue because of manual string length check in `Index::from_str` +#[allow(clippy::cast_possible_truncation)] fn continue_non_ascii_content(non_ascii: &str, mut offset: u32, mut lines: Vec) -> Utf8Index { // Chars up to this point map 1:1 to byte offsets. let mut chars_to_byte_offsets = Vec::new(); - chars_to_byte_offsets.extend((0..offset).map(|i| i as u32)); + chars_to_byte_offsets.extend(0..offset); let mut char_index = offset; // SKIP BOM @@ -156,7 +170,7 @@ fn continue_non_ascii_content(non_ascii: &str, mut offset: u32, mut lines: Vec continue, '\r' | '\n' => { - lines.push(char_index as u32 + 1); + lines.push(char_index + 1); } _ => {} } @@ -170,12 +184,25 @@ fn continue_non_ascii_content(non_ascii: &str, mut offset: u32, mut lines: Vec, - /// The index is the nth character in the document, the value the absolute byte offset. + /// The index is the nth character in the document, the value the byte offset from the begining of the document. character_to_byte_offsets: Vec, } From 37fcecc51093bcaa2992401d3fb3880a2a36eca6 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Fri, 10 Mar 2023 14:26:29 +0100 Subject: [PATCH 3/5] Lazy compute offset --- .../src/source_code/locator.rs | 186 +++++++----------- 1 file changed, 71 insertions(+), 115 deletions(-) diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index 7f3eb280d68cc..1e8b7e32de8d8 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -19,7 +19,7 @@ impl<'a> Locator<'a> { } fn get_or_init_index(&self) -> &Index { - self.index.get_or_init(|| Index::from_str(self.contents)) + self.index.get_or_init(|| Index::from(self.contents)) } /// Take the source code up to the given [`Location`]. @@ -70,6 +70,7 @@ impl<'a> Locator<'a> { enum Index { /// Optimized index for an ASCII only document Ascii(AsciiIndex), + /// Index for UTF8 documents Utf8(Utf8Index), } @@ -82,23 +83,20 @@ impl Index { Index::Utf8(utf8) => utf8.byte_offset(location, contents), } } +} - /// Builds the index for `content` - // Not an issue because of manual string length check - #[allow(clippy::cast_possible_truncation)] - fn from_str(content: &str) -> Self { - assert!(u32::try_from(content.len()).is_ok()); +impl From<&str> for Index { + fn from(contents: &str) -> Self { + assert!(u32::try_from(contents.len()).is_ok()); let mut line_start_offsets: Vec = Vec::with_capacity(48); line_start_offsets.push(0); - for (i, byte) in content.bytes().enumerate() { + // SAFE because of length assertion above + #[allow(clippy::cast_possible_truncation)] + for (i, byte) in contents.bytes().enumerate() { if !byte.is_ascii() { - return Index::Utf8(continue_non_ascii_content( - &content[i..], - i as u32, - line_start_offsets, - )); + return Index::Utf8(continue_utf8_index(&contents[i..], i, line_start_offsets)); } if byte == b'\n' { line_start_offsets.push((i + 1) as u32); @@ -111,10 +109,28 @@ impl Index { } } -impl From<&str> for Index { - fn from(value: &str) -> Self { - Self::from_str(value) +// SAFE because of length assertion in `Index::from(&str)` +#[allow(clippy::cast_possible_truncation)] +fn continue_utf8_index( + non_ascii_part: &str, + offset: usize, + line_start_offsets: Vec, +) -> Utf8Index { + let mut lines = line_start_offsets; + let mut chars = non_ascii_part.char_indices().peekable(); + + while let Some((position, char)) = chars.next() { + match char { + '\r' if matches!(chars.peek(), Some((_, '\n'))) => continue, + '\r' | '\n' => { + let absolute_offset = offset + position + 1; + lines.push(absolute_offset as u32); + } + _ => {} + } } + + Utf8Index::new(lines) } /// Index for fast [Location] to byte offset conversions for ASCII documents. @@ -147,87 +163,46 @@ impl AsciiIndex { } } -// Not an issue because of manual string length check in `Index::from_str` -#[allow(clippy::cast_possible_truncation)] -fn continue_non_ascii_content(non_ascii: &str, mut offset: u32, mut lines: Vec) -> Utf8Index { - // Chars up to this point map 1:1 to byte offsets. - let mut chars_to_byte_offsets = Vec::new(); - chars_to_byte_offsets.extend(0..offset); - let mut char_index = offset; - - // SKIP BOM - let contents = if offset == 0 && non_ascii.starts_with('\u{feff}') { - offset += '\u{feff}'.len_utf8() as u32; - &non_ascii[offset as usize..] - } else { - non_ascii - }; - - let mut after_carriage_return = false; - - for char in contents.chars() { - match char { - // Normalize `\r\n` to `\n` - '\n' if after_carriage_return => continue, - '\r' | '\n' => { - lines.push(char_index + 1); - } - _ => {} - } - - chars_to_byte_offsets.push(offset); - after_carriage_return = char == '\r'; - offset += char.len_utf8() as u32; - char_index += 1; - } - - Utf8Index::new(lines, chars_to_byte_offsets) -} - /// Index for fast [Location] to byte offset conversions for UTF8 documents. /// -/// The index stores two lookup tables: -/// * the character offsets for each line -/// * the byte offset for each character -/// -/// The byte offset of a [Location] can then be computed using -/// -/// ```ignore -/// // retrieving the start character on that line and add the column (character offset) -/// let char_offset = lines[location.row() - 1] + location.column(); -/// let byte_offset = char_to_byte_offsets[char_offset] -/// ``` +/// The index stores the byte offset of every line. The column offset is lazily computed by +/// adding the line start offset and then iterating to the `nth` character. #[derive(Debug, Clone, PartialEq)] struct Utf8Index { - /// The index is the line number in the document. The value the character at which the the line starts - lines_to_characters: Vec, - - /// The index is the nth character in the document, the value the byte offset from the begining of the document. - character_to_byte_offsets: Vec, + line_start_byte_offsets: Vec, } impl Utf8Index { - fn new(lines: Vec, characters: Vec) -> Self { + fn new(line_byte_positions: Vec) -> Self { Self { - lines_to_characters: lines, - character_to_byte_offsets: characters, + line_start_byte_offsets: line_byte_positions, } } /// Truncate a [`Location`] to a byte offset in UTF-8 source code. fn byte_offset(&self, location: Location, contents: &str) -> usize { - if location.row() - 1 == self.lines_to_characters.len() && location.column() == 0 { + let index = &self.line_start_byte_offsets; + + if location.row() - 1 == index.len() && location.column() == 0 { contents.len() } else { - let line_start = self.lines_to_characters[location.row() - 1]; - - match self - .character_to_byte_offsets - .get(line_start as usize + location.column()) - { - Some(offset) => *offset as usize, + // Casting is safe because the length of utf8 characters is always between 1-4 + #[allow(clippy::cast_possible_truncation)] + let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') { + '\u{feff}'.len_utf8() as u32 + } else { + index[location.row() - 1] + }; + + let rest = &contents[line_start as usize..]; + + let column_offset = match rest.char_indices().nth(location.column()) { + Some((offset, _)) => offset, None => contents.len(), - } + }; + + let offset = line_start as usize + column_offset; + offset.min(contents.len()) } } } @@ -238,16 +213,20 @@ mod tests { use rustpython_parser::ast::Location; fn index_ascii(content: &str) -> AsciiIndex { - match Index::from_str(content) { + match Index::from(content) { Index::Ascii(ascii) => ascii, - Index::Utf8(_) => panic!("Expected ASCII index"), + Index::Utf8(_) => { + panic!("Expected ASCII index") + } } } fn index_utf8(content: &str) -> Utf8Index { - match Index::from_str(content) { + match Index::from(content) { Index::Utf8(utf8) => utf8, - Index::Ascii(_) => panic!("Expected UTF8 Index"), + Index::Ascii(_) => { + panic!("Expected UTF8 index") + } } } @@ -275,7 +254,7 @@ mod tests { } #[test] - fn ascii_truncate() { + fn ascii_byte_offset() { let contents = "x = 1\ny = 2"; let index = index_ascii(contents); @@ -294,7 +273,7 @@ mod tests { impl Utf8Index { fn line_count(&self) -> usize { - self.lines_to_characters.len() + self.line_start_byte_offsets.len() } } @@ -303,57 +282,34 @@ mod tests { let contents = "x = '🫣'"; let index = index_utf8(contents); assert_eq!(index.line_count(), 1); - assert_eq!(index, Utf8Index::new(vec![0], vec![0, 1, 2, 3, 4, 5, 9])); + assert_eq!(index, Utf8Index::new(vec![0])); let contents = "x = '🫣'\n"; let index = index_utf8(contents); assert_eq!(index.line_count(), 2); - assert_eq!( - index, - Utf8Index::new(vec![0, 8], vec![0, 1, 2, 3, 4, 5, 9, 10]) - ); + assert_eq!(index, Utf8Index::new(vec![0, 11])); let contents = "x = '🫣'\r\n"; let index = index_utf8(contents); assert_eq!(index.line_count(), 2); - assert_eq!( - index, - Utf8Index::new(vec![0, 8], vec![0, 1, 2, 3, 4, 5, 9, 10]) - ); + assert_eq!(index, Utf8Index::new(vec![0, 12])); let contents = "x = '🫣'\ny = 2\nz = x + y\n"; let index = index_utf8(contents); assert_eq!(index.line_count(), 4); - assert_eq!( - index, - Utf8Index::new( - vec![0, 8, 14, 24], - vec![ - 0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26 - ] - ) - ); + assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27])); let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\""; let index = index_utf8(contents); assert_eq!(index.line_count(), 3); - assert_eq!( - index, - Utf8Index::new( - vec![0, 4, 15], - vec![ - 0, 1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, - ] - ) - ); + assert_eq!(index, Utf8Index::new(vec![0, 7, 18])); } #[test] fn utf8_byte_offset() { let contents = "x = '☃'\ny = 2"; let index = index_utf8(contents); + assert_eq!(index, Utf8Index::new(vec![0, 10])); // First row. let loc = index.byte_offset(Location::new(1, 0), contents); From dc18ad0544f583398943f620fc7b47acff05e18e Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Sat, 11 Mar 2023 11:31:32 +0100 Subject: [PATCH 4/5] Correctly handle `\r` --- .../src/source_code/locator.rs | 52 ++++++++++++++----- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index 1e8b7e32de8d8..8670f51f7a0ea 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -65,7 +65,7 @@ impl<'a> Locator<'a> { } } -/// Index for fast [Location] to byte offset conversions. +/// Index for fast [`Location`] to byte offset conversions. #[derive(Debug, Clone)] enum Index { /// Optimized index for an ASCII only document @@ -96,13 +96,17 @@ impl From<&str> for Index { #[allow(clippy::cast_possible_truncation)] for (i, byte) in contents.bytes().enumerate() { if !byte.is_ascii() { - return Index::Utf8(continue_utf8_index(&contents[i..], i, line_start_offsets)); - } - if byte == b'\n' { - line_start_offsets.push((i + 1) as u32); + return Self::Utf8(continue_utf8_index(&contents[i..], i, line_start_offsets)); } - continue; + match byte { + // Only track one line break for `\r\n`. + b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue, + b'\n' | b'\r' => { + line_start_offsets.push((i + 1) as u32); + } + _ => {} + } } Self::Ascii(AsciiIndex::new(line_start_offsets)) @@ -117,11 +121,11 @@ fn continue_utf8_index( line_start_offsets: Vec, ) -> Utf8Index { let mut lines = line_start_offsets; - let mut chars = non_ascii_part.char_indices().peekable(); - while let Some((position, char)) = chars.next() { + for (position, char) in non_ascii_part.char_indices() { match char { - '\r' if matches!(chars.peek(), Some((_, '\n'))) => continue, + // Only track `\n` for `\r\n` + '\r' if non_ascii_part.as_bytes().get(position + 1) == Some(&b'\n') => continue, '\r' | '\n' => { let absolute_offset = offset + position + 1; lines.push(absolute_offset as u32); @@ -133,9 +137,9 @@ fn continue_utf8_index( Utf8Index::new(lines) } -/// Index for fast [Location] to byte offset conversions for ASCII documents. +/// Index for fast [`Location`] to byte offset conversions for ASCII documents. /// -/// The index stores the byte offsets for every line. It computes the byte offset for a [Location] +/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`] /// by retrieving the line offset from its index and adding the column. #[derive(Debug, Clone, Eq, PartialEq)] struct AsciiIndex { @@ -163,7 +167,7 @@ impl AsciiIndex { } } -/// Index for fast [Location] to byte offset conversions for UTF8 documents. +/// Index for fast [`Location`] to byte offset conversions for UTF8 documents. /// /// The index stores the byte offset of every line. The column offset is lazily computed by /// adding the line start offset and then iterating to the `nth` character. @@ -271,6 +275,17 @@ mod tests { assert_eq!(loc, 11); } + #[test] + fn ascii_carriage_return() { + let contents = "x = 4\ry = 3"; + let index = index_ascii(contents); + assert_eq!(index, AsciiIndex::new(vec![0, 6])); + + assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7); + } + impl Utf8Index { fn line_count(&self) -> usize { self.line_start_byte_offsets.len() @@ -305,6 +320,19 @@ mod tests { assert_eq!(index, Utf8Index::new(vec![0, 7, 18])); } + #[test] + fn utf8_carriage_return() { + let contents = "x = '🫣'\ry = 3"; + let index = index_utf8(contents); + assert_eq!(index.line_count(), 2); + assert_eq!(index, Utf8Index::new(vec![0, 11])); + + // Second ' + assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12); + } + #[test] fn utf8_byte_offset() { let contents = "x = '☃'\ny = 2"; From 0cfc408688434a081e193c2ba8c552857d9e0d32 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Sat, 11 Mar 2023 11:34:55 +0100 Subject: [PATCH 5/5] Add test for `\r\n` --- .../src/source_code/locator.rs | 33 ++++++++++++++----- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/crates/ruff_python_ast/src/source_code/locator.rs b/crates/ruff_python_ast/src/source_code/locator.rs index 8670f51f7a0ea..035110f78ed67 100644 --- a/crates/ruff_python_ast/src/source_code/locator.rs +++ b/crates/ruff_python_ast/src/source_code/locator.rs @@ -248,10 +248,6 @@ mod tests { let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0, 6])); - let contents = "x = 1\r\n"; - let index = index_ascii(contents); - assert_eq!(index, AsciiIndex::new(vec![0, 7])); - let contents = "x = 1\ny = 2\nz = x + y\n"; let index = index_ascii(contents); assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22])); @@ -286,6 +282,17 @@ mod tests { assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7); } + #[test] + fn ascii_carriage_return_newline() { + let contents = "x = 4\r\ny = 3"; + let index = index_ascii(contents); + assert_eq!(index, AsciiIndex::new(vec![0, 7])); + + assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8); + } + impl Utf8Index { fn line_count(&self) -> usize { self.line_start_byte_offsets.len() @@ -304,11 +311,6 @@ mod tests { assert_eq!(index.line_count(), 2); assert_eq!(index, Utf8Index::new(vec![0, 11])); - let contents = "x = '🫣'\r\n"; - let index = index_utf8(contents); - assert_eq!(index.line_count(), 2); - assert_eq!(index, Utf8Index::new(vec![0, 12])); - let contents = "x = '🫣'\ny = 2\nz = x + y\n"; let index = index_utf8(contents); assert_eq!(index.line_count(), 4); @@ -333,6 +335,19 @@ mod tests { assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12); } + #[test] + fn utf8_carriage_return_newline() { + let contents = "x = '🫣'\r\ny = 3"; + let index = index_utf8(contents); + assert_eq!(index.line_count(), 2); + assert_eq!(index, Utf8Index::new(vec![0, 12])); + + // Second ' + assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9); + assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12); + assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13); + } + #[test] fn utf8_byte_offset() { let contents = "x = '☃'\ny = 2";