diff --git a/v2/src/markdown/parser.rs b/v2/src/markdown/parser.rs index 3dbcf9e..3bdecf2 100644 --- a/v2/src/markdown/parser.rs +++ b/v2/src/markdown/parser.rs @@ -68,13 +68,22 @@ impl MarkdownContent { Self { source, base_dir } } - pub fn modified_offset(&self, new: &Self) -> Option { + pub fn modified_utf8_offset(&self, new: &Self) -> Option { let (prev_source, new_source) = (&self.source, &new.source); + // Offset must be UTF-8 aware to split text tokens correctly. If finding modified byte offset on a byte-by-byte + // basis, the offset may point at the middle of a UTF-8 character sequence. + // For example, when a text 'あ' is modified to 'い', + // - あ: 0xE3 0x81 0x82 + // - い: 0xE3 0x81 0x84 + // The first two bytes are the same. So the byte offset is 2 and it points at the middle of the sequence. + // `MarkdownParser` will try to split the text at this position and will crash. + // + // Note: Finding the offset on a byte-by-byte basis and then finding the char boundary by `str::is_char_boundary` + // may be faster prev_source - .as_bytes() - .iter() - .zip(new_source.as_bytes().iter()) - .position(|(a, b)| a != b) + .char_indices() + .zip(new_source.chars()) + .find_map(|((idx, a), b)| (a != b).then_some(idx)) .or_else(|| { let (prev_len, new_len) = (prev_source.len(), new_source.len()); (prev_len != new_len).then_some(cmp::min(prev_len, new_len)) @@ -379,10 +388,10 @@ impl<'a, W: Write, V: TextVisitor, T: TextTokenizer> RenderTreeEncoder<'a, W, V, self.out.write_all(b"}") } else { let i = offset - start; - self.text_tokens(&text[..i], range.start..offset)?; + self.text_tokens(&text[..i], start..offset)?; self.tag("modified")?; self.out.write_all(b"}")?; - self.text_tokens(&text[i..], offset..range.end) + self.text_tokens(&text[i..], offset..end) } } @@ -1072,4 +1081,22 @@ mod tests { } } } + + #[test] + fn utf8_aware_byte_offset() { + for (before, after, expected) in [ + ("あ", "い", Some(0)), + ("ああ", "あい", Some(3)), + ("", "あ", Some(0)), + ("あ", "", Some(0)), 
("あ", "あい", Some(3)), + ("あ", "あ", None), + ("", "", None), + ] { + let prev = MarkdownContent::new(before.into(), None); + let now = MarkdownContent::new(after.into(), None); + let offset = prev.modified_utf8_offset(&now); + assert_eq!(offset, expected, "{before:?}, {after:?}"); + } + } } diff --git a/v2/src/shiba.rs b/v2/src/shiba.rs index 15a9772..d55f54e 100644 --- a/v2/src/shiba.rs +++ b/v2/src/shiba.rs @@ -143,7 +143,7 @@ impl PreviewContent { let is_new = self.title != title; let new_content = MarkdownContent::new(source, path.parent()); let prev_content = std::mem::replace(&mut self.content, new_content); - let offset = if is_new { None } else { prev_content.modified_offset(&self.content) }; + let offset = if is_new { None } else { prev_content.modified_utf8_offset(&self.content) }; log::debug!("Last modified offset: {:?}", offset); self.text = renderer.send_message_raw(MarkdownParser::new(&self.content, offset, ()))?;