fix app crashes when modified offset points at the middle of UTF-8 ch…

…ar sequence
rhysd · Dec 31, 2023 · f996fd4 · f996fd4
1 parent b437e01
commit f996fd4
Show file tree

Hide file tree

Showing 2 changed files with 35 additions and 8 deletions.
diff --git a/v2/src/markdown/parser.rs b/v2/src/markdown/parser.rs
@@ -68,13 +68,22 @@ impl MarkdownContent {
         Self { source, base_dir }
     }
 
-    pub fn modified_offset(&self, new: &Self) -> Option<usize> {
+    pub fn modified_utf8_offset(&self, new: &Self) -> Option<usize> {
         let (prev_source, new_source) = (&self.source, &new.source);
+        // Offset must be UTF-8 aware to split text tokens correctly. If finding modified byte offset on a byte-by-byte
+        // basis, the offset may point at the middle of UTF-8 character sequence.
+        // For example, when a text 'あ' is modified to 'い',
+        // - あ: 0xE3 0x81 0x82
+        // - い: 0xE3 0x81 0x84
+        // The first two bytes are the same. So the byte offset is 2 and it points at the middle of the sequence.
+        // `MarkdownParser` will try to split the text at this position and will crash.
+        //
+        // Note: Fiding the offset on a byte-by-byte basis and then find the char boundary by `str::is_char_boundary`
+        // may be faster
         prev_source
-            .as_bytes()
-            .iter()
-            .zip(new_source.as_bytes().iter())
-            .position(|(a, b)| a != b)
+            .char_indices()
+            .zip(new_source.chars())
+            .find_map(|((idx, a), b)| (a != b).then_some(idx))
             .or_else(|| {
                 let (prev_len, new_len) = (prev_source.len(), new_source.len());
                 (prev_len != new_len).then_some(cmp::min(prev_len, new_len))
@@ -379,10 +388,10 @@ impl<'a, W: Write, V: TextVisitor, T: TextTokenizer> RenderTreeEncoder<'a, W, V,
             self.out.write_all(b"}")
         } else {
             let i = offset - start;
-            self.text_tokens(&text[..i], range.start..offset)?;
+            self.text_tokens(&text[..i], start..offset)?;
             self.tag("modified")?;
             self.out.write_all(b"}")?;
-            self.text_tokens(&text[i..], offset..range.end)
+            self.text_tokens(&text[i..], offset..end)
         }
     }
 
@@ -1072,4 +1081,22 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn utf8_aware_byte_offset() {
+        for (before, after, expected) in [
+            ("あ", "い", Some(0)),
+            ("ああ", "あい", Some(3)),
+            ("", "あ", Some(0)),
+            ("あ", "", Some(0)),
+            ("あ", "あい", Some(3)),
+            ("あ", "あ", None),
+            ("", "", None),
+        ] {
+            let prev = MarkdownContent::new(before.into(), None);
+            let now = MarkdownContent::new(after.into(), None);
+            let offset = prev.modified_utf8_offset(&now);
+            assert_eq!(offset, expected, "{before:?}, {after:?}");
+        }
+    }
 }
diff --git a/v2/src/shiba.rs b/v2/src/shiba.rs
@@ -143,7 +143,7 @@ impl PreviewContent {
         let is_new = self.title != title;
         let new_content = MarkdownContent::new(source, path.parent());
         let prev_content = std::mem::replace(&mut self.content, new_content);
-        let offset = if is_new { None } else { prev_content.modified_offset(&self.content) };
+        let offset = if is_new { None } else { prev_content.modified_utf8_offset(&self.content) };
         log::debug!("Last modified offset: {:?}", offset);
 
         self.text = renderer.send_message_raw(MarkdownParser::new(&self.content, offset, ()))?;