Catch errors during parsing -> UnreadableBlock
Improved error recovery -- an error during parsing, or an unknown block
type, now results in an `UnreadableBlock` containing the data that could
not be read, so that parsing of other blocks can continue.
ricklupton committed Sep 16, 2023
1 parent baa4c4e commit 74e359a
Showing 3 changed files with 161 additions and 56 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -8,6 +8,17 @@ To convert rm files to other formats, you can use [rmc](https://github.com/rickl

## Changelog

### Unreleased

New features:
- Improved error recovery. An error during parsing, or an unknown block type,
results in an `UnreadableBlock` containing the data that could not be read, so
that parsing of other blocks can continue.
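
A minimal usage sketch of the new behaviour (the file name is hypothetical; `UnreadableBlock` and its `error`, `data` and `info` fields are defined in `rmscene.scene_stream`):

```python
from rmscene import read_blocks
from rmscene.scene_stream import UnreadableBlock

with open("page.rm", "rb") as f:  # hypothetical input file
    for block in read_blocks(f):
        if isinstance(block, UnreadableBlock):
            # This block could not be parsed; its raw bytes are kept in block.data
            print(f"Unreadable block type {block.info.block_type}: {block.error}")
        else:
            ...  # handle readable blocks as before
```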

Other changes:
- The `value` attribute of scene item blocks, which was not being used, has been
removed.

### v0.4.0

Breaking changes:
66 changes: 52 additions & 14 deletions src/rmscene/scene_stream.py
@@ -17,7 +17,7 @@
from packaging.version import Version

from .tagged_block_common import CrdtId, LwwValue
from .tagged_block_reader import TaggedBlockReader
from .tagged_block_reader import TaggedBlockReader, MainBlockInfo
from .tagged_block_writer import TaggedBlockWriter
from .crdt_sequence import CrdtSequence, CrdtSequenceItem
from .scene_tree import SceneTree
@@ -43,6 +43,16 @@ def version_info(self, writer: TaggedBlockWriter) -> tuple[int, int]:
"""Return (min_version, current_version) to use when writing."""
return (1, 1)

def get_block_type(self) -> int:
"""Return block type for this block.
By default, returns the block's BLOCK_TYPE attribute, but this method
can be overridden if a single block subclass can handle multiple block
types.
"""
return self.BLOCK_TYPE

@classmethod
def lookup(cls, block_type: int) -> tp.Optional[tp.Type[Block]]:
if getattr(cls, "BLOCK_TYPE", None) == block_type:
@@ -52,15 +62,42 @@ def lookup(cls, block_type: int) -> tp.Optional[tp.Type[Block]]:
return match
return None

def write(self, writer: TaggedBlockWriter):
"""Write the block header and content to the stream."""
min_version, current_version = self.version_info(writer)
with writer.write_block(self.get_block_type(), min_version, current_version):
self.to_stream(writer)

@classmethod
@abstractmethod
def from_stream(cls, reader: TaggedBlockReader) -> Block:
"""Read content of block from stream."""
raise NotImplementedError()

@abstractmethod
def to_stream(self, writer: TaggedBlockWriter):
"""Write content of block to stream."""
raise NotImplementedError()


@dataclass
class UnreadableBlock(Block):
"""Represent a block which could not be read for some reason."""

error: str
data: bytes
info: MainBlockInfo

def get_block_type(self) -> int:
return self.info.block_type

@classmethod
def from_stream(cls, reader: TaggedBlockReader) -> Block:
raise NotImplementedError()

def to_stream(self, writer: TaggedBlockWriter):
writer.data.write_bytes(self.data)


@dataclass
class AuthorIdsBlock(Block):
@@ -702,14 +739,21 @@ def _read_blocks(stream: TaggedBlockReader) -> Iterator[Block]:

block_type = Block.lookup(block_info.block_type)
if block_type:
yield block_type.from_stream(stream)
try:
yield block_type.from_stream(stream)
except Exception as e:
_logger.warning("Error reading block: %s", e)
stream.data.data.seek(block_info.offset)
data = stream.data.read_bytes(block_info.size)
yield UnreadableBlock(str(e), data, block_info)
else:
_logger.error(
"Unknown block type %s. Skipping %d bytes.",
block_info.block_type,
block_info.size,
msg = (
f"Unknown block type {block_info.block_type}. "
f"Skipping {block_info.size} bytes."
)
stream.data.read_bytes(block_info.size)
_logger.warning(msg)
data = stream.data.read_bytes(block_info.size)
yield UnreadableBlock(msg, data, block_info)


def read_blocks(data: tp.BinaryIO) -> Iterator[Block]:
@@ -723,12 +767,6 @@ def read_blocks(data: tp.BinaryIO) -> Iterator[Block]:
yield from _read_blocks(stream)


def _write_block(writer: TaggedBlockWriter, block: Block):
min_version, current_version = block.version_info(writer)
with writer.write_block(block.BLOCK_TYPE, min_version, current_version):
block.to_stream(writer)


def write_blocks(
data: tp.BinaryIO, blocks: Iterable[Block], options: tp.Optional[dict] = None
):
@@ -740,7 +778,7 @@ def write_blocks(
stream = TaggedBlockWriter(data, options=options)
stream.write_header()
for block in blocks:
_write_block(stream, block)
block.write(stream)


def build_tree(tree: SceneTree, blocks: Iterable[Block]):
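
Taken together with the refactored write path (`Block.write` now asks `get_block_type()` for the block type, which `UnreadableBlock` overrides to return `info.block_type`), blocks that could not be parsed can still be written back. A minimal round-trip sketch, assuming a local file and the `options={"version": "3.0"}` value used by the new test below (the right version depends on the source file):

```python
from io import BytesIO
from rmscene import read_blocks, write_blocks

with open("page.rm", "rb") as f:  # hypothetical input file
    blocks = list(read_blocks(f))

out = BytesIO()
# UnreadableBlock.to_stream() re-emits the original raw bytes, so blocks that
# failed to parse are preserved in the output.
write_blocks(out, blocks, options={"version": "3.0"})
```
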
140 changes: 98 additions & 42 deletions tests/test_scene_stream.py
@@ -2,24 +2,29 @@
from io import BytesIO
from pathlib import Path
from uuid import UUID
from rmscene import read_blocks, write_blocks, LwwValue, TaggedBlockWriter, TaggedBlockReader
from rmscene import (
read_blocks,
write_blocks,
LwwValue,
TaggedBlockWriter,
TaggedBlockReader,
)
from rmscene.scene_stream import *
from rmscene.tagged_block_common import HEADER_V6
from rmscene.tagged_block_reader import MainBlockInfo
from rmscene.crdt_sequence import CrdtSequenceItem
from rmscene.scene_items import ParagraphStyle

import logging

logger = logging.getLogger(__name__)


DATA_PATH = Path(__file__).parent / "data"


def _hex_lines(b, n=32):
return [
b[i*n:(i+1)*n].hex()
for i in range(len(b) // n + 1)
]
return [b[i * n : (i + 1) * n].hex() for i in range(len(b) // n + 1)]


LINES_V2_FILES = [
@@ -80,22 +85,26 @@ def test_normal_ab():
RootTextBlock(
block_id=CrdtId(0, 0),
value=si.Text(
items=CrdtSequence([
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]),
items=CrdtSequence(
[
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]
),
styles={
CrdtId(0, 0): LwwValue(timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN),
CrdtId(0, 0): LwwValue(
timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN
),
},
pos_x=-468.0,
pos_y=234.0,
width=936.0,
)
),
),
TreeNodeBlock(
group=si.Group(node_id=CrdtId(0, 1)),
Expand All @@ -114,14 +123,16 @@ def test_normal_ab():
right_id=CrdtId(0, 0),
deleted_length=0,
value=CrdtId(0, 11),
)
),
),
]


def test_read_glyph_range():
with open(DATA_PATH / "Wikipedia_highlighted_p1.rm", "rb") as f:
result = [block for block in read_blocks(f) if isinstance(block, SceneGlyphItemBlock)]
result = [
block for block in read_blocks(f) if isinstance(block, SceneGlyphItemBlock)
]

assert result[0].item.value.text == "The reMarkable uses electronic paper"

@@ -130,8 +141,12 @@ def test_read_glyph_range():
"block",
[
AuthorIdsBlock(author_uuids={1: UUID("495ba59f-c943-2b5c-b455-3682f6948906")}),
AuthorIdsBlock(author_uuids={1: UUID("495ba59f-c943-2b5c-b455-3682f6948906"),
2: UUID("cd83324a-917f-11ed-bb7b-3c0754484e34")}),
AuthorIdsBlock(
author_uuids={
1: UUID("495ba59f-c943-2b5c-b455-3682f6948906"),
2: UUID("cd83324a-917f-11ed-bb7b-3c0754484e34"),
}
),
MigrationInfoBlock(migration_id=CrdtId(1, 1), is_device=True),
PageInfoBlock(
loads_count=3, merges_count=2, text_chars_count=3, text_lines_count=1
Expand All @@ -145,22 +160,26 @@ def test_read_glyph_range():
RootTextBlock(
block_id=CrdtId(0, 0),
value=si.Text(
items=CrdtSequence([
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]),
items=CrdtSequence(
[
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]
),
styles={
CrdtId(0, 0): LwwValue(timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN),
CrdtId(0, 0): LwwValue(
timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN
),
},
pos_x=-468.0,
pos_y=234.0,
width=936.0,
)
),
),
TreeNodeBlock(
group=si.Group(
Expand All @@ -176,7 +195,7 @@ def test_read_glyph_range():
right_id=CrdtId(0, 0),
deleted_length=0,
value=CrdtId(0, 11),
)
),
),
SceneGlyphItemBlock(
parent_id=CrdtId(0, 11),
Expand Down Expand Up @@ -206,7 +225,7 @@ def test_read_glyph_range():
],
),
),
)
),
],
)
def test_blocks_roundtrip(block):
@@ -261,32 +280,69 @@ def test_blocks_keep_unknown_data():
7f 010f
"""
buf = BytesIO(HEADER_V6 + bytes.fromhex(data_hex))
reader = TaggedBlockReader(buf)
block = next(read_blocks(buf))
assert isinstance(block, SceneLineItemBlock)
assert block.extra_data == bytes.fromhex("7f 010f")


def test_error_in_block_contained():
# First block will cause a parsing error at `0xaa`. Second block should
# still be parsed.
data_hex = """
06000000 00010103
1f 0219
aa bbcc
05000000 00010100
1f 0219
21 01
"""
buf = BytesIO(HEADER_V6 + bytes.fromhex(data_hex))
blocks = list(read_blocks(buf))
assert blocks == [
UnreadableBlock(
error="Bad tag type 0xA at position 58",
data=bytes.fromhex("1f0219aabbcc"),
info=MainBlockInfo(
offset=51,
size=6,
extra_data=b"",
block_type=3,
min_version=1,
current_version=1,
),
),
MigrationInfoBlock(migration_id=CrdtId(0x02, 0x19), is_device=True),
]

# Check all data is preserved
buf2 = BytesIO()
# Version 3 to be consistent with the input data used here
write_blocks(buf2, blocks, options={"version": "3.0"})

assert buf2.getvalue() == buf.getvalue()


from hypothesis import given, strategies as st


crdt_id_strategy = st.builds(
CrdtId,
st.integers(min_value=0, max_value=2**8-1),
st.integers(min_value=0, max_value=2**64-1),
st.integers(min_value=0, max_value=2**8 - 1),
st.integers(min_value=0, max_value=2**64 - 1),
)
st.register_type_strategy(CrdtId, crdt_id_strategy)

author_ids_block_strategy = st.builds(
AuthorIdsBlock,
st.dictionaries(st.integers(min_value=0, max_value=65535), st.uuids())
st.dictionaries(st.integers(min_value=0, max_value=65535), st.uuids()),
)

block_strategy = st.one_of([
author_ids_block_strategy,
st.builds(MigrationInfoBlock),
])

block_strategy = st.one_of(
[
author_ids_block_strategy,
st.builds(MigrationInfoBlock),
]
)


@given(block_strategy)
