Catch errors during parsing -> UnreadableBlock
Improved error recovery -- an error during parsing, or an unknown block
type, now results in an `UnreadableBlock` containing the data that could
not be read, so that parsing of other blocks can continue.
ricklupton committed Sep 16, 2023
1 parent baa4c4e commit 74e359a
Showing 3 changed files with 161 additions and 56 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -8,6 +8,17 @@ To convert rm files to other formats, you can use [rmc](https://github.com/rickl

## Changelog

### Unreleased

New features:
- Improved error recovery. An error during parsing, or an unknown block type,
results in an `UnreadableBlock` containing the data that could not be read, so
that parsing of other blocks can continue.
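
A minimal usage sketch of the new behaviour (the file name is hypothetical; `UnreadableBlock` and its `error`, `data` and `info` fields are defined in `rmscene.scene_stream`):

```python
from rmscene import read_blocks
from rmscene.scene_stream import UnreadableBlock

with open("page.rm", "rb") as f:  # hypothetical input file
    for block in read_blocks(f):
        if isinstance(block, UnreadableBlock):
            # This block could not be parsed; its raw bytes are kept in block.data
            print(f"Unreadable block type {block.info.block_type}: {block.error}")
        else:
            ...  # handle readable blocks as before
```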

Other changes:
- The `value` attribute of scene item blocks, which was not being used, has been
removed.

### v0.4.0

Breaking changes:
66 changes: 52 additions & 14 deletions src/rmscene/scene_stream.py
@@ -17,7 +17,7 @@
from packaging.version import Version

from .tagged_block_common import CrdtId, LwwValue
from .tagged_block_reader import TaggedBlockReader
from .tagged_block_reader import TaggedBlockReader, MainBlockInfo
from .tagged_block_writer import TaggedBlockWriter
from .crdt_sequence import CrdtSequence, CrdtSequenceItem
from .scene_tree import SceneTree
@@ -43,6 +43,16 @@ def version_info(self, writer: TaggedBlockWriter) -> tuple[int, int]:
"""Return (min_version, current_version) to use when writing."""
return (1, 1)

def get_block_type(self) -> int:
"""Return block type for this block.
By default, returns the block's BLOCK_TYPE attribute, but this method
can be overridden if a single block subclass can handle multiple block
types.
"""
return self.BLOCK_TYPE

@classmethod
def lookup(cls, block_type: int) -> tp.Optional[tp.Type[Block]]:
if getattr(cls, "BLOCK_TYPE", None) == block_type:
@@ -52,15 +62,42 @@ def lookup(cls, block_type: int) -> tp.Optional[tp.Type[Block]]:
return match
return None

def write(self, writer: TaggedBlockWriter):
"""Write the block header and content to the stream."""
min_version, current_version = self.version_info(writer)
with writer.write_block(self.get_block_type(), min_version, current_version):
self.to_stream(writer)

@classmethod
@abstractmethod
def from_stream(cls, reader: TaggedBlockReader) -> Block:
"""Read content of block from stream."""
raise NotImplementedError()

@abstractmethod
def to_stream(self, writer: TaggedBlockWriter):
"""Write content of block to stream."""
raise NotImplementedError()


@dataclass
class UnreadableBlock(Block):
"""Represent a block which could not be read for some reason."""

error: str
data: bytes
info: MainBlockInfo

def get_block_type(self) -> int:
return self.info.block_type

@classmethod
def from_stream(cls, reader: TaggedBlockReader) -> Block:
raise NotImplementedError()

def to_stream(self, writer: TaggedBlockWriter):
writer.data.write_bytes(self.data)


@dataclass
class AuthorIdsBlock(Block):
@@ -702,14 +739,21 @@ def _read_blocks(stream: TaggedBlockReader) -> Iterator[Block]:

block_type = Block.lookup(block_info.block_type)
if block_type:
yield block_type.from_stream(stream)
try:
yield block_type.from_stream(stream)
except Exception as e:
_logger.warning("Error reading block: %s", e)
stream.data.data.seek(block_info.offset)
data = stream.data.read_bytes(block_info.size)
yield UnreadableBlock(str(e), data, block_info)
else:
_logger.error(
"Unknown block type %s. Skipping %d bytes.",
block_info.block_type,
block_info.size,
msg = (
f"Unknown block type {block_info.block_type}. "
f"Skipping {block_info.size} bytes."
)
stream.data.read_bytes(block_info.size)
_logger.warning(msg)
data = stream.data.read_bytes(block_info.size)
yield UnreadableBlock(msg, data, block_info)


def read_blocks(data: tp.BinaryIO) -> Iterator[Block]:
@@ -723,12 +767,6 @@ def read_blocks(data: tp.BinaryIO) -> Iterator[Block]:
yield from _read_blocks(stream)


def _write_block(writer: TaggedBlockWriter, block: Block):
min_version, current_version = block.version_info(writer)
with writer.write_block(block.BLOCK_TYPE, min_version, current_version):
block.to_stream(writer)


def write_blocks(
data: tp.BinaryIO, blocks: Iterable[Block], options: tp.Optional[dict] = None
):
@@ -740,7 +778,7 @@ def write_blocks(
stream = TaggedBlockWriter(data, options=options)
stream.write_header()
for block in blocks:
_write_block(stream, block)
block.write(stream)


def build_tree(tree: SceneTree, blocks: Iterable[Block]):
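
Taken together with the refactored write path (`Block.write` now asks `get_block_type()` for the block type, which `UnreadableBlock` overrides to return `info.block_type`), blocks that could not be parsed can still be written back. A minimal round-trip sketch, assuming a local file and the `options={"version": "3.0"}` value used by the new test below (the right version depends on the source file):

```python
from io import BytesIO
from rmscene import read_blocks, write_blocks

with open("page.rm", "rb") as f:  # hypothetical input file
    blocks = list(read_blocks(f))

out = BytesIO()
# UnreadableBlock.to_stream() re-emits the original raw bytes, so blocks that
# failed to parse are preserved in the output.
write_blocks(out, blocks, options={"version": "3.0"})
```
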
140 changes: 98 additions & 42 deletions tests/test_scene_stream.py
@@ -2,24 +2,29 @@
from io import BytesIO
from pathlib import Path
from uuid import UUID
from rmscene import read_blocks, write_blocks, LwwValue, TaggedBlockWriter, TaggedBlockReader
from rmscene import (
read_blocks,
write_blocks,
LwwValue,
TaggedBlockWriter,
TaggedBlockReader,
)
from rmscene.scene_stream import *
from rmscene.tagged_block_common import HEADER_V6
from rmscene.tagged_block_reader import MainBlockInfo
from rmscene.crdt_sequence import CrdtSequenceItem
from rmscene.scene_items import ParagraphStyle

import logging

logger = logging.getLogger(__name__)


DATA_PATH = Path(__file__).parent / "data"


def _hex_lines(b, n=32):
return [
b[i*n:(i+1)*n].hex()
for i in range(len(b) // n + 1)
]
return [b[i * n : (i + 1) * n].hex() for i in range(len(b) // n + 1)]


LINES_V2_FILES = [
@@ -80,22 +85,26 @@ def test_normal_ab():
RootTextBlock(
block_id=CrdtId(0, 0),
value=si.Text(
items=CrdtSequence([
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]),
items=CrdtSequence(
[
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]
),
styles={
CrdtId(0, 0): LwwValue(timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN),
CrdtId(0, 0): LwwValue(
timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN
),
},
pos_x=-468.0,
pos_y=234.0,
width=936.0,
)
),
),
TreeNodeBlock(
group=si.Group(node_id=CrdtId(0, 1)),
Expand All @@ -114,14 +123,16 @@ def test_normal_ab():
right_id=CrdtId(0, 0),
deleted_length=0,
value=CrdtId(0, 11),
)
),
),
]


def test_read_glyph_range():
with open(DATA_PATH / "Wikipedia_highlighted_p1.rm", "rb") as f:
result = [block for block in read_blocks(f) if isinstance(block, SceneGlyphItemBlock)]
result = [
block for block in read_blocks(f) if isinstance(block, SceneGlyphItemBlock)
]

assert result[0].item.value.text == "The reMarkable uses electronic paper"

@@ -130,8 +141,12 @@ def test_read_glyph_range():
"block",
[
AuthorIdsBlock(author_uuids={1: UUID("495ba59f-c943-2b5c-b455-3682f6948906")}),
AuthorIdsBlock(author_uuids={1: UUID("495ba59f-c943-2b5c-b455-3682f6948906"),
2: UUID("cd83324a-917f-11ed-bb7b-3c0754484e34")}),
AuthorIdsBlock(
author_uuids={
1: UUID("495ba59f-c943-2b5c-b455-3682f6948906"),
2: UUID("cd83324a-917f-11ed-bb7b-3c0754484e34"),
}
),
MigrationInfoBlock(migration_id=CrdtId(1, 1), is_device=True),
PageInfoBlock(
loads_count=3, merges_count=2, text_chars_count=3, text_lines_count=1
Expand All @@ -145,22 +160,26 @@ def test_read_glyph_range():
RootTextBlock(
block_id=CrdtId(0, 0),
value=si.Text(
items=CrdtSequence([
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]),
items=CrdtSequence(
[
CrdtSequenceItem(
item_id=CrdtId(1, 16),
left_id=CrdtId(0, 0),
right_id=CrdtId(0, 0),
deleted_length=0,
value="AB",
)
]
),
styles={
CrdtId(0, 0): LwwValue(timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN),
CrdtId(0, 0): LwwValue(
timestamp=CrdtId(1, 15), value=si.ParagraphStyle.PLAIN
),
},
pos_x=-468.0,
pos_y=234.0,
width=936.0,
)
),
),
TreeNodeBlock(
group=si.Group(
Expand All @@ -176,7 +195,7 @@ def test_read_glyph_range():
right_id=CrdtId(0, 0),
deleted_length=0,
value=CrdtId(0, 11),
)
),
),
SceneGlyphItemBlock(
parent_id=CrdtId(0, 11),
Expand Down Expand Up @@ -206,7 +225,7 @@ def test_read_glyph_range():
],
),
),
)
),
],
)
def test_blocks_roundtrip(block):
@@ -261,32 +280,69 @@ def test_blocks_keep_unknown_data():
7f 010f
"""
buf = BytesIO(HEADER_V6 + bytes.fromhex(data_hex))
reader = TaggedBlockReader(buf)
block = next(read_blocks(buf))
assert isinstance(block, SceneLineItemBlock)
assert block.extra_data == bytes.fromhex("7f 010f")


def test_error_in_block_contained():
# First block will cause a parsing error at `0xaa`. Second block should
# still be parsed.
data_hex = """
06000000 00010103
1f 0219
aa bbcc
05000000 00010100
1f 0219
21 01
"""
buf = BytesIO(HEADER_V6 + bytes.fromhex(data_hex))
blocks = list(read_blocks(buf))
assert blocks == [
UnreadableBlock(
error="Bad tag type 0xA at position 58",
data=bytes.fromhex("1f0219aabbcc"),
info=MainBlockInfo(
offset=51,
size=6,
extra_data=b"",
block_type=3,
min_version=1,
current_version=1,
),
),
MigrationInfoBlock(migration_id=CrdtId(0x02, 0x19), is_device=True),
]

# Check all data is preserved
buf2 = BytesIO()
# Version 3 to be consistent with the input data used here
write_blocks(buf2, blocks, options={"version": "3.0"})

assert buf2.getvalue() == buf.getvalue()


from hypothesis import given, strategies as st


crdt_id_strategy = st.builds(
CrdtId,
st.integers(min_value=0, max_value=2**8-1),
st.integers(min_value=0, max_value=2**64-1),
st.integers(min_value=0, max_value=2**8 - 1),
st.integers(min_value=0, max_value=2**64 - 1),
)
st.register_type_strategy(CrdtId, crdt_id_strategy)

author_ids_block_strategy = st.builds(
AuthorIdsBlock,
st.dictionaries(st.integers(min_value=0, max_value=65535), st.uuids())
st.dictionaries(st.integers(min_value=0, max_value=65535), st.uuids()),
)

block_strategy = st.one_of([
author_ids_block_strategy,
st.builds(MigrationInfoBlock),
])

block_strategy = st.one_of(
[
author_ids_block_strategy,
st.builds(MigrationInfoBlock),
]
)


@given(block_strategy)
