From 2c894a972cf06f8a4384d2ce1d02c13917e10f7c Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 10 Jul 2024 14:31:01 +0000 Subject: [PATCH] Drop disallowed control characters and strip blank characters --- CHANGELOG.md | 1 + src/zimscraperlib/zim/creator.py | 16 ++++++++++ tests/zim/test_zim_creator.py | 51 ++++++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e770241d..182f53c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Add utility function to compute ZIM Tags #164, including deduplication #156 +- Metadata now automatically drops disallowed control characters and strips blank characters #159 ### Fixed diff --git a/src/zimscraperlib/zim/creator.py b/src/zimscraperlib/zim/creator.py index 3931d8f0..9198980a 100644 --- a/src/zimscraperlib/zim/creator.py +++ b/src/zimscraperlib/zim/creator.py @@ -30,6 +30,7 @@ import libzim.writer # pyright: ignore import PIL.Image +import regex from zimscraperlib import logger from zimscraperlib.constants import ( @@ -65,6 +66,9 @@ re.MULTILINE | re.DOTALL, ) +# All control characters are disallowed in str metadata except \n, \r and \t +UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}") + def mimetype_for( path: str, @@ -250,6 +254,11 @@ def add_metadata( content: str | bytes | datetime.date | datetime.datetime | Iterable[str], mimetype: str = "text/plain;charset=UTF-8", ): + # drop control characters before passing them to libzim + if isinstance(content, str): + content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip( + " \r\n\t" + ) if not self.disable_metadata_checks: self.validate_metadata(name, content) if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)): @@ -304,6 +313,13 @@ def config_metadata( } ) self._metadata.update(extras) + for metadata_key, metadata_value in self._metadata.items(): + # drop control characters so that proper 
value is stored in memory and + # logged in DEBUG mode ; also strip blank characters + if isinstance(metadata_value, str): + self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub( + "", metadata_value + ).strip(" \r\n\t") return self def config_dev_metadata(self, **extras: str): diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index e2a5a685..b3ae0e7c 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -724,6 +724,57 @@ def test_config_metadata(tmp_path, png_image, tags): assert reader.get_text_metadata("TestMetadata") == "Test Metadata" +def test_config_metadata_control_characters(tmp_path): + fpath = tmp_path / "test_config.zim" + creator = Creator(fpath, "").config_dev_metadata( + Description="\t\n\r\n \tA description \awith \bcontrol characters\v", + LongDescription="A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t", + Creator=" A creator ", + ) + assert creator._metadata["Description"] == "A description with control characters" + assert ( + creator._metadata["LongDescription"] + == "A description \rwith \ncontrol characters\tsss" + ) + assert creator._metadata["Creator"] == "A creator" + with creator: + creator.add_metadata( + "Description_1", + "\t\n\r\n \tA description \awith \bcontrol characters\v", + ) + creator.add_metadata( + "LongDescription_1", + "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t", + ) + creator.add_metadata( + "Creator_1", + " A creator ", + ) + pass + + assert fpath.exists() + + reader = Archive(fpath) + assert ( + reader.get_text_metadata("Description") + == "A description with control characters" + ) + assert ( + reader.get_text_metadata("LongDescription") + == "A description \rwith \ncontrol characters\tsss" + ) + assert reader.get_text_metadata("Creator") == "A creator" + assert ( + reader.get_text_metadata("Description_1") + == "A description with control characters" + ) + assert ( + reader.get_text_metadata("LongDescription_1") + == "A 
description \rwith \ncontrol characters\tsss" + ) + assert reader.get_text_metadata("Creator_1") == "A creator" + + @pytest.mark.parametrize( "name,value,valid", [