Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop disallowed control characters and strip blank characters #179

Merged
merged 1 commit into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- Add utility function to compute ZIM Tags #164, including deduplication #156
- Metadata now automatically drops disallowed control characters and strips leading/trailing blank characters #159

### Fixed

Expand Down
16 changes: 16 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import libzim.writer # pyright: ignore
import PIL.Image
import regex

from zimscraperlib import logger
from zimscraperlib.constants import (
Expand Down Expand Up @@ -65,6 +66,9 @@
re.MULTILINE | re.DOTALL,
)

# All characters in Unicode general category C ("Other": control, format,
# surrogate, private-use and unassigned) are disallowed in str metadata,
# except \n, \r and \t
UNWANTED_CONTROL_CHARACTERS_REGEX = regex.compile(r"(?![\n\t\r])\p{C}")


def mimetype_for(
path: str,
Expand Down Expand Up @@ -250,6 +254,11 @@ def add_metadata(
content: str | bytes | datetime.date | datetime.datetime | Iterable[str],
mimetype: str = "text/plain;charset=UTF-8",
):
# drop control characters before passing them to libzim
if isinstance(content, str):
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
content = UNWANTED_CONTROL_CHARACTERS_REGEX.sub("", content).strip(
" \r\n\t"
)
if not self.disable_metadata_checks:
self.validate_metadata(name, content)
if name == "Date" and isinstance(content, (datetime.date, datetime.datetime)):
Expand Down Expand Up @@ -304,6 +313,13 @@ def config_metadata(
}
)
self._metadata.update(extras)
for metadata_key, metadata_value in self._metadata.items():
# drop control characters so that proper value is stored in memory and
# logged in DEBUG mode ; also strip blank characters
if isinstance(metadata_value, str):
self._metadata[metadata_key] = UNWANTED_CONTROL_CHARACTERS_REGEX.sub(
"", metadata_value
).strip(" \r\n\t")
return self

def config_dev_metadata(self, **extras: str):
Expand Down
51 changes: 51 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,57 @@ def test_config_metadata(tmp_path, png_image, tags):
assert reader.get_text_metadata("TestMetadata") == "Test Metadata"


def test_config_metadata_control_characters(tmp_path):
    """Check that str metadata is cleaned of control characters and stripped.

    The same raw values are supplied twice: once through
    ``config_dev_metadata`` and once through ``add_metadata`` (under a ``_1``
    suffixed name). In both cases the stored value must have disallowed
    control characters removed and surrounding blanks stripped, while
    newlines, carriage returns and tabs inside the text are kept.
    """
    raw_values = {
        "Description": "\t\n\r\n \tA description \awith \bcontrol characters\v",
        "LongDescription": (
            "A description \rwith \a\ncontrol characters\tsss\t\n\r\n \t"
        ),
        "Creator": " A creator ",
    }
    expected_values = {
        "Description": "A description with control characters",
        "LongDescription": "A description \rwith \ncontrol characters\tsss",
        "Creator": "A creator",
    }

    zim_path = tmp_path / "test_config.zim"
    creator = Creator(zim_path, "").config_dev_metadata(**raw_values)

    # values held in memory must already be cleaned up
    for key, expected in expected_values.items():
        assert creator._metadata[key] == expected

    # values added directly must be cleaned up as well
    with creator:
        for key, raw in raw_values.items():
            creator.add_metadata(f"{key}_1", raw)

    assert zim_path.exists()

    # read everything back from the ZIM that was written
    reader = Archive(zim_path)
    for key, expected in expected_values.items():
        assert reader.get_text_metadata(key) == expected
    for key, expected in expected_values.items():
        assert reader.get_text_metadata(f"{key}_1") == expected


@pytest.mark.parametrize(
"name,value,valid",
[
Expand Down
Loading