Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically redirect to articles with same checksum #86

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
- can be used to store a filepath and content read from it (not stored) """

import datetime
import hashlib
import pathlib
import re
import weakref
from typing import Any, Callable, Dict, Optional, Tuple, Union

Expand Down Expand Up @@ -104,6 +106,10 @@ def __init__(

self.workaround_nocancel = workaround_nocancel

self.autodedup_filters = []

self.dedup_items = dict()

def start(self):
super().__enter__()

Expand All @@ -119,6 +125,35 @@ def update_metadata(self, **kwargs):
for name, value in kwargs.items():
self.add_metadata(name, value)

def add_autodedup_filter(self, filter_regex: str) -> None:
    """Register a path pattern whose matching entries are deduplicated.

    The pattern is compiled once here and appended to
    ``self.autodedup_filters``; ``check_for_duplicate`` later tests entry
    paths against it with ``match`` (i.e. anchored at the path start).

    Args:
        filter_regex: regular expression source to compile and register.
    """
    compiled_filter = re.compile(filter_regex)
    self.autodedup_filters.append(compiled_filter)

def check_for_duplicate(
    self,
    path: str,
    fpath: Optional[pathlib.Path] = None,
    content: Optional[bytes] = None,
) -> Optional[str]:
    """Return the path of a previously seen entry with identical payload.

    Only paths matching at least one registered auto-dedup filter are
    considered. For those, the SHA-256 digest of the payload is looked up
    in ``self.dedup_items``: on a hit the path recorded for that digest is
    returned; on a miss the digest is recorded for ``path`` and ``None`` is
    returned.

    Args:
        path: entry path, tested against ``self.autodedup_filters``.
        fpath: file to hash when ``content`` is not provided.
        content: in-memory payload to hash; used in preference to ``fpath``
            whenever it is not ``None`` (including when it is empty).

    Returns:
        The path first registered with the same digest, or ``None`` when
        ``path`` is not subject to deduplication or its payload is new.

    Raises:
        ValueError: ``path`` matches a filter but neither ``fpath`` nor
            ``content`` was supplied.
    """
    # Paths outside every filter are never hashed nor recorded.
    if not any(pattern.match(path) for pattern in self.autodedup_filters):
        return None

    # Compare against None explicitly: empty bytes is a legitimate payload
    # and must not fall through to opening `fpath` (which may be None).
    if content is not None:
        digest = hashlib.sha256(content).digest()
    elif fpath is not None:
        hasher = hashlib.sha256()
        with open(fpath, "rb") as fh:
            # Hash in 64 KiB chunks so large files are not loaded at once.
            for chunk in iter(lambda: fh.read(65536), b""):
                hasher.update(chunk)
        digest = hasher.digest()
    else:
        raise ValueError("One of fpath or content is required")

    if digest in self.dedup_items:
        return self.dedup_items[digest]

    # First occurrence of this payload: remember where it lives.
    self.dedup_items[digest] = path
    return None

def add_item_for(
self,
path: str,
Expand Down Expand Up @@ -151,6 +186,18 @@ def add_item_for(
if fpath is None and content is None:
raise ValueError("One of fpath or content is required")

duplicate_path = self.check_for_duplicate(
path=path, fpath=fpath, content=content
)
if duplicate_path:
self.add_redirect(
path=path,
target_path=duplicate_path,
title=title,
is_front=is_front,
)
return path

mimetype = mimetype_for(
path=path, content=content, fpath=fpath, mimetype=mimetype
)
Expand Down
35 changes: 35 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,41 @@ def test_noindexlanguage(tmp_path):
assert not reader.has_fulltext_index


def test_duplicatefiles(tmp_path, png_image, html_file):
    """Check that auto-dedup turns repeated matching payloads into redirects.

    The same PNG is added five times under different paths; only entries
    whose path matches the registered filter and whose payload was already
    seen under a matching path are expected to become redirects.
    """
    zim_path = tmp_path / "test.zim"

    with open(png_image, "rb") as src:
        raw_png = src.read()

    # Entries added from the file on disk, in insertion order:
    #  - yahoo0: outside the filter => stored as-is
    #  - yahoo1: first entry inside the filter => stored as-is
    #  - yahoo2: second entry inside the filter => becomes a redirect
    #  - yahoo3: outside the filter => stored as-is
    file_backed_entries = (
        ("other_folder1/yahoo0.png", "Image1"),
        ("images/yahoo1.png", "Image1"),
        ("images/yahoo2.png", "Image2"),
        ("other_folder2/yahoo3.png", "Image1"),
    )

    with Creator(zim_path, "welcome", "") as creator:
        creator.add_autodedup_filter(r"^images/.*$")
        for entry_path, entry_title in file_backed_entries:
            creator.add_item_for(entry_path, entry_title, fpath=png_image)
        # same payload inside the filter, passed as bytes instead of a file
        # => becomes a redirect as well
        creator.add_item_for("images/yahoo4.png", "Image3", content=raw_png)

    reader = Archive(zim_path)

    # (entry path, whether the entry is expected to be a redirect)
    expectations = (
        ("images/yahoo1.png", False),
        ("images/yahoo2.png", True),
        ("images/yahoo4.png", True),
        ("other_folder1/yahoo0.png", False),
        ("other_folder2/yahoo3.png", False),
    )
    for entry_path, expect_redirect in expectations:
        # every path must resolve to an item, directly or via its redirect
        assert reader.get_item(entry_path)
        assert bool(reader.get_entry_by_path(entry_path).is_redirect) == expect_redirect


def test_add_item_for(tmp_path):
fpath = tmp_path / "test.zim"
# test without mimetype
Expand Down