Skip to content

Commit

Permalink
Automatically redirect to articles with same checksum
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Apr 21, 2022
1 parent fe0b5fa commit a3ad5c5
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 0 deletions.
47 changes: 47 additions & 0 deletions src/zimscraperlib/zim/creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
- can be used to store a filepath and content read from it (not stored) """

import datetime
import hashlib
import pathlib
import re
import weakref
from typing import Any, Callable, Dict, Optional, Tuple, Union

Expand Down Expand Up @@ -104,6 +106,10 @@ def __init__(

self.workaround_nocancel = workaround_nocancel

self.autodedup_filters = []

self.dedup_items = dict()

def start(self):
super().__enter__()

Expand All @@ -119,6 +125,35 @@ def update_metadata(self, **kwargs):
for name, value in kwargs.items():
self.add_metadata(name, value)

def add_autodedup_filter(self, filter_regex: str):
    """Register a path pattern whose entries are subject to deduplication.

    The pattern is compiled once here and appended to
    ``self.autodedup_filters``; ``check_for_duplicate`` later tests entry
    paths against every registered pattern with ``Pattern.match`` (i.e.
    anchored at the start of the path).

    Args:
        filter_regex: regular expression source, e.g. ``r"^images/.*$"``.
    """
    compiled_filter = re.compile(filter_regex)
    self.autodedup_filters.append(compiled_filter)

def check_for_duplicate(
    self,
    path: str,
    fpath: Optional[pathlib.Path] = None,
    content: Optional[bytes] = None,
):
    """Look up (and record) an entry by the SHA-256 digest of its payload.

    Only paths matching at least one pattern in ``self.autodedup_filters``
    are considered. For those, the payload is hashed and the digest is
    looked up in ``self.dedup_items``.

    Args:
        path: path of the entry about to be added.
        fpath: file to read the payload from; used when ``content`` is None.
        content: in-memory payload. Takes precedence over ``fpath`` whenever
            it is not None, including when it is empty.

    Returns:
        The path first recorded for the same digest when this payload has
        already been seen, otherwise ``None``. ``None`` is returned both when
        ``path`` matches no filter (nothing is recorded) and when the digest
        is new (it is then recorded under ``path``).

    Raises:
        ValueError: ``path`` matches a filter but neither ``fpath`` nor
            ``content`` was supplied.
    """
    if not any(regex.match(path) for regex in self.autodedup_filters):
        return None

    if content is not None:
        # Compare against None rather than truthiness: an empty payload is
        # still a payload and must not fall through to opening ``fpath``.
        if isinstance(content, str):
            # hashlib only accepts bytes-like input
            content = content.encode("utf-8")
        digest = hashlib.sha256(content).digest()
    elif fpath is not None:
        hasher = hashlib.sha256()
        with open(fpath, "rb") as fh:
            # stream in 64 KiB chunks so large files are never fully in memory
            for chunk in iter(lambda: fh.read(65536), b""):
                hasher.update(chunk)
        digest = hasher.digest()
    else:
        raise ValueError("One of fpath or content is required")

    known_path = self.dedup_items.get(digest)
    if known_path is not None:
        return known_path
    self.dedup_items[digest] = path
    return None

def add_item_for(
self,
path: str,
Expand Down Expand Up @@ -151,6 +186,18 @@ def add_item_for(
if fpath is None and content is None:
raise ValueError("One of fpath or content is required")

duplicate_path = self.check_for_duplicate(
path=path, fpath=fpath, content=content
)
if duplicate_path:
self.add_redirect(
path=path,
target_path=duplicate_path,
title=title,
is_front=is_front,
)
return path

mimetype = mimetype_for(
path=path, content=content, fpath=fpath, mimetype=mimetype
)
Expand Down
35 changes: 35 additions & 0 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,41 @@ def test_noindexlanguage(tmp_path):
assert not reader.has_fulltext_index


def test_duplicatefiles(tmp_path, png_image, html_file):
    """Entries with identical payloads under a dedup filter become redirects.

    The same PNG is added five times. Only paths matching the registered
    ``^images/.*$`` filter take part in deduplication: the first such entry
    is stored as-is, later ones (whether supplied by file path or by raw
    content) must be redirects. Paths outside the filter are always stored
    as regular items.
    """
    zim_path = tmp_path / "test.zim"

    with open(png_image, "rb") as png_fh:
        png_bytes = png_fh.read()

    with Creator(zim_path, "welcome", "") as creator:
        creator.add_autodedup_filter(r"^images/.*$")
        # outside the filter: stored as-is
        creator.add_item_for("other_folder1/yahoo0.png", "Image1", fpath=png_image)
        # first entry matching the filter: stored as-is
        creator.add_item_for("images/yahoo1.png", "Image1", fpath=png_image)
        # second entry matching the filter: replaced by a redirect
        creator.add_item_for("images/yahoo2.png", "Image2", fpath=png_image)
        # outside the filter again: stored as-is
        creator.add_item_for("other_folder2/yahoo3.png", "Image1", fpath=png_image)
        # matching the filter, passed as content instead of fpath:
        # replaced by a redirect
        creator.add_item_for("images/yahoo4.png", "Image3", content=png_bytes)

    archive = Archive(zim_path)
    # (entry path, whether the entry is expected to be a redirect)
    expectations = (
        ("images/yahoo1.png", False),
        ("images/yahoo2.png", True),
        ("images/yahoo4.png", True),
        ("other_folder1/yahoo0.png", False),
        ("other_folder2/yahoo3.png", False),
    )
    for entry_path, expect_redirect in expectations:
        # every path must resolve to an item, directly or through a redirect
        assert archive.get_item(entry_path)
        entry = archive.get_entry_by_path(entry_path)
        assert bool(entry.is_redirect) == expect_redirect


def test_add_item_for(tmp_path):
fpath = tmp_path / "test.zim"
# test without mimetype
Expand Down

0 comments on commit a3ad5c5

Please sign in to comment.