From efaccfed328bb11436a4c636eb501f82d8903c8a Mon Sep 17 00:00:00 2001 From: renaud gaudin Date: Tue, 6 Dec 2022 15:01:25 +0000 Subject: [PATCH] Using pylibzim v2.1.0 (using libzim 8.1.0) --- CHANGELOG.md | 17 +++++++++++++++- requirements.txt | 2 +- src/zimscraperlib/zim/_libkiwix.py | 24 ---------------------- src/zimscraperlib/zim/archive.py | 32 ++++++++++++------------------ tests/zim/test_libkiwix.py | 12 +---------- tests/zim/test_zim_creator.py | 4 ++-- 6 files changed, 33 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ce1710a7..9f038c6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -- n/a +- Using pylibzim v2.1.0 (using libzim 8.1.0) + +### Added + +- [libzim] `Entry.get_redirect_entry()` +- [libzim] `Item.get_indexdata()` to implement custom IndexData per entry (writer) +- [libzim] `Archive.media_count` + +### Changed +- [libzim] `Archive.article_count` updated to match scraperlib's version +- `Archive.article_counter` now deprecated. Now returns `Archive.article_count` +- `Archive.media_counter` now deprecated. Now returns `Archive.media_count` + +### Removed + +- [libzim] `lzma` compression algorithm ## [1.8.0] - 2022-08-05 diff --git a/requirements.txt b/requirements.txt index 5e3c704f..05691024 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ colorthief==0.2.1 python-resize-image>=1.1.19,<1.2 Babel>=2.9,<3.0 file-magic>=0.4.0,<0.5 -libzim>=1.1.0,<1.2 +libzim>=2.1.0,<3.0 beautifulsoup4>=4.9.3,<4.10 lxml>=4.6.3,<4.10 optimize-images>=1.3.6,<1.6 diff --git a/src/zimscraperlib/zim/_libkiwix.py b/src/zimscraperlib/zim/_libkiwix.py index 2971f209..d71db702 100644 --- a/src/zimscraperlib/zim/_libkiwix.py +++ b/src/zimscraperlib/zim/_libkiwix.py @@ -84,30 +84,6 @@ def parseMimetypeCounter( return counters -def getArticleCount(counterMap: CounterMap): - """Get the count of articles which can be indexed/displayed""" - counter = 0 - for mimetype, count in counterMap.items(): - if mimetype.startswith("text/html"): - counter += count - - return counter - - -def getMediaCount(counterMap: CounterMap) -> int: - """Get the count of medias content in the ZIM file""" - counter = 0 - for mimetype, count in counterMap.items(): - if ( - mimetype.startswith("image/") - or mimetype.startswith("video/") - or mimetype.startswith("audio/") - ): - counter += count - - return counter - - def convertTags(tags_str: str) -> List[str]: """List of tags expanded with libkiwix's additional hints for pic/vid/det/index""" tags = tags_str.split(";") diff --git a/src/zimscraperlib/zim/archive.py b/src/zimscraperlib/zim/archive.py index 5444c37a..87da077c 100644 --- a/src/zimscraperlib/zim/archive.py +++ b/src/zimscraperlib/zim/archive.py @@ -10,13 +10,14 @@ - direct access to search results and number of results - public Entry access by Id""" +import warnings from typing import Dict, Iterable, List, Optional import libzim.reader import libzim.search # Query, Searcher import libzim.suggestion # SuggestionSearcher -from ._libkiwix import convertTags, getArticleCount, getMediaCount, parseMimetypeCounter +from ._libkiwix import convertTags, parseMimetypeCounter from .items import Item @@ -109,24 +110,17 @@ def counters(self) -> Dict[str, int]: @property def article_counter(self) -> int: - """Nb of *articles* in the ZIM, using counters (from libkiwix)""" - - # [libkiwix HACK] - # getArticleCount() returns different things depending on - # the "version" of the zim. - # On old zim (<=6), it returns the number of entry in `A` namespace - # On recent zim (>=7), it returns: - # - the number of entry in `C` namespace (==getEntryCount) - # if no frontArticleIndex is present - # - the number of front article if a frontArticleIndex is present - # The use case >=7 without frontArticleIndex is pretty rare so we don't care - # We can detect if we are reading a zim <= 6 - # by checking if we have a newNamespaceScheme. - if self.has_new_namespace_scheme: - return self.article_count - return getArticleCount(self.counters) + warnings.warn( + "Archive.article_counter now deprecated. " + "Use Archive.article_count instead", + DeprecationWarning, + ) + return self.article_count @property def media_counter(self) -> int: - """Nb of *medias* in the ZIM, using counters (from libkiwix)""" - return getMediaCount(self.counters) + warnings.warn( + "Archive.media_counter now deprecated. " "Use Archive.media_count instead", + DeprecationWarning, + ) + return self.media_count diff --git a/tests/zim/test_libkiwix.py b/tests/zim/test_libkiwix.py index c81ed0c2..aa3c29a7 100644 --- a/tests/zim/test_libkiwix.py +++ b/tests/zim/test_libkiwix.py @@ -6,7 +6,7 @@ import pytest -from zimscraperlib.zim._libkiwix import getArticleCount, getline, getMediaCount +from zimscraperlib.zim._libkiwix import getline from zimscraperlib.zim._libkiwix import parseMimetypeCounter as parse empty = {} @@ -81,13 +81,3 @@ def test_getline(): def test_counter_parsing(counterStr, counterMap): # https://github.com/kiwix/libkiwix/blob/master/test/counterParsing.cpp assert parse(counterStr) == counterMap - - -def test_article_count(counters): - assert getArticleCount({}) == 0 - assert getArticleCount(counters) == 6339 - - -def test_media_count(counters): - assert getMediaCount({}) == 0 - assert getMediaCount(counters) == 12862 diff --git a/tests/zim/test_zim_creator.py b/tests/zim/test_zim_creator.py index 496febd4..001e8efc 100644 --- a/tests/zim/test_zim_creator.py +++ b/tests/zim/test_zim_creator.py @@ -179,10 +179,10 @@ def remove_source(item): def test_compression(tmp_path): fpath = tmp_path / "test.zim" - with Creator(tmp_path / "test.zim", "welcome", "", compression="lzma") as creator: + with Creator(tmp_path / "test.zim", "welcome", "", compression="zstd") as creator: creator.add_item(StaticItem(path="welcome", content="hello")) - with Creator(fpath, "welcome", "", compression=Compression.lzma) as creator: + with Creator(fpath, "welcome", "", compression=Compression.zstd) as creator: creator.add_item(StaticItem(path="welcome", content="hello"))