Skip to content

Commit

Permalink
Using pylibzim v2.1.0 (using libzim 8.1.0)
Browse files Browse the repository at this point in the history
  • Loading branch information
rgaudin committed Dec 6, 2022
1 parent 00d7fb0 commit efaccfe
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 58 deletions.
17 changes: 16 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

- n/a
- Using pylibzim v2.1.0 (using libzim 8.1.0)

### Added

- [libzim] `Entry.get_redirect_entry()`
- [libzim] `Item.get_indexdata()` to implement custom IndexData per entry (writer)
- [libzim] `Archive.media_count`

### Changed

- [libzim] `Archive.article_count` updated to match scraperlib's version
- `Archive.article_counter` is now deprecated and returns `Archive.article_count`
- `Archive.media_counter` is now deprecated and returns `Archive.media_count`

### Removed

- [libzim] `lzma` compression algorithm

## [1.8.0] - 2022-08-05

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ colorthief==0.2.1
python-resize-image>=1.1.19,<1.2
Babel>=2.9,<3.0
file-magic>=0.4.0,<0.5
libzim>=1.1.0,<1.2
libzim>=2.1.0,<3.0
beautifulsoup4>=4.9.3,<4.10
lxml>=4.6.3,<4.10
optimize-images>=1.3.6,<1.6
Expand Down
24 changes: 0 additions & 24 deletions src/zimscraperlib/zim/_libkiwix.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,30 +84,6 @@ def parseMimetypeCounter(
return counters


def getArticleCount(counterMap: CounterMap) -> int:
    """Get the count of articles which can be indexed/displayed

    Adds up the count of every entry of `counterMap` whose mimetype starts
    with `text/html`. Returns 0 when no mimetype matches (including when
    `counterMap` is empty)."""
    return sum(
        count
        for mimetype, count in counterMap.items()
        if mimetype.startswith("text/html")
    )


def getMediaCount(counterMap: CounterMap) -> int:
    """Get the count of medias content in the ZIM file

    Adds up the count of every entry of `counterMap` whose mimetype starts
    with `image/`, `video/` or `audio/`."""
    media_prefixes = ("image/", "video/", "audio/")
    return sum(
        nb_entries
        for mime, nb_entries in counterMap.items()
        if mime.startswith(media_prefixes)
    )


def convertTags(tags_str: str) -> List[str]:
"""List of tags expanded with libkiwix's additional hints for pic/vid/det/index"""
tags = tags_str.split(";")
Expand Down
32 changes: 13 additions & 19 deletions src/zimscraperlib/zim/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
- direct access to search results and number of results
- public Entry access by Id"""

import warnings
from typing import Dict, Iterable, List, Optional

import libzim.reader
import libzim.search # Query, Searcher
import libzim.suggestion # SuggestionSearcher

from ._libkiwix import convertTags, getArticleCount, getMediaCount, parseMimetypeCounter
from ._libkiwix import convertTags, parseMimetypeCounter
from .items import Item


Expand Down Expand Up @@ -109,24 +110,17 @@ def counters(self) -> Dict[str, int]:

@property
def article_counter(self) -> int:
    """Deprecated alias of `Archive.article_count`

    Emits a DeprecationWarning, then returns `self.article_count`."""
    warnings.warn(
        "Archive.article_counter now deprecated. "
        "Use Archive.article_count instead",
        DeprecationWarning,
        # attribute the warning to the code reading the property
        # rather than to this line
        stacklevel=2,
    )
    return self.article_count

@property
def media_counter(self) -> int:
    """Deprecated alias of `Archive.media_count`

    Emits a DeprecationWarning, then returns `self.media_count`."""
    warnings.warn(
        "Archive.media_counter now deprecated. Use Archive.media_count instead",
        DeprecationWarning,
        # attribute the warning to the code reading the property
        # rather than to this line
        stacklevel=2,
    )
    return self.media_count
12 changes: 1 addition & 11 deletions tests/zim/test_libkiwix.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest

from zimscraperlib.zim._libkiwix import getArticleCount, getline, getMediaCount
from zimscraperlib.zim._libkiwix import getline
from zimscraperlib.zim._libkiwix import parseMimetypeCounter as parse

empty = {}
Expand Down Expand Up @@ -81,13 +81,3 @@ def test_getline():
def test_counter_parsing(counterStr, counterMap):
    """Parsing `counterStr` must produce exactly `counterMap`"""
    # reference: https://github.com/kiwix/libkiwix/blob/master/test/counterParsing.cpp
    parsed = parse(counterStr)
    assert parsed == counterMap


def test_article_count(counters):
    """getArticleCount gives 0 for an empty map and 6339 for `counters`"""
    expectations = (({}, 0), (counters, 6339))
    for counter_map, expected in expectations:
        assert getArticleCount(counter_map) == expected


def test_media_count(counters):
    """getMediaCount gives 0 for an empty map and 12862 for `counters`"""
    expectations = (({}, 0), (counters, 12862))
    for counter_map, expected in expectations:
        assert getMediaCount(counter_map) == expected
4 changes: 2 additions & 2 deletions tests/zim/test_zim_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,10 @@ def remove_source(item):

def test_compression(tmp_path):
    """Creator accepts `compression` as a string and as a `Compression` attribute"""
    fpath = tmp_path / "test.zim"

    # compression passed by name
    with Creator(fpath, "welcome", "", compression="zstd") as creator:
        creator.add_item(StaticItem(path="welcome", content="hello"))

    # compression passed as a `Compression` attribute
    with Creator(fpath, "welcome", "", compression=Compression.zstd) as creator:
        creator.add_item(StaticItem(path="welcome", content="hello"))


Expand Down

0 comments on commit efaccfe

Please sign in to comment.