Run zimit from a dedicated Python venv, bump browsertrix-crawler to 0.10.2,
and add the --lang (crawler) / --zim-lang (warc2zim) options.

Review note: in the Dockerfile RUN chain the blocklist step is now joined to
the previous command with `&&`. Without it, `mkdir -p /tmp/ads` was appended
to the preceding `printf ... > /etc/default/google-chrome` as extra arguments
(the comment lines in between are dropped by the Dockerfile parser), so the
directory was never created and the following `cd /tmp/ads` failed.
The post-image blob id on the Dockerfile `index` line was computed before
this fix and may no longer match the patched file.
Indentation inside hunks was reconstructed; if a hunk does not apply cleanly,
retry with whitespace-insensitive matching.

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 57cd1a4..8083581 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -17,4 +17,4 @@ jobs:
         run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep
 
       - name: run integration test suite
-        run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
+        run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b556524..497d0cb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,14 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - `--title` to set ZIM title
 - `--description` to set ZIM description
 - New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
+- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)
 
 ### Changed
 
-- Using browsertrix-crawler 0.10.0-beta.4
+- Using browsertrix-crawler 0.10.2
 - Default and accepted values for `--waitUntil` from crawler's update
 - Using `main` warc2zim ⚠️ change before releasing!
 - Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
 - `--failOnFailedSeed` used inconditionally
+- `--lang` now passed to crawler (ISO-639-1)
 
 ### Removed
 
diff --git a/Dockerfile b/Dockerfile
index a636ccb..b5ae475 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,33 +1,35 @@
- FROM webrecorder/browsertrix-crawler:0.10.0-beta.4
+FROM webrecorder/browsertrix-crawler:0.10.2
 LABEL org.opencontainers.image.source https://github.com/openzim/zimit
 
-RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-# temp (needs warc2zim release on zimit release)
-RUN pip3 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.5.1' && \
-    pip3 uninstall -y warc2zim && \
-    pip3 install 'git+https://github.com/openzim/warc2zim@main#egg_name=warc2zim'
-
-RUN mkdir -p /output
-
-WORKDIR /app
-
-# download list of bad domains to filter-out. intentionnaly ran post-install
-# so it's not cached in earlier layers (url stays same but content updated)
-RUN mkdir -p /tmp/ads && cd /tmp/ads && \
+RUN apt-get update \
+    && apt-get install -qqy --no-install-recommends \
+    libmagic1 \
+    python3.10-venv \
+    && rm -rf /var/lib/apt/lists/* \
+    # python setup (in venv not to conflict with browsertrix) \
+    && python3 -m venv /app/zimit \
+    && /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' 'warc2zim==1.5.2' \
+    # placeholder (default output location) \
+    && mkdir -p /output \
+    # disable chrome upgrade \
+    && printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
+    # download list of bad domains to filter-out. intentionally ran post-install \
+    # so it's not cached in earlier layers (url stays same but content updated) \
+    && mkdir -p /tmp/ads && cd /tmp/ads && \
     curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
     curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
     curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
     cat ./*.txt > /etc/blocklist.txt \
-    && rm ./*.txt
-RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
+    && rm ./*.txt \
+    && printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
     chmod +x /usr/local/bin/entrypoint.sh
 
+WORKDIR /app
 ADD zimit.py /app/
-
-RUN ln -s /app/zimit.py /usr/bin/zimit
-
-RUN printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome
+# fix shebang on zimit to use in-venv python
+RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
+    && ln -s /app/zimit.py /usr/bin/zimit \
+    && chmod +x /usr/bin/zimit
 
 ENTRYPOINT ["entrypoint.sh"]
 CMD ["zimit"]
diff --git a/zimit.py b/zimit.py
index 7fbc6e3..3fd8032 100755
--- a/zimit.py
+++ b/zimit.py
@@ -205,6 +205,18 @@ def zimit(args=None):
         action="store_true",
     )
 
+    parser.add_argument(
+        "--lang",
+        help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
+    )
+
+    parser.add_argument(
+        "--zim-lang",
+        help="Language metadata of ZIM "
+        "(warc2zim --lang param). ISO-639-3 code. "
+        "Retrieved from homepage if found, fallback to `eng`",
+    )
+
     parser.add_argument(
         "--mobileDevice",
         help="Emulate mobile device by name from "
@@ -348,6 +360,10 @@ def zimit(args=None):
         warc2zim_args.append("--description")
         warc2zim_args.append(zimit_args.description)
 
+    if zimit_args.zim_lang:
+        warc2zim_args.append("--lang")
+        warc2zim_args.append(zimit_args.zim_lang)
+
     print("----------")
     print("Testing warc2zim args")
     print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
@@ -482,6 +498,7 @@ def get_node_cmd_line(args):
         "exclude",
         "collection",
         "allowHashUrls",
+        "lang",
         "mobileDevice",
         "userAgent",
         "useSitemap",