Skip to content

Commit

Permalink
Merge pull request #201 from openzim/lang
Browse files Browse the repository at this point in the history
crawler 0.10.2
  • Loading branch information
rgaudin authored Aug 2, 2023
2 parents 47ede96 + 722306d commit 7cb118e
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ jobs:
run: docker run -v $PWD/output:/output zimit zimit --url http://isago.rskg.org/ --name isago --zim-file isago.zim --adminEmail test@example.com --mobileDevice "Pixel 5" --statsFilename /output/stats.json --keep

- name: run integration test suite
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "pip install pytest; pytest -v ./integration.py"
run: docker run -v $PWD/test/integration.py:/app/integration.py -v $PWD/output:/output zimit bash -c "/app/zimit/bin/pip install pytest; /app/zimit/bin/pytest -v ./integration.py"
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `--title` to set ZIM title
- `--description` to set ZIM description
- New crawler options: `--maxPageLimit`, `--delay`, `--diskUtilization`
- `--zim-lang` param to set warc2zim's `--lang` (ISO-639-3)

### Changed

- Using browsertrix-crawler 0.10.0-beta.4
- Using browsertrix-crawler 0.10.2
- Default and accepted values for `--waitUntil` from crawler's update
- Using `main` warc2zim ⚠️ change before releasing!
- Disabled Chrome updates to prevent incidental inclusion of update data in WARC/ZIM (#172)
- `--failOnFailedSeed` used unconditionally
- `--lang` now passed to crawler (ISO-639-1)

### Removed

Expand Down
44 changes: 23 additions & 21 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,33 +1,35 @@
FROM webrecorder/browsertrix-crawler:0.10.0-beta.4
FROM webrecorder/browsertrix-crawler:0.10.2
LABEL org.opencontainers.image.source https://github.com/openzim/zimit

RUN apt-get update && apt-get install -qqy --no-install-recommends libmagic1 && apt-get clean && rm -rf /var/lib/apt/lists/*

# temp (needs warc2zim release on zimit release)
RUN pip3 install --no-cache-dir 'requests>=2.24.0' 'inotify==0.2.10' 'tld>=0.12,<0.13' 'warc2zim==1.5.1' && \
pip3 uninstall -y warc2zim && \
pip3 install 'git+https://github.com/openzim/warc2zim@main#egg_name=warc2zim'

RUN mkdir -p /output

WORKDIR /app

# download list of bad domains to filter-out. intentionally run post-install
# so it's not cached in earlier layers (url stays same but content updated)
RUN mkdir -p /tmp/ads && cd /tmp/ads && \
RUN apt-get update \
&& apt-get install -qqy --no-install-recommends \
libmagic1 \
python3.10-venv \
&& rm -rf /var/lib/apt/lists/* \
# python setup (in venv not to conflict with browsertrix)
&& python3 -m venv /app/zimit \
&& /app/zimit/bin/python -m pip install --no-cache-dir 'requests==2.31.0' 'inotify==0.2.10' 'tld==0.13' 'warc2zim==1.5.2' \
# placeholder (default output location)
&& mkdir -p /output \
# disable chrome upgrade
&& printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome \
# download list of bad domains to filter-out. intentionally run post-install \
# so it's not cached in earlier layers (url stays same but content updated) \
mkdir -p /tmp/ads && cd /tmp/ads && \
curl -L -O https://hosts.anudeep.me/mirror/adservers.txt && \
curl -L -O https://hosts.anudeep.me/mirror/CoinMiner.txt && \
curl -L -O https://hosts.anudeep.me/mirror/facebook.txt && \
cat ./*.txt > /etc/blocklist.txt \
&& rm ./*.txt
RUN printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
&& rm ./*.txt \
&& printf '#!/bin/sh\ncat /etc/blocklist.txt >> /etc/hosts\nexec "$@"' > /usr/local/bin/entrypoint.sh && \
chmod +x /usr/local/bin/entrypoint.sh

WORKDIR /app
ADD zimit.py /app/

RUN ln -s /app/zimit.py /usr/bin/zimit

RUN printf "repo_add_once=\"false\"\nrepo_reenable_on_distupgrade=\"false\"\n" > /etc/default/google-chrome
# fix shebang on zimit to use in-venv python
RUN sed -i.bak "1 s/.*/#!\/app\/zimit\/bin\/python3/" /app/zimit.py \
&& ln -s /app/zimit.py /usr/bin/zimit \
&& chmod +x /usr/bin/zimit

ENTRYPOINT ["entrypoint.sh"]
CMD ["zimit"]
17 changes: 17 additions & 0 deletions zimit.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,18 @@ def zimit(args=None):
action="store_true",
)

parser.add_argument(
"--lang",
help="if set, sets the language used by the browser, should be ISO 639 language[-country] code",
)

parser.add_argument(
"--zim-lang",
help="Language metadata of ZIM "
"(warc2zim --lang param). ISO-639-3 code. "
"Retrieved from homepage if found, fallback to `eng`",
)

parser.add_argument(
"--mobileDevice",
help="Emulate mobile device by name from "
Expand Down Expand Up @@ -348,6 +360,10 @@ def zimit(args=None):
warc2zim_args.append("--description")
warc2zim_args.append(zimit_args.description)

if zimit_args.zim_lang:
warc2zim_args.append("--lang")
warc2zim_args.append(zimit_args.zim_lang)

print("----------")
print("Testing warc2zim args")
print("Running: warc2zim " + " ".join(warc2zim_args), flush=True)
Expand Down Expand Up @@ -482,6 +498,7 @@ def get_node_cmd_line(args):
"exclude",
"collection",
"allowHashUrls",
"lang",
"mobileDevice",
"userAgent",
"useSitemap",
Expand Down

0 comments on commit 7cb118e

Please sign in to comment.