Skip to content

Commit 6395928

Browse files
committed
Add capability to blacklist some websites and redirect them to library / Github issue
1 parent f77b192 commit 6395928

13 files changed

+511
-9
lines changed

api/src/zimitfrontend/constants.py

+6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
import pathlib
34
import random
@@ -16,6 +17,11 @@
1617
),
1718
)
1819

20+
blacklist = json.loads(
21+
(pathlib.Path(__file__).parent / "res/blacklist.json").read_bytes()
22+
)["blacklist"]
23+
logger.info(f"{len(blacklist)} websites are blacklisted")
24+
1925

2026
def _get_int_setting(environment_variable_name: str, default_value: int) -> int:
2127
"""Get environment variable as integer or fallback to default value"""
+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
{
2+
"blacklist": [
3+
{
4+
"host": "devdocs.io",
5+
"reason": "already_zimed",
6+
"libraryUrl": "https://library.kiwix.org/#tag=devdocs"
7+
},
8+
{
9+
"host": "gutenberg.org",
10+
"reason": "already_zimed",
11+
"libraryUrl": "https://library.kiwix.org/#tag=gutenberg"
12+
},
13+
{
14+
"host": "freecodecamp.org",
15+
"reason": "already_zimed",
16+
"libraryUrl": "https://library.kiwix.org/#tag=freecodecamp"
17+
},
18+
{
19+
"host": "ifixit.com",
20+
"reason": "already_zimed",
21+
"libraryUrl": "https://library.kiwix.org/#tag=ifixit"
22+
},
23+
{
24+
"host": "khanacademy.org",
25+
"reason": "already_zimed",
26+
"libraryUrl": "https://library.kiwix.org/#tag=khan-academy"
27+
},
28+
{
29+
"host": "africanstorybook.org",
30+
"reason": "already_zimed",
31+
"libraryUrl": "https://library.kiwix.org/#tag=&q=african+storybooks"
32+
},
33+
{
34+
"host": "libretexts.org",
35+
"reason": "already_zimed",
36+
"libraryUrl": "https://library.kiwix.org/#tag=&q=libretexts"
37+
},
38+
{
39+
"host": "phet.colorado.edu",
40+
"reason": "already_zimed",
41+
"libraryUrl": "https://library.kiwix.org/#tag=&q=phet"
42+
},
43+
{
44+
"host": "ted.com",
45+
"reason": "already_zimed",
46+
"libraryUrl": "https://library.kiwix.org/#tag=ted"
47+
},
48+
{ "host": "wikihow.com", "reason": "forbid_or_copyrighted_by_website_owner" },
49+
{
50+
"host": "youtube.com",
51+
"reason": "too_big_partially_already_zimed",
52+
"libraryUrl": "https://library.kiwix.org/#tag=youtube",
53+
"scraperUrl": "https://github.com/openzim/youtube"
54+
},
55+
{
56+
"host": "youtu.be",
57+
"reason": "too_big_partially_already_zimed",
58+
"libraryUrl": "https://library.kiwix.org/#tag=youtube",
59+
"scraperUrl": "https://github.com/openzim/youtube"
60+
},
61+
{
62+
"host": "stackexchange.com",
63+
"reason": "already_zimed",
64+
"libraryUrl": "https://library.kiwix.org/#tag=stack_exchange"
65+
},
66+
{
67+
"host": "stackoverflow.com",
68+
"reason": "already_zimed",
69+
"libraryUrl": "https://library.kiwix.org/#tag=stack_exchange"
70+
},
71+
{
72+
"host": "wikibooks.org",
73+
"reason": "already_zimed",
74+
"libraryUrl": "https://library.kiwix.org/#tag=wikibooks",
75+
"wp1Hint": true
76+
},
77+
{
78+
"host": "wikinews.org",
79+
"reason": "already_zimed",
80+
"libraryUrl": "https://library.kiwix.org/#tag=wikinews",
81+
"wp1Hint": true
82+
},
83+
{
84+
"host": "wikipedia.org",
85+
"reason": "already_zimed",
86+
"libraryUrl": "https://library.kiwix.org/#tag=wikipedia",
87+
"wp1Hint": true
88+
},
89+
{
90+
"host": "wikiquote.org",
91+
"reason": "already_zimed",
92+
"libraryUrl": "https://library.kiwix.org/#tag=wikiquote",
93+
"wp1Hint": true
94+
},
95+
{
96+
"host": "vikidia.org",
97+
"reason": "already_zimed",
98+
"libraryUrl": "https://library.kiwix.org/#tag=vikidia"
99+
},
100+
{
101+
"host": "wikisource.org",
102+
"reason": "already_zimed",
103+
"libraryUrl": "https://library.kiwix.org/#tag=wikisource",
104+
"wp1Hint": true
105+
},
106+
{
107+
"host": "wikiversity.org",
108+
"reason": "already_zimed",
109+
"libraryUrl": "https://library.kiwix.org/#tag=wikiversity",
110+
"wp1Hint": true
111+
},
112+
{
113+
"host": "wikivoyage.org",
114+
"reason": "already_zimed",
115+
"libraryUrl": "https://library.kiwix.org/#tag=wikivoyage",
116+
"wp1Hint": true
117+
},
118+
{
119+
"host": "wiktionary.org",
120+
"reason": "already_zimed",
121+
"libraryUrl": "https://library.kiwix.org/#tag=wiktionary",
122+
"wp1Hint": true
123+
},
124+
{
125+
"host": "reddit.com",
126+
"reason": "scraper_needed",
127+
"githubIssue": "https://github.com/openzim/zim-requests/issues/242"
128+
},
129+
{
130+
"host": "archive.org",
131+
"reason": "scraper_needed",
132+
"githubIssue": "https://github.com/openzim/zim-requests/issues/360"
133+
},
134+
{ "host": "quora.com", "reason": "forbid_or_copyrighted_by_website_owner" },
135+
{ "host": "download.kiwix.org", "reason": "not_possible_with_zimit" },
136+
{ "host": "google.com", "reason": "not_possible_with_zimit" },
137+
{ "host": "library.kiwix.org", "reason": "not_possible_with_zimit" },
138+
{ "host": "wikileaks.org", "reason": "not_possible_with_zimit" },
139+
{ "host": "minecraft.net", "reason": "not_possible_with_zimit" },
140+
{ "host": "github.com", "reason": "not_possible_with_zimit" },
141+
{ "host": "zimit.kiwix.org", "reason": "not_possible_with_zimit" },
142+
{ "host": "coursera.org", "reason": "not_possible_with_zimit" },
143+
{ "host": "facebook.com", "reason": "not_possible_with_zimit" }
144+
]
145+
}

api/src/zimitfrontend/routes/requests.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from fastapi import APIRouter, HTTPException, Path, Request
77

8-
from zimitfrontend.constants import ApiConfiguration, logger
8+
from zimitfrontend.constants import ApiConfiguration, blacklist, logger
99
from zimitfrontend.routes.schemas import (
1010
TaskCancelRequest,
1111
TaskCreateRequest,
@@ -86,6 +86,20 @@ def create_task(
8686

8787
url = urllib.parse.urlparse(request.url)
8888

89+
matching_blacklist_entries = [
90+
blacklist_entry
91+
for blacklist_entry in blacklist
92+
if blacklist_entry["host"].lower() in url.geturl().lower()
93+
]
94+
matching_blacklist_entry = (
95+
matching_blacklist_entries[0] if matching_blacklist_entries else None
96+
)
97+
if matching_blacklist_entry:
98+
raise HTTPException(
99+
HTTPStatus.BAD_REQUEST,
100+
detail={"error": "blacklisted", "blacklist": matching_blacklist_entry},
101+
)
102+
89103
# generate schedule name
90104
ident = str(uuid.uuid4())[:8]
91105
schedule_name = f"{url.hostname}_{ident}"

dev/zimit_ui_dev/config.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"kiwix_download_page": "https://kiwix.org/en/applications/",
88
"kiwix_contact_us": "https://kiwix.org/en/contact-us/",
99
"report_issues_page": "https://github.com/openzim/zimit/issues/",
10-
"home_page": "https://zimit.kiwix.org/",
10+
"home_page": "http://localhost:8001/",
1111
"zim_download_url": "https://s3.us-west-1.wasabisys.com/org-kiwix-zimit/zim",
1212
"new_request_advanced_flags": [
1313
"lang",

locales/en.json

+28
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
"creatingRequest": "Creating request…",
4343
"errorCreatingRequest": "Error creating request",
4444
"offlinerNotFound": "Zimit offliner not found, we probably experience a serious issue on our infrastructure.",
45+
"blacklistNotFound": "Blacklist not found.",
4546
"stopNewRequestsMessage": "Zimit temporarily does not accept new tasks for maintenance, will be back in a few days."
4647
},
4748
"notFound": {
@@ -113,5 +114,32 @@
113114
"goToTask": "Go to ongoing task {taskLink}",
114115
"excessiveUsage": "We've detected excessive usage from your environment, please come back in a few hours.",
115116
"abnormalUsage": "You are blocked for abnormal usage: {status}"
117+
},
118+
"blacklist": {
119+
"missingReason": "Blacklist reason must be set",
120+
"missingLibraryUrl": "Library URL must be set in blacklist reason",
121+
"missingGithubIssueUrl": "Github Issue URL must be set in blacklist reason",
122+
"goBack": "Go back",
123+
"contactUs": "Should you need a special ZIM, please contact us.",
124+
"alreadyZimed": {
125+
"alreadyMadeZim": "Kiwix has already made ZIM(s) of this website.",
126+
"downloadFromLibrary": " To save our resources, we invite you to download your ZIM from {link}.",
127+
"downloadFromLibraryLinkContent": "our library",
128+
"wp1Hint": "If you want to ZIM only a few specific pages, you might be interested in {wp1Link}.",
129+
"wp1LinkContent": "our WP1 tool"
130+
},
131+
"tooBig": {
132+
"tooBigDetails": "This website is way too big to make it into a single ZIM.",
133+
"alreadyMadeZim": "Kiwix has already made a few ZIM(s) of portions of this website.",
134+
"downloadOrRequest": "You can check these ZIMs in {libraryLink} or open an issue to {githubRequestLink} if it matches Kiwix purpose.",
135+
"libraryLinkContent": "our library",
136+
"githubRequestLinkContent": "request another ZIM",
137+
"useScraper": "Should you be a bit tech-savvy, you can also use {scraperRepoLink} on your own.",
138+
"scraperRepoLinkContent": "our dedicated scraper"
139+
},
140+
"copyrighted": "This website is protected by copyright and/or Kiwix has been explicitly requested not to ZIM this website.",
141+
"notPossible": "It is unfortunately not possible to ZIM this website with zimit.",
142+
"scraperNeeded": "It seems possible to develop a custom scraper for this website, but we need your support on {githubIssueLink}.",
143+
"scraperNeededLinkContent": "this Github issue"
116144
}
117145
}

locales/qqq.json

+29-2
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,10 @@
4343
"advancedOptions": "This is the text displayed on a button to display advanced options.",
4444
"fetchingDefinitionAndStatus": "This is the message while fetching the task definition.",
4545
"errorFetchingDefinition": "This is the message when fetching the task definition failed.",
46-
"creatingRequest": "This is the message while creating a Zimfarm request.",
46+
"creatingRequest": "This is the message while creating a Zimfarm request.",
4747
"errorCreatingRequest": "This is the message when creating a Zimfarm request failed.",
4848
"offlinerNotFound": "This is the message when we failed to load offliner definition through API call.",
49-
"stopNewRequestsMessage": "This is the message when new requests can temporarily not be submitted anymore."
49+
"stopNewRequestsMessage": "This is the message when new requests can temporarily not be submitted anymore."
5050
},
5151
"notFound": {
5252
"heading": "This is the heading displayed when URL is not found/handled.",
@@ -117,5 +117,32 @@
117117
"goToTask": "This is the message inviting to open ongoing task",
118118
"excessiveUsage": "This is a more generic message about quota being reached",
119119
"abnormalUsage": "This is a generic error message when we fail to find quota information"
120+
},
121+
"blacklist": {
122+
"missingReason": "This is the error message when the blacklist reason is misconfigured (bug)",
123+
"missingLibraryUrl": "This is the error message when the library URL is missing in blacklist reason (bug)",
124+
"missingGithubIssueUrl": "This is the error message when the Github issue URL is missing in blacklist reason (bug)",
125+
"goBack": "This is the text on the go-back button",
126+
"contactUs": "This is the message inviting users to contact us",
127+
"alreadyZimed": {
128+
"alreadyMadeZim": "This is the message indicating that ZIM has already been built",
129+
"downloadFromLibrary": "This is the message indicating that ZIM can be downloaded",
130+
"downloadFromLibraryLinkContent": "This is the textual content of the download link",
131+
"wp1Hint": "This is a message inviting users to test our WP1 tool",
132+
"wp1LinkContent": "This is the textual content of the link to WP1"
133+
},
134+
"tooBig": {
135+
"tooBigDetails": "This is the message indicating that website is too big",
136+
"alreadyMadeZim": "This is the message indicating that a few ZIMs have already been built",
137+
"downloadOrRequest": "This is the message inviting user to download ZIM or open a request",
138+
"libraryLinkContent": "This is the textual content of the download link",
139+
"githubRequestLinkContent": "This is the textual content of the Github request link",
140+
"useScraper": "This is the message inviting user to use our scraper",
141+
"scraperRepoLinkContent": "This is the textual content of the link to the scraper"
142+
},
143+
"copyrighted": "This is the message indicating that there is a copyright issue",
144+
"notPossible": "This is the message indicating that it is not possible to ZIM this website",
145+
"scraperNeeded": "This is the message indicating that we intend to build a dedicated scraper",
146+
"scraperNeededLinkContent": "This is the textual content of the link to the dedicated Github issue"
120147
}
121148
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<script setup lang="ts">
2+
import { useMainStore } from '../stores/main'
3+
const mainStore = useMainStore()
4+
5+
const close = function () {
6+
mainStore.blacklistReason = undefined
7+
}
8+
</script>
9+
10+
<template>
11+
<div class="main">
12+
<div v-if="!mainStore.blacklistReason">{{ $t('blacklist.missingReason') }}</div>
13+
<div v-else-if="!mainStore.blacklistReason.libraryUrl">
14+
{{ $t('blacklist.missingLibraryUrl') }}
15+
</div>
16+
<div v-else>
17+
<p>{{ $t('blacklist.alreadyZimed.alreadyMadeZim') }}</p>
18+
<i18n-t keypath="blacklist.alreadyZimed.downloadFromLibrary" tag="p">
19+
<template #link>
20+
<a :href="mainStore.blacklistReason.libraryUrl" target="_blank">{{
21+
$t('blacklist.alreadyZimed.downloadFromLibraryLinkContent')
22+
}}</a>
23+
</template>
24+
</i18n-t>
25+
<i18n-t
26+
v-if="mainStore.blacklistReason.wp1Hint"
27+
keypath="blacklist.alreadyZimed.wp1Hint"
28+
tag="p"
29+
>
30+
<template #wp1Link>
31+
<a href="https://wp1.openzim.org/#/selections/simple" target="_blank">{{
32+
$t('blacklist.alreadyZimed.wp1LinkContent')
33+
}}</a>
34+
</template>
35+
</i18n-t>
36+
<p>{{ $t('blacklist.contactUs') }}</p>
37+
</div>
38+
<v-btn class="black" rounded="xl" @click="close">{{ $t('blacklist.goBack') }}</v-btn>
39+
</div>
40+
</template>
41+
42+
<style type="text/css" scoped>
43+
.v-btn {
44+
text-transform: none;
45+
background-color: transparent;
46+
}
47+
48+
.v-btn.black {
49+
background-color: black;
50+
color: white;
51+
}
52+
53+
p {
54+
margin-bottom: 1rem;
55+
}
56+
57+
.main {
58+
text-align: center;
59+
}
60+
</style>
+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<script setup lang="ts">
2+
import { useMainStore } from '../stores/main'
3+
const mainStore = useMainStore()
4+
5+
const close = function () {
6+
mainStore.blacklistReason = undefined
7+
}
8+
</script>
9+
10+
<template>
11+
<div class="main">
12+
<div v-if="!mainStore.blacklistReason">{{ $t('blacklist.missingReason') }}</div>
13+
<div v-else>
14+
<p>{{ $t('blacklist.copyrighted') }}</p>
15+
</div>
16+
<v-btn class="black" rounded="xl" @click="close">{{ $t('blacklist.goBack') }}</v-btn>
17+
</div>
18+
</template>
19+
20+
<style type="text/css" scoped>
21+
.v-btn {
22+
text-transform: none;
23+
background-color: transparent;
24+
}
25+
26+
.v-btn.black {
27+
background-color: black;
28+
color: white;
29+
}
30+
31+
p {
32+
margin-bottom: 1rem;
33+
}
34+
35+
.main {
36+
text-align: center;
37+
}
38+
</style>

0 commit comments

Comments
 (0)