Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add capability to blacklist some websites and redirect them to library / Github issue #124

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions api/src/zimitfrontend/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
import pathlib
import random
Expand All @@ -16,6 +17,11 @@
),
)

# Load the website blacklist once, at import time, from the JSON resource that
# ships next to this module. The file is a JSON object whose "blacklist" key
# holds the list of entries (each entry carries at least "host" and "reason").
blacklist = json.loads(
    pathlib.Path(__file__).parent.joinpath("res/blacklist.json").read_bytes()
)["blacklist"]
# Report how many entries were loaded so a truncated/empty file is noticeable.
logger.info(f"{len(blacklist)} websites are blacklisted")


def _get_int_setting(environment_variable_name: str, default_value: int) -> int:
"""Get environment variable as integer or fallback to default value"""
Expand Down
145 changes: 145 additions & 0 deletions api/src/zimitfrontend/res/blacklist.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
{
"blacklist": [
{
"host": "devdocs.io",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=devdocs"
},
{
"host": "gutenberg.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=gutenberg"
},
{
"host": "freecodecamp.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=freecodecamp"
},
{
"host": "ifixit.com",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=ifixit"
},
{
"host": "khanacademy.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=khan-academy"
},
{
"host": "africanstorybook.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=&q=african+storybooks"
},
{
"host": "libretexts.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=&q=libretexts"
},
{
"host": "phet.colorado.edu",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=&q=phet"
},
{
"host": "ted.com",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=ted"
},
{ "host": "wikihow.com", "reason": "forbid_or_copyrighted_by_website_owner" },
{
"host": "youtube.com",
"reason": "too_big_partially_already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=youtube",
"scraperUrl": "https://github.com/openzim/youtube"
},
{
"host": "youtu.be",
"reason": "too_big_partially_already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=youtube",
"scraperUrl": "https://github.com/openzim/youtube"
},
{
"host": "stackexchange.com",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=stack_exchange"
},
{
"host": "stackoverflow.com",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=stack_exchange"
},
{
"host": "wikibooks.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikibooks",
"wp1Hint": true
},
{
"host": "wikinews.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikinews",
"wp1Hint": true
},
{
"host": "wikipedia.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikipedia",
"wp1Hint": true
},
{
"host": "wikiquote.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikiquote",
"wp1Hint": true
},
{
"host": "vikidia.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=vikidia"
},
{
"host": "wikisource.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikisource",
"wp1Hint": true
},
{
"host": "wikiversity.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikiversity",
"wp1Hint": true
},
{
"host": "wikivoyage.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wikivoyage",
"wp1Hint": true
},
{
"host": "wiktionary.org",
"reason": "already_zimed",
"libraryUrl": "https://library.kiwix.org/#tag=wiktionary",
"wp1Hint": true
},
{
"host": "reddit.com",
"reason": "scraper_needed",
"githubIssue": "https://github.com/openzim/zim-requests/issues/242"
},
{
"host": "archive.org",
"reason": "scraper_needed",
"githubIssue": "https://github.com/openzim/zim-requests/issues/360"
},
{ "host": "quora.com", "reason": "forbid_or_copyrighted_by_website_owner" },
{ "host": "download.kiwix.org", "reason": "not_possible_with_zimit" },
{ "host": "google.com", "reason": "not_possible_with_zimit" },
{ "host": "library.kiwix.org", "reason": "not_possible_with_zimit" },
{ "host": "wikileaks.org", "reason": "not_possible_with_zimit" },
{ "host": "minecraft.net", "reason": "not_possible_with_zimit" },
{ "host": "github.com", "reason": "not_possible_with_zimit" },
{ "host": "zimit.kiwix.org", "reason": "not_possible_with_zimit" },
{ "host": "coursera.org", "reason": "not_possible_with_zimit" },
{ "host": "facebook.com", "reason": "not_possible_with_zimit" }
]
}
16 changes: 15 additions & 1 deletion api/src/zimitfrontend/routes/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from fastapi import APIRouter, HTTPException, Path, Request

from zimitfrontend.constants import ApiConfiguration, logger
from zimitfrontend.constants import ApiConfiguration, blacklist, logger

Check warning on line 8 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L8

Added line #L8 was not covered by tests
from zimitfrontend.routes.schemas import (
TaskCancelRequest,
TaskCreateRequest,
Expand Down Expand Up @@ -59,33 +59,47 @@
},
},
)
def create_task(
request: TaskCreateRequest, http_request: Request
) -> TaskCreateResponse:

if not http_request.client:
raise HTTPException(
HTTPStatus.INTERNAL_SERVER_ERROR, detail="http_request.client is missing"
)

# check that client can start a task
add_task = tracker.add_task(
http_request.client.host,
request.unique_id,
None,
)
if add_task.status != AddTaskStatus.CAN_ADD_TASK:
raise HTTPException(
status_code=HTTPStatus.TOO_MANY_REQUESTS,
detail={
"message": "Too many requests already ongoing for your user",
"reason": add_task.status.value,
"ongoing_tasks": add_task.ongoing_tasks,
},
)

url = urllib.parse.urlparse(request.url)

matching_blacklist_entries = [

Check warning on line 89 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L89

Added line #L89 was not covered by tests
blacklist_entry
for blacklist_entry in blacklist
if blacklist_entry["host"].lower() in url.geturl().lower()
]
matching_blacklist_entry = (

Check warning on line 94 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L94

Added line #L94 was not covered by tests
matching_blacklist_entries[0] if matching_blacklist_entries else None
)
if matching_blacklist_entry:
raise HTTPException(

Check warning on line 98 in api/src/zimitfrontend/routes/requests.py

View check run for this annotation

Codecov / codecov/patch

api/src/zimitfrontend/routes/requests.py#L98

Added line #L98 was not covered by tests
HTTPStatus.BAD_REQUEST,
detail={"error": "blacklisted", "blacklist": matching_blacklist_entry},
)

# generate schedule name
ident = str(uuid.uuid4())[:8]
schedule_name = f"{url.hostname}_{ident}"
Expand Down
2 changes: 1 addition & 1 deletion dev/zimit_ui_dev/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"kiwix_download_page": "https://kiwix.org/en/applications/",
"kiwix_contact_us": "https://kiwix.org/en/contact-us/",
"report_issues_page": "https://github.com/openzim/zimit/issues/",
"home_page": "https://zimit.kiwix.org/",
"home_page": "http://localhost:8001/",
"zim_download_url": "https://s3.us-west-1.wasabisys.com/org-kiwix-zimit/zim",
"new_request_advanced_flags": [
"lang",
Expand Down
27 changes: 27 additions & 0 deletions locales/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,5 +113,32 @@
"goToTask": "Go to ongoing task {taskLink}",
"excessiveUsage": "We've detected excessive usage from your environment, please come back in a few hours.",
"abnormalUsage": "You are blocked for abnormal usage: {status}"
},
"blacklist": {
"missingReason": "Blacklist reason must be set",
"missingLibraryUrl": "Library URL must be set in blacklist reason",
"missingGithubIssueUrl": "Github Issue URL must be set in blacklist reason",
"goBack": "Go back",
"contactUs": "Should you need a special ZIM, please contact us.",
"alreadyZimed": {
"alreadyMadeZim": "Kiwix has already made ZIM(s) of this website.",
"downloadFromLibrary": " To save our resources, we invite you to download your ZIM from {link}.",
"downloadFromLibraryLinkContent": "our library",
"wp1Hint": "If you want to ZIM only a few specific pages, you might be interested in {wp1Link}.",
"wp1LinkContent": "our WP1 tool"
},
"tooBig": {
"tooBigDetails": "This website is way too big to make it into a single ZIM.",
"alreadyMadeZim": "Kiwix has already made a few ZIM(s) of portions of this website.",
"downloadOrRequest": "You can check these ZIMs in {libraryLink} or open an issue to {githubRequestLink} if it matches Kiwix purpose.",
"libraryLinkContent": "our library",
"githubRequestLinkContent": "request another ZIM",
"useScraper": "Should you be a bit tech-savvy, you can also use {scraperRepoLink} on your own.",
"scraperRepoLinkContent": "our dedicated scraper"
},
"copyrighted": "This website is protected by copyright and/or Kiwix has been explicitly requested not to ZIM this website.",
"notPossible": "It is unfortunately not possible to ZIM this website with zimit.",
"scraperNeeded": "It seems possible to develop a custom scraper for this website, but we need your support on {githubIssueLink}.",
"scraperNeededLinkContent": "this Github issue"
}
}
31 changes: 29 additions & 2 deletions locales/qqq.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,10 @@
"advancedOptions": "This is the text displayed on a button to display advanced options.",
"fetchingDefinitionAndStatus": "This is the message while fetching the task definition.",
"errorFetchingDefinition": "This is the message when fetching the task definition failed.",
"creatingRequest": "This is the message while creating a Zimfarm request.",
"creatingRequest": "This is the message while creating a Zimfarm request.",
"errorCreatingRequest": "This is the message when creating a Zimfarm request failed.",
"offlinerNotFound": "This is the message when we failed to load offliner definition through API call.",
"stopNewRequestsMessage": "This is the message when new requests can temporarily not be submitted anymore."
"stopNewRequestsMessage": "This is the message when new requests can temporarily not be submitted anymore."
},
"notFound": {
"heading": "This is the heading displayed when URL is not found/handled.",
Expand Down Expand Up @@ -117,5 +117,32 @@
"goToTask": "This is the message inviting to open ongoing task",
"excessiveUsage": "This is a more generic message about quota being reached",
"abnormalUsage": "This is a generic error message when we fail to find quota information"
},
"blacklist": {
"missingReason": "This is the error message when the blacklist reason is misconfigured (bug)",
"missingLibraryUrl": "This is the error message when the library URL is missing in blacklist reason (bug)",
"missingGithubIssueUrl": "This is the error message when the Github issue URL is missing in blacklist reason (bug)",
"goBack": "This is the text on the go-back button",
"contactUs": "This is the message inviting users to contact us",
"alreadyZimed": {
"alreadyMadeZim": "This is the message indicating that ZIM has already been built",
"downloadFromLibrary": "This is the message indicating that ZIM can be downloaded",
"downloadFromLibraryLinkContent": "This is the textual content of the download link",
"wp1Hint": "This is a message inviting users to test our WP1 tool",
"wp1LinkContent": "This is the textual content of the link to WP1"
},
"tooBig": {
"tooBigDetails": "This is the message indicating that website is too big",
"alreadyMadeZim": "This is the message indicating that few ZIMs have already been built",
"downloadOrRequest": "This is the message inviting user to download ZIM or open a request",
"libraryLinkContent": "This is the textual content of the download link",
"githubRequestLinkContent": "This is the textual content of the Github request link",
"useScraper": "This is the message inviting user to use our scraper",
"scraperRepoLinkContent": "This is the textual content of the link to the scraper"
},
"copyrighted": "This is the message indicating that there is a copyright issue",
"notPossible": "This is the message indicating that it is not possible to ZIM this website",
"scraperNeeded": "This is the message indicating that we intend to build a dedicated scraper",
"scraperNeededLinkContent": "This is the textual content of the link to the dedicated Github issue"
}
}
60 changes: 60 additions & 0 deletions ui/src/components/BlacklistAlreadyZimed.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<script setup lang="ts">
// Message shown for a blacklisted website whose reason is "already zimed":
// it points the user to the library URL carried by the blacklist reason and,
// when the reason has `wp1Hint` set, also advertises the WP1 tool.
//
// The component reads `blacklistReason` from the main store and renders:
//   - an error text when no reason is set,
//   - an error text when the reason has no `libraryUrl`,
//   - the full explanation otherwise.
import { useMainStore } from '../stores/main'

const mainStore = useMainStore()

// Handler of the "go back" button: clears the blacklist reason in the store.
// NOTE(review): presumably the parent view stops displaying this component
// once the reason is undefined — confirm against the caller.
function close() {
  mainStore.blacklistReason = undefined
}
</script>

<template>
  <div class="main">
    <div v-if="!mainStore.blacklistReason">{{ $t('blacklist.missingReason') }}</div>
    <div v-else-if="!mainStore.blacklistReason.libraryUrl">
      {{ $t('blacklist.missingLibraryUrl') }}
    </div>
    <div v-else>
      <p>{{ $t('blacklist.alreadyZimed.alreadyMadeZim') }}</p>
      <i18n-t keypath="blacklist.alreadyZimed.downloadFromLibrary" tag="p">
        <template #link>
          <a :href="mainStore.blacklistReason.libraryUrl" target="_blank" rel="noopener">{{
            $t('blacklist.alreadyZimed.downloadFromLibraryLinkContent')
          }}</a>
        </template>
      </i18n-t>
      <i18n-t
        v-if="mainStore.blacklistReason.wp1Hint"
        keypath="blacklist.alreadyZimed.wp1Hint"
        tag="p"
      >
        <template #wp1Link>
          <a href="https://wp1.openzim.org/#/selections/simple" target="_blank" rel="noopener">{{
            $t('blacklist.alreadyZimed.wp1LinkContent')
          }}</a>
        </template>
      </i18n-t>
      <p>{{ $t('blacklist.contactUs') }}</p>
    </div>
    <v-btn class="black" rounded="xl" @click="close">{{ $t('blacklist.goBack') }}</v-btn>
  </div>
</template>

<style type="text/css" scoped>
/* Buttons keep the label casing as written and are transparent by default. */
.v-btn {
  text-transform: none;
  background-color: transparent;
}

/* Variant used by the "go back" button: white text on black. */
.v-btn.black {
  background-color: black;
  color: white;
}

p {
  margin-bottom: 1rem;
}

.main {
  text-align: center;
}
</style>
38 changes: 38 additions & 0 deletions ui/src/components/BlacklistCopyright.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<script setup lang="ts">
// Message shown for a blacklisted website whose reason is a copyright /
// owner-forbids restriction. Reads `blacklistReason` from the main store and
// shows either an error text (no reason set) or the copyright explanation.
import { useMainStore } from '../stores/main'

const mainStore = useMainStore()

// Handler of the "go back" button: clears the blacklist reason in the store.
const close = () => {
  mainStore.blacklistReason = undefined
}
</script>

<template>
  <div class="main">
    <div v-if="!mainStore.blacklistReason">{{ $t('blacklist.missingReason') }}</div>
    <div v-else>
      <p>{{ $t('blacklist.copyrighted') }}</p>
    </div>
    <v-btn class="black" rounded="xl" @click="close">{{ $t('blacklist.goBack') }}</v-btn>
  </div>
</template>

<style type="text/css" scoped>
/* Buttons keep the label casing as written and are transparent by default. */
.v-btn {
  text-transform: none;
  background-color: transparent;
}

/* Variant used by the "go back" button: white text on black. */
.v-btn.black {
  background-color: black;
  color: white;
}

p {
  margin-bottom: 1rem;
}

.main {
  text-align: center;
}
</style>
Loading