From 4465a3ea684c0e1f7e77b1558945d62c64b769de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 27 May 2019 22:24:48 +0200 Subject: [PATCH] [kissmanga][readcomiconline] add 'captcha' option (#279) to configure how to handle CAPTCHA page redirects: - either interactively wait for the user to solve the CAPTCHA - or raise StopExtraction like before --- docs/configuration.rst | 24 +++++++++++++++++ docs/gallery-dl.conf | 8 ++++++ gallery_dl/extractor/kissmanga.py | 36 +++++++++++++++++-------- gallery_dl/extractor/readcomiconline.py | 14 +++------- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index cc4252111f..0131663ae2 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -593,6 +593,18 @@ Description Controls whether to choose the GIF or MP4 version of an animation. =========== ===== +extractor.kissmanga.captcha +--------------------------- +=========== ===== +Type ``string`` +Default ``"stop"`` +Description Controls how to handle redirects to CAPTCHA pages. + + * ``"stop"``: Stop the current extractor run. + * ``"wait"``: Ask the user to solve the CAPTCHA and wait. +=========== ===== + + extractor.oauth.browser ----------------------- =========== ===== @@ -646,6 +658,18 @@ Description Minimum and maximum wait time in seconds between HTTP requests =========== ===== +extractor.readcomiconline.captcha +--------------------------------- +=========== ===== +Type ``string`` +Default ``"stop"`` +Description Controls how to handle redirects to CAPTCHA pages. + + * ``"stop"``: Stop the current extractor run. + * ``"wait"``: Ask the user to solve the CAPTCHA and wait. 
+=========== ===== + + extractor.recursive.blacklist ----------------------------- =========== ===== diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 2dfd5df55d..0891cf5fc7 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -62,6 +62,10 @@ { "mp4": true }, + "kissmanga": + { + "captcha": "stop" + }, "nijie": { "username": null, @@ -82,6 +86,10 @@ "wait-min": 3.0, "wait-max": 6.0 }, + "readcomiconline": + { + "captcha": "stop" + }, "recursive": { "blacklist": ["directlink", "oauth", "recursive", "test"] diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 519dc9731b..19e0a26a63 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -8,7 +8,7 @@ """Extract manga-chapters and entire manga from https://kissmanga.com/""" -from .common import ChapterExtractor, MangaExtractor +from .common import ChapterExtractor, MangaExtractor, Extractor from .. import text, aes, exception from ..cache import cache import hashlib @@ -16,21 +16,35 @@ import re -class KissmangaBase(): +class RedirectMixin(): + """Detect and handle redirects to CAPTCHA pages""" + + def request(self, url): + while True: + response = Extractor.request(self, url) + if not response.history or "/AreYouHuman" not in response.url: + return response + if self.config("captcha", "stop") == "wait": + self.log.warning( + "Redirect to \n%s\nVisit this URL in your browser, solve " + "the CAPTCHA, and press ENTER to continue", response.url) + try: + input() + except (EOFError, OSError): + pass + else: + self.log.error( + "Redirect to \n%s\nVisit this URL in your browser and " + "solve the CAPTCHA to continue", response.url) + raise exception.StopExtraction() + + +class KissmangaBase(RedirectMixin): """Base class for kissmanga extractors""" category = "kissmanga" archive_fmt = "{chapter_id}_{page}" root = "https://kissmanga.com" - def request(self, url): - response = super().request(url) - if response.history and 
"/AreYouHuman" in response.url: - self.log.error("Redirect to \n%s\n" - "Visit this URL in your browser and solve " - "the CAPTCHA to continue.", response.url) - raise exception.StopExtraction() - return response - @staticmethod def parse_chapter_string(data): """Parse 'chapter_string' value contained in 'data'""" diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 06b202ba15..dda48090ac 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -9,11 +9,12 @@ """Extract comic-issues and entire comics from https://readcomiconline.to/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, exception +from .kissmanga import RedirectMixin +from .. import text import re -class ReadcomiconlineBase(): +class ReadcomiconlineBase(RedirectMixin): """Base class for readcomiconline extractors""" category = "readcomiconline" directory_fmt = ("{category}", "{comic}", "{issue:>03}") @@ -21,15 +22,6 @@ class ReadcomiconlineBase(): archive_fmt = "{issue_id}_{page}" root = "https://readcomiconline.to" - def request(self, url): - response = super().request(url) - if response.history and "/AreYouHuman" in response.url: - self.log.error("Redirect to \n%s\n" - "Visit this URL in your browser and solve " - "the CAPTCHA to continue.", response.url) - raise exception.StopExtraction() - return response - class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): """Extractor for comic-issues from readcomiconline.to"""