From 731c7cbd5ba42e2fa55186be2694e20966fc7ad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 1 Jun 2019 18:43:54 +0200 Subject: [PATCH] [keenspot] support all comics and "random" access (#223) --- gallery_dl/extractor/keenspot.py | 89 +++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 19 deletions(-) diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py index 1bcb72b5a8..8ddce24cff 100644 --- a/gallery_dl/extractor/keenspot.py +++ b/gallery_dl/extractor/keenspot.py @@ -19,7 +19,7 @@ class KeenspotComicExtractor(Extractor): directory_fmt = ("{category}", "{comic}") filename_fmt = "{filename}.{extension}" archive_fmt = "{comic}_{filename}" - pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com" + pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?" test = ( ("http://marksmen.keenspot.com/", { # link "range": "1-3", @@ -37,13 +37,20 @@ class KeenspotComicExtractor(Extractor): "range": "1-3", "url": "de21b12887ef31ff82edccbc09d112e3885c3aab" }), + ("http://twokinds.keenspot.com/comic/1066/", { # "random" access + "range": "1-3", + "url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04", + }) ) def __init__(self, match): Extractor.__init__(self, match) - self._next = None - self.comic = match.group(1) + self.comic = match.group(1).lower() + self.path = match.group(2) self.root = "http://" + self.comic + ".keenspot.com" + self._needle = "" + self._image = 'class="ksc"' + self._next = self._next_needle def items(self): data = {"comic": self.comic} @@ -51,43 +58,81 @@ def items(self): yield Message.Directory, data url = self._first(self.request(self.root + "/").text) - while url: - if url[0] == "/": - url = self.root + url - page = self.request(url).text - - for img in text.extract_iter(page, 'class="ksc"', '>'): - img = text.extract(img, 'src="', '"')[0] + if self.path: + url = self.root + self.path + + ilen = len(self._image) + while url and url != "/": + page = self.request(text.urljoin(self.root, url)).text + + pos = 0 + while True: + pos = page.find(self._image, pos) + if pos < 0: + break + img, pos = text.extract(page, 'src="', '"', pos + ilen) + if img.endswith(".js"): + continue if img[0] == "/": img = self.root + img + elif "youtube.com/" in img: + img = "ytdl:" + img yield Message.Url, img, text.nameext_from_url(img, data) url = self._next(page) def _first(self, page): + if self.comic == "brawlinthefamily": + self._next = self._next_brawl + self._image = '
' + return "http://brawlinthefamily.keenspot.com/comic/theshowdown/" + url = text.extract(page, '= 0: self._next = self._next_id - return text.rextract(page, 'FIRST PAGE<') if pos >= 0: - self._next = self._next_id - return text.rextract(page, '= 0: - self._next = self._next_ks + self._needle = 'First Comic<') # twokinds + if pos >= 0: + self._image = '
' + self._needle = 'class="navarchive"' + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('id="flip_FirstDay"') # flipside + if pos >= 0: + self._image = 'class="flip_Pages ksc"' + self._needle = 'id="flip_ArcButton"' + return text.rextract(page, 'href="', '"', pos)[0] + self.log.error("Unrecognized page layout") return None + def _next_needle(self, page): + pos = page.index(self._needle) + len(self._needle) + return text.extract(page, 'href="', '"', pos)[0] + @staticmethod def _next_link(page): return text.extract(page, '= 0 else None @staticmethod - def _next_ks(page): - pos = page.index('