Skip to content

Commit

Permalink
[keenspot] support all comics and "random" access (#223)
Browse files (browse the repository at this point in the history)
  • Loading branch information
mikf committed Jun 1, 2019
1 parent 6a34f4b commit 731c7cb
Showing 1 changed file with 70 additions and 19 deletions.
89 changes: 70 additions & 19 deletions gallery_dl/extractor/keenspot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class KeenspotComicExtractor(Extractor):
directory_fmt = ("{category}", "{comic}")
filename_fmt = "{filename}.{extension}"
archive_fmt = "{comic}_{filename}"
# Group 1: the comic's subdomain, group 2: the optional path after the host.
# The subdomain is restricted to hostname characters; a looser class such as
# [^.]+ can absorb a leading "http://www" and thereby step around the
# negative lookahead that is meant to reject the www. and forums. hosts.
pattern = r"(?:https?://)?(?!www\.|forums\.)([\w-]+)\.keenspot\.com(/.+)?"
test = (
("http://marksmen.keenspot.com/", { # link
"range": "1-3",
Expand All @@ -37,57 +37,102 @@ class KeenspotComicExtractor(Extractor):
"range": "1-3",
"url": "de21b12887ef31ff82edccbc09d112e3885c3aab"
}),
("http://twokinds.keenspot.com/comic/1066/", { # "random" access
"range": "1-3",
"url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04",
})
)

def __init__(self, match):
    """Initialise per-comic state from the URL match.

    ``match`` is the match object for ``pattern``; group 1 is read as the
    comic's subdomain and group 2 as the optional path after the host.
    """
    Extractor.__init__(self, match)
    # Lower-cased so the name comparisons in _first() do not depend on
    # how the URL was capitalised.
    self.comic = match.group(1).lower()
    self.path = match.group(2)
    self.root = "http://" + self.comic + ".keenspot.com"
    # Defaults for the common layout; _first() overrides them per comic.
    self._needle = ""
    self._image = 'class="ksc"'
    self._next = self._next_needle

def items(self):
    """Yield the version and directory messages, then one URL per image.

    Crawling starts at the page named in the input URL when it had a
    path, otherwise at the first page reported by ``_first()``, and
    follows ``self._next`` until it returns nothing or ``"/"``.
    """
    data = {"comic": self.comic}
    yield Message.Version, 1
    yield Message.Directory, data

    # _first() is called even when a path was given: besides locating the
    # first page it selects _next/_image/_needle for this comic's layout.
    url = self._first(self.request(self.root + "/").text)
    if self.path:
        url = self.root + self.path

    marker = self._image
    skip = len(marker)
    while url and url != "/":
        page = self.request(text.urljoin(self.root, url)).text

        pos = page.find(marker)
        while pos >= 0:
            img, pos = text.extract(page, 'src="', '"', pos + skip)
            # Skip script sources, and markers with no usable src value
            # (assumes text.extract gives a falsy result in that case).
            if img and not img.endswith(".js"):
                if img[0] == "/":
                    img = self.root + img
                elif "youtube.com/" in img:
                    # Prefix embedded YouTube sources with "ytdl:".
                    img = "ytdl:" + img
                yield Message.Url, img, text.nameext_from_url(img, data)
            pos = page.find(marker, pos)

        url = self._next(page)

def _first(self, page):
    """Return the URL of the comic's first page, or None if unrecognised.

    ``page`` is the HTML of the comic's front page.  The layout checks run
    in a fixed order and the first one that applies wins.  As a side
    effect the matching branch sets ``self._next``, ``self._image`` and/or
    ``self._needle`` so that ``items()`` can walk this layout.
    """
    if self.comic == "brawlinthefamily":
        self._next = self._next_brawl
        self._image = '<div id="comic">'
        return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"

    url = text.extract(page, '<link rel="first" href="', '"')[0]
    if url:
        if self.comic == "porcelain":
            # porcelain keeps the default _next_needle with its own needle
            self._needle = 'id="porArchivetop_"'
        else:
            self._next = self._next_link
        return url

    pos = page.find('id="first_day1"')
    if pos >= 0:
        self._next = self._next_id
        return text.rextract(page, 'href="', '"', pos)[0]

    pos = page.find('>FIRST PAGE<')
    if pos >= 0:
        if self.comic == "lastblood":
            self._next = self._next_lastblood
            self._image = '<div id="comic">'
        else:
            self._next = self._next_id
        return text.rextract(page, 'href="', '"', pos)[0]

    pos = page.find('<div id="kscomicpart"')
    if pos >= 0:
        # handled by the default _next_needle with this needle
        self._needle = '<a href="/archive.html'
        return text.extract(page, 'href="', '"', pos)[0]

    pos = page.find('>First Comic<')  # twokinds
    if pos >= 0:
        self._image = '<article class="comic">'
        self._needle = 'class="navarchive"'
        return text.rextract(page, 'href="', '"', pos)[0]

    pos = page.find('id="flip_FirstDay"')  # flipside
    if pos >= 0:
        self._image = 'class="flip_Pages ksc"'
        self._needle = 'id="flip_ArcButton"'
        return text.rextract(page, 'href="', '"', pos)[0]

    self.log.error("Unrecognized page layout")
    return None

def _next_needle(self, page):
    """Return the first href found after ``self._needle`` in ``page``.

    Returns None when the needle does not occur, which ends the crawl in
    ``items()`` instead of raising ValueError.
    """
    pos = page.find(self._needle)
    if pos < 0:
        return None
    return text.extract(page, 'href="', '"', pos + len(self._needle))[0]

@staticmethod
def _next_link(page):
    """Return the href of the ``<link rel="next">`` element in ``page``."""
    target, _ = text.extract(page, '<link rel="next" href="', '"')
    return target
Expand All @@ -98,6 +143,12 @@ def _next_id(page):
return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None

@staticmethod
def _next_ks(page):
    """Return the first href that follows the archive link in ``page``."""
    needle = '<a href="/archive.html'
    # Start searching just past the archive link itself.
    start = page.index(needle) + len(needle)
    link, _ = text.extract(page, 'href="', '"', start)
    return link
@staticmethod
def _next_lastblood(page):
    """Return the href of the single-quoted ``link rel='next'`` element.

    Declared static because it takes only ``page`` yet is invoked through
    the instance as ``self._next(page)``.  Returns None when the page has
    no such element.
    """
    pos = page.find("link rel='next'")
    if pos < 0:
        return None
    return text.extract(page, "href='", "'", pos)[0]

@staticmethod
def _next_brawl(page):
    """Return the href found just before the "comic-nav-next" marker.

    Returns None when the marker or the href is missing, or when the link
    contains "?random" (treated here as the end of the comic).
    """
    pos = page.find("comic-nav-next")
    if pos < 0:
        return None
    url = text.rextract(page, 'href="', '"', pos)[0]
    # assumes rextract gives a falsy result when no href precedes pos
    if not url or "?random" in url:
        return None
    return url

0 comments on commit 731c7cb

Please sign in to comment.