[tsumino] add gallery extractor (#161)

mikf · Jan 29, 2019 · 80a75a1 · 80a75a1
1 parent 2d2953a
commit 80a75a1
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 0 deletions.
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
@@ -83,6 +83,7 @@ Simply Hentai        https://www.simply-hentai.com/      Galleries, individual I
 SlideShare           https://www.slideshare.net/         Presentations
 SmugMug              https://www.smugmug.com/            |Capabilities-8|                                   Optional (OAuth)
 The /b/ Archive      https://thebarchive.com/            Threads
+Tsumino              https://www.tsumino.com/            Galleries
 Tumblr               https://www.tumblr.com/             Images from Users, Likes, Posts, Tag-Searches      Optional (OAuth)
 Twitter              https://twitter.com/                Media Timelines, Timelines, Tweets
 Wallhaven            https://alpha.wallhaven.cc/         individual Images, Search Results                  Optional

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -88,6 +88,7 @@
     "slideshare",
     "smugmug",
     "thebarchive",
+    "tsumino",
     "tumblr",
     "twitter",
     "wallhaven",

diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tsumino.com/"""
+
+from .common import ChapterExtractor
+from .. import text
+
+
+class TsuminoGalleryExtractor(ChapterExtractor):
+    """Extractor for image galleries on tsumino.com"""
+    category = "tsumino"
+    subcategory = "gallery"
+    filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
+    directory_fmt = ["{category}", "{gallery_id} {title}"]
+    archive_fmt = "{gallery_id}_{page}"
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+               r"/(?:Book/Info|Read/View)/(\d+)"]  # group 1 = gallery id
+    test = [
+        ("https://www.tsumino.com/Book/Info/45834", {
+            "url": "ed3e39bc21221fbd21b9a2ba711e8decb6fdc6bc",
+            "keyword": "5acc43f67c61f5312e0b5d6c9d6b1276cda438fc",
+        }),
+        ("https://www.tsumino.com/Read/View/45834", None),
+    ]
+    root = "https://www.tsumino.com"  # prefix of every URL built below
+
+    def __init__(self, match):  # match.group(1) must be the gallery id
+        self.gallery_id = match.group(1)
+        url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
+        ChapterExtractor.__init__(self, url)  # Info page, even for Read/View
+
+        self.session.cookies.setdefault(
+            "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")  # fixed literal
+
+    def get_metadata(self, page):  # returns a dict built from page
+        extr = text.extract  # called below; returns (value, pos)
+        title, pos = extr(page, '"og:title" content="', '"')
+        thumb, pos = extr(page, '"og:image" content="', '"', pos)
+        title_en, _, title_jp = text.unescape(title).partition("/")
+        # pos is passed along; assumes the page lists fields in this order
+        uploader  , pos = extr(page, 'id="Uploader">'  , '</div>', pos)
+        date      , pos = extr(page, 'id="Uploaded">'  , '</div>', pos)
+        rating    , pos = extr(page, 'id="Rating">'    , '</div>', pos)
+        gtype     , pos = extr(page, 'id="Category">'  , '</div>', pos)
+        collection, pos = extr(page, 'id="Collection">', '</div>', pos)
+        group     , pos = extr(page, 'id="Group">'     , '</div>', pos)
+        artist    , pos = extr(page, 'id="Artist">'    , '</div>', pos)
+        parody    , pos = extr(page, 'id="Parody">'    , '</div>', pos)
+        character , pos = extr(page, 'id="Character">' , '</div>', pos)
+        tags      , pos = extr(page, 'id="Tag">'       , '</div>', pos)
+        # build the result dict from the snippets extracted above
+        return {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title": title_en.strip(),  # part before the first "/"
+            "title_jp": title_jp.strip(),  # part after it, "" if no "/"
+            "thumbnail": thumb,
+            "uploader": text.remove_html(uploader),
+            "date": date.strip(),
+            "rating": text.parse_float(rating.partition(" ")[0]),  # first word
+            "type": text.remove_html(gtype),
+            "collection": text.remove_html(collection),
+            "group": text.remove_html(group),
+            "artist": ", ".join(text.split_html(artist)),
+            "parodies": ", ".join(text.split_html(parody)),
+            "characters": ", ".join(text.split_html(character)),
+            "tags": ", ".join(text.split_html(tags)),
+            "language": "English",  # constant for all galleries
+            "lang": "en",
+        }
+
+    def get_images(self, page):  # page is not used in the body
+        url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
+        data = self.request(url, headers={"Referer": self.url}).json()
+        base = self.root + "/Image/Object?name="
+        # one (url, None) pair per name in data["reader_page_urls"]
+        return [
+            (base + text.quote(name), None)
+            for name in data["reader_page_urls"]
+        ]