From 80a75a1ecfa60094ad791ae182ec4392015921ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 29 Jan 2019 17:23:01 +0100 Subject: [PATCH] [tsumino] add gallery extractor (#161) --- docs/supportedsites.rst | 1 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/tsumino.py | 85 ++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 gallery_dl/extractor/tsumino.py diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index cf42f4d336..18366500a7 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -83,6 +83,7 @@ Simply Hentai https://www.simply-hentai.com/ Galleries, individual I SlideShare https://www.slideshare.net/ Presentations SmugMug https://www.smugmug.com/ |Capabilities-8| Optional (OAuth) The /b/ Archive https://thebarchive.com/ Threads +Tsumino https://www.tsumino.com/ Galleries Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth) Twitter https://twitter.com/ Media Timelines, Timelines, Tweets Wallhaven https://alpha.wallhaven.cc/ individual Images, Search Results Optional diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d76a6da30c..164f5462a8 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -88,6 +88,7 @@ "slideshare", "smugmug", "thebarchive", + "tsumino", "tumblr", "twitter", "wallhaven", diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py new file mode 100644 index 0000000000..a8170b3491 --- /dev/null +++ b/gallery_dl/extractor/tsumino.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.tsumino.com/""" + +from .common import ChapterExtractor +from .. 
import text


class TsuminoGalleryExtractor(ChapterExtractor):
    """Extractor for image galleries on tsumino.com"""
    category = "tsumino"
    subcategory = "gallery"
    filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    archive_fmt = "{gallery_id}_{page}"
    # Accepts both the info page and the reader page of a gallery;
    # the only capture group is the numeric gallery ID.
    pattern = [r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
               r"/(?:Book/Info|Read/View)/(\d+)"]
    test = [
        ("https://www.tsumino.com/Book/Info/45834", {
            "url": "ed3e39bc21221fbd21b9a2ba711e8decb6fdc6bc",
            "keyword": "5acc43f67c61f5312e0b5d6c9d6b1276cda438fc",
        }),
        ("https://www.tsumino.com/Read/View/45834", None),
    ]
    root = "https://www.tsumino.com"

    def __init__(self, match):
        """Store the gallery ID from 'match' and target its info page"""
        self.gallery_id = match.group(1)
        # Both URL forms matched by 'pattern' are mapped to /Book/Info/<id>
        url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
        ChapterExtractor.__init__(self, url)

        # Only set a session ID if the cookie jar does not have one yet.
        # NOTE(review): presumably the site rejects requests without this
        # cookie; the hard-coded value may expire -- confirm.
        self.session.cookies.setdefault(
            "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")

    def get_metadata(self, page):
        """Return a dict of gallery metadata parsed from 'page'

        'page' is the HTML text of the gallery's info page. Fields whose
        marker cannot be found end up as empty values instead of raising.
        """
        title, pos = text.extract(page, '"og:title" content="', '"')
        thumb, pos = text.extract(page, '"og:image" content="', '"', pos)
        # 'or ""' keeps the string methods below safe when nothing was found
        title_en, _, title_jp = text.unescape(title or "").partition("/")

        def field(marker):
            """Return the text following 'marker', or "" if unavailable"""
            nonlocal pos
            # An empty end delimiter would match immediately and yield ""
            # for every field, so read up to the container's closing tag.
            # NOTE(review): '</div>' is assumed to be that closing tag --
            # confirm against the live page markup.
            value, pos = text.extract(page, marker, "</div>", pos)
            return value or ""

        # The lookups share one advancing search position, so they must
        # stay in the order in which the fields are searched on the page.
        uploader = field('id="Uploader">')
        date = field('id="Uploaded">')
        rating = field('id="Rating">')
        gtype = field('id="Category">')
        collection = field('id="Collection">')
        group = field('id="Group">')
        artist = field('id="Artist">')
        parody = field('id="Parody">')
        character = field('id="Character">')
        tags = field('id="Tag">')

        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "title": title_en.strip(),
            "title_jp": title_jp.strip(),
            "thumbnail": thumb,
            "uploader": text.remove_html(uploader),
            "date": date.strip(),
            # only the part before the first space is parsed as a number
            "rating": text.parse_float(rating.partition(" ")[0]),
            "type": text.remove_html(gtype),
            "collection": text.remove_html(collection),
            "group": text.remove_html(group),
            "artist": ", ".join(text.split_html(artist)),
            "parodies": ", ".join(text.split_html(parody)),
            "characters": ", ".join(text.split_html(character)),
            "tags": ", ".join(text.split_html(tags)),
            "language": "English",
            "lang": "en",
        }

    def get_images(self, page):
        """Return a list of (image-url, None) tuples for this gallery

        'page' is not used; the image names are requested separately
        from the site's /Read/Load/ endpoint as JSON.
        """
        url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
        # NOTE(review): self.url is presumably the info-page URL handed to
        # ChapterExtractor.__init__ -- confirm.
        data = self.request(url, headers={"Referer": self.url}).json()
        base = self.root + "/Image/Object?name="

        return [
            (base + text.quote(name), None)
            for name in data["reader_page_urls"]
        ]