From b8fed34548c30b3cecca4bda2b2fb24c90213fbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 19 Jan 2019 14:28:59 +0100 Subject: [PATCH] add generalized extractors for Mastodon instances (#144) Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path. Example: { "extractor": { "mastodon": { "pawoo.net": { ... }, "mastodon.xyz": { ... }, "tabletop.social": { ... }, ... } } } Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:'. An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overwritten as necessary. --- CHANGELOG.md | 2 + gallery_dl/extractor/__init__.py | 4 +- gallery_dl/extractor/mastodon.py | 175 +++++++++++++++++++++++++++++++ gallery_dl/extractor/oauth.py | 90 +++++++++++++++- gallery_dl/extractor/pawoo.py | 140 ------------------------- gallery_dl/version.py | 4 +- test/test_extractor.py | 4 +- 7 files changed, 269 insertions(+), 150 deletions(-) create mode 100644 gallery_dl/extractor/mastodon.py delete mode 100644 gallery_dl/extractor/pawoo.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a86fced8d9..39628d3eb6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # Changelog +## Unreleased + ## 1.6.3 - 2019-01-18 - Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135)) - Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149)) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 48c4351d45..0247766df4 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2018 Mike Fährmann +# Copyright 2015-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -67,7 +67,6 @@ "nijie", "nyafuu", "paheal", - "pawoo", "piczel", "pinterest", "pixiv", @@ -95,6 +94,7 @@ "yandere", "xvideos", "yuki", + "mastodon", "imagehosts", "directlink", "recursive", diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py new file mode 100644 index 0000000000..16b16304f6 --- /dev/null +++ b/gallery_dl/extractor/mastodon.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for mastodon instances""" + +from .common import Extractor, Message +from .. import text, config, exception +import re + + +class MastodonExtractor(Extractor): + """Base class for mastodon extractors""" + basecategory = "mastodon" + directory_fmt = ["mastodon", "{category}", "{account[username]}"] + filename_fmt = "{category}_{id}_{media[id]}.{extension}" + archive_fmt = "{media[id]}" + instance = None + + def __init__(self, match): + Extractor.__init__(self) + self.instance = match.group(1) + self.api = MastodonAPI(self, self.instance) + + def config(self, key, default=None): + return config.interpolate( + ("extractor", "mastodon", self.category, self.subcategory, key), + default, + ) + + def items(self): + yield Message.Version, 1 + for status in self.statuses(): + attachments = self.prepare(status) + yield Message.Directory, status + for media in attachments: + status["media"] = media + url = media["url"] + yield Message.Url, url, text.nameext_from_url(url, status) + + def statuses(self): + """Return an iterable containing all relevant Status-objects""" + return () + + @staticmethod + def prepare(status): + """Prepare a status object""" + attachments = status["media_attachments"] + del status["media_attachments"] + return attachments + + +class MastodonUserExtractor(MastodonExtractor): + """Extractor for all images of an account/user""" + subcategory = "user" + + def __init__(self, match): + MastodonExtractor.__init__(self, match) + self.account_name = match.group(2) + + def statuses(self): + results = self.api.account_search("@" + self.account_name, 1) + for account in results: + if account["username"] == self.account_name: + break + else: + raise exception.NotFoundError("account") + return self.api.account_statuses(account["id"]) + + +class MastodonStatusExtractor(MastodonExtractor): + """Extractor for images from a status""" + subcategory = "status" + + def __init__(self, match): + MastodonExtractor.__init__(self, match) + self.status_id = match.group(2) + + def statuses(self): + return (self.api.status(self.status_id),) + + +class MastodonAPI(): + """Minimal interface for the Mastodon API + + https://github.com/tootsuite/mastodon + https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md + """ + + def __init__(self, extractor, instance, access_token=None): + self.instance = instance + self.extractor = extractor + self.headers = {"Authorization": "Bearer {}".format( + extractor.config("access-token", access_token))} + + def account_search(self, query, limit=40): + """Search for content""" + params = {"q": query, "limit": limit} + return self._call("accounts/search", params) + + def account_statuses(self, account_id): + """Get an account's statuses""" + endpoint = "accounts/{}/statuses".format(account_id) + params = {"only_media": "1"} + return self._pagination(endpoint, params) + + def status(self, status_id): + """Fetch a Status""" + return self._call("statuses/" + status_id) + + def _call(self, endpoint, params=None): + url = "https://{}/api/v1/{}".format(self.instance, endpoint) + response = self.extractor.request( + url, params=params, headers=self.headers) + return self._parse(response) + + def _pagination(self, endpoint, params): + url = "https://{}/api/v1/{}".format(self.instance, endpoint) + while url: + response = self.extractor.request( + url, params=params, headers=self.headers) + yield from self._parse(response) + url = response.links.get("next", {}).get("url") + + @staticmethod + def _parse(response): + """Parse an API response""" + if response.status_code == 404: + raise exception.NotFoundError() + return response.json() + + +def generate_extractors(): + """Dynamically generate Extractor classes for Mastodon instances""" + + symtable = globals() + mastodon = config.get(("extractor", "mastodon")) or {} + + if "pawoo.net" not in mastodon: + mastodon["pawoo.net"] = { + "access-token" : "286462927198d0cf3e24683e91c8259a" + "ac4367233064e0570ca18df2ac65b226", + "client-id" : "97b142b6904abf97a1068d51a7bc2f2f" + "cf9323cef81f13cb505415716dba7dac", + "client-secret": "e45bef4bad45b38abf7d9ef88a646b73" + "75e7fb2532c31a026327a93549236481", + } + + for instance, info in mastodon.items(): + + if not isinstance(info, dict): + continue + + class UserExtractor(MastodonUserExtractor): + pattern = [r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format( + re.escape(instance))] + + class StatusExtractor(MastodonStatusExtractor): + pattern = [r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format( + re.escape(instance))] + + name = re.sub(r"[^A-Za-z]+", "", instance).capitalize() + + for extr in (UserExtractor, StatusExtractor): + extr.category = instance + extr.__name__ = name + extr.__name__ + extr.__doc__ = "{} on {}".format(extr.__base__.__doc__, instance) + symtable[extr.__name__] = extr + + +generate_extractors() diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 7c68254129..0c3a360618 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2018 Mike Fährmann +# Copyright 2017-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,8 @@ from .common import Extractor, Message from . import deviantart, flickr, reddit, smugmug, tumblr -from .. import text, oauth, config +from .. import text, oauth, config, exception +from ..cache import cache import os import urllib.parse @@ -82,7 +83,6 @@ def _oauth1_authorization_flow( data = self.open(authorize_url, params) # exchange the request token for an access token - # self.session.token = data["oauth_token"] data = self.session.get(access_token_url, params=data).text data = text.parse_query(data) @@ -94,7 +94,8 @@ def _oauth1_authorization_flow( def _oauth2_authorization_code_grant( self, client_id, client_secret, auth_url, token_url, - scope="read", key="refresh_token", auth=True): + scope="read", key="refresh_token", auth=True, + message_template=None): """Perform an OAuth2 authorization code grant""" state = "gallery-dl_{}_{}".format( @@ -147,11 +148,15 @@ def _oauth2_authorization_code_grant( # display token part = key.partition("_")[0] - self.send(OAUTH2_MSG_TEMPLATE.format( + template = message_template or OAUTH2_MSG_TEMPLATE + self.send(template.format( category=self.subcategory, key=part, Key=part.capitalize(), token=data[key], + instance=getattr(self, "instance", ""), + client_id=client_id, + client_secret=client_secret, )) @@ -254,6 +259,55 @@ def items(self): ) +class OAuthMastodon(OAuthBase): + subcategory = "mastodon" + pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"] + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.instance = match.group(1) + + def items(self): + yield Message.Version, 1 + + application = self.oauth_config(self.instance) + if not application: + application = self._register(self.instance) + + self._oauth2_authorization_code_grant( + application["client-id"], + application["client-secret"], + "https://{}/oauth/authorize".format(self.instance), + "https://{}/oauth/token".format(self.instance), + key="access_token", + message_template=MASTODON_MSG_TEMPLATE, + ) + + @cache(maxage=10*365*24*60*60, keyarg=1) + def _register(self, instance): + self.log.info("Registering application for '%s'", instance) + + url = "https://{}/api/v1/apps".format(instance) + data = { + "client_name": "gdl:" + oauth.nonce(8), + "redirect_uris": self.redirect_uri, + "scopes": "read", + } + data = self.session.post(url, data=data).json() + + if "client_id" not in data or "client_secret" not in data: + self.log.error("Failed to register new application: '%s'", data) + raise exception.StopExtraction() + + data["client-id"] = data.pop("client_id") + data["client-secret"] = data.pop("client_secret") + + self.log.info("client-id:\n%s", data["client-id"]) + self.log.info("client-secret:\n%s", data["client-secret"]) + + return data + + OAUTH1_MSG_TEMPLATE = """ Your Access Token and Access Token Secret are @@ -293,3 +347,29 @@ def items(self): }} }} """ + + +MASTODON_MSG_TEMPLATE = """ +Your {Key} Token is + +{token} + +Put this value into your configuration file as +'extractor.mastodon.{instance}.{key}-token'. + +You can also add your 'client-id' and 'client-secret' values +if you want to register another account in the future. + +Example: +{{ + "extractor": {{ + "mastodon": {{ + "{instance}": {{ + "{key}-token": "{token}", + "client-id": "{client_id}", + "client-secret": "{client_secret}" + }} + }} + }} +}} +""" diff --git a/gallery_dl/extractor/pawoo.py b/gallery_dl/extractor/pawoo.py deleted file mode 100644 index 23f8af9a2c..0000000000 --- a/gallery_dl/extractor/pawoo.py +++ /dev/null @@ -1,140 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2017-2018 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract images from https://pawoo.net""" - -from .common import Extractor, Message -from .. import text, exception - - -class PawooExtractor(Extractor): - """Base class for pawoo extractors""" - category = "pawoo" - directory_fmt = ["{category}", "{account[username]}"] - filename_fmt = "{category}_{id}_{media[id]}.{extension}" - archive_fmt = "{media[id]}" - - def __init__(self): - Extractor.__init__(self) - self.api = MastodonAPI(self) - - def items(self): - yield Message.Version, 1 - for status in self.statuses(): - attachments = self.prepare(status) - yield Message.Directory, status - for media in attachments: - status["media"] = media - url = media["url"] - yield Message.Url, url, text.nameext_from_url(url, status) - - def statuses(self): - """Return an iterable containing all relevant Status-objects""" - return [] - - @staticmethod - def prepare(status): - """Prepare a status object""" - attachments = status["media_attachments"] - del status["media_attachments"] - return attachments - - -class PawooUserExtractor(PawooExtractor): - """Extractor for all images of an account/user on pawoo.net""" - subcategory = "user" - pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"] - test = [ - ("https://pawoo.net/@kuroda", { - "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99", - }), - ("https://pawoo.net/@zZzZz/", { - "exception": exception.NotFoundError, - }), - ("https://pawoo.net/@kuroda/media", None), - ] - - def __init__(self, match): - PawooExtractor.__init__(self) - self.account_name = match.group(1) - - def statuses(self): - results = self.api.account_search("@" + self.account_name, 1) - for account in results: - if account["username"] == self.account_name: - break - else: - raise exception.NotFoundError("account") - return self.api.account_statuses(account["id"]) - - -class PawooStatusExtractor(PawooExtractor): - """Extractor for images from a status on pawoo.net""" - subcategory = "status" - pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"] - test = [ - ("https://pawoo.net/@takehana_note/559043", { - "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3", - "content": "3b148cf90174173355fe34179741ce476921b2fc", - }), - ("https://pawoo.net/@zZzZz/12346", { - "exception": exception.NotFoundError, - }), - ] - - def __init__(self, match): - PawooExtractor.__init__(self) - self.status_id = match.group(1) - - def statuses(self): - return (self.api.status(self.status_id),) - - -class MastodonAPI(): - """Minimal interface for the Mastodon API on pawoo.net - - https://github.com/tootsuite/mastodon - https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md - """ - - def __init__(self, extractor, root="https://pawoo.net", - access_token=("286462927198d0cf3e24683e91c8259a" - "ac4367233064e0570ca18df2ac65b226")): - self.root = root - self.extractor = extractor - extractor.session.headers["Authorization"] = "Bearer {}".format( - extractor.config("access-token", access_token)) - - def account_search(self, query, limit=40): - """Search for content""" - url = "{}/api/v1/accounts/search".format(self.root) - params = {"q": query, "limit": limit} - response = self.extractor.request(url, params=params) - return self._parse(response) - - def account_statuses(self, account_id): - """Get an account's statuses""" - url = "{}/api/v1/accounts/{}/statuses?only_media=1".format( - self.root, account_id) - while url: - response = self.extractor.request(url) - yield from self._parse(response) - url = response.links.get("next", {}).get("url") - - def status(self, status_id): - """Fetch a Status""" - url = "{}/api/v1/statuses/{}".format(self.root, status_id) - response = self.extractor.request(url, expect=(404,)) - return self._parse(response) - - @staticmethod - def _parse(response): - """Parse an API response""" - if response.status_code == 404: - raise exception.NotFoundError() - return response.json() diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 21d07e5752..d7e8ea2960 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2018 Mike Fährmann +# Copyright 2016-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.6.3" +__version__ = "1.7.0-dev" diff --git a/test/test_extractor.py b/test/test_extractor.py index cc9a93cdb7..d15c23323b 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2018 Mike Fährmann +# Copyright 2018-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -149,6 +149,8 @@ def test_names(self): def capitalize(c): if "-" in c: return string.capwords(c.replace("-", " ")).replace(" ", "") + if "." in c: + c = c.replace(".", "") return c.capitalize() mapping = {