add generalized extractors for Mastodon instances (#144)

Extractors for Mastodon instances can now be dynamically generated, based on the instance names in the 'extractor.mastodon.*' config path. Example: { "extractor": { "mastodon": { "pawoo.net": { ... }, "mastodon.xyz": { ... }, "tabletop.social": { ... }, ... } } } Each entry requires an 'access-token' value, which can be generated with 'gallery-dl oauth:mastodon:<instance URL>'. An 'access-token' (as well as a 'client-id' and 'client-secret') for pawoo.net is always available, but can be overridden as necessary.
mikf · Jan 19, 2019 · b8fed34 · b8fed34
1 parent 4b441c1
commit b8fed34
Show file tree

Hide file tree

Showing 7 changed files with 269 additions and 150 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,7 @@
 # Changelog
 
+## Unreleased
+
 ## 1.6.3 - 2019-01-18
 - Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
 - Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -67,7 +67,6 @@
     "nijie",
     "nyafuu",
     "paheal",
-    "pawoo",
     "piczel",
     "pinterest",
     "pixiv",
@@ -95,6 +94,7 @@
     "yandere",
     "xvideos",
     "yuki",
+    "mastodon",
     "imagehosts",
     "directlink",
     "recursive",

diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for mastodon instances"""
+
+from .common import Extractor, Message
+from .. import text, config, exception
+import re
+
+
+class MastodonExtractor(Extractor):
+    """Base class for mastodon extractors"""
+    basecategory = "mastodon"
+    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
+    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
+    archive_fmt = "{media[id]}"
+    instance = None
+
+    def __init__(self, match):
+        Extractor.__init__(self)
+        # the first pattern group holds the name of the instance
+        self.instance = match.group(1)
+        self.api = MastodonAPI(self, self.instance)
+
+    def config(self, key, default=None):
+        """Interpolate 'key' along the instance-specific config path"""
+        path = ("extractor", "mastodon", self.category, self.subcategory, key)
+        return config.interpolate(path, default)
+
+    def items(self):
+        """Yield one Directory message per status and one Url per media"""
+        yield Message.Version, 1
+        for status in self.statuses():
+            # strip the attachment list before 'status' is used as metadata
+            media_list = self.prepare(status)
+            yield Message.Directory, status
+            for media in media_list:
+                status["media"] = media
+                media_url = media["url"]
+                yield (Message.Url, media_url,
+                       text.nameext_from_url(media_url, status))
+
+    def statuses(self):
+        """Return an iterable of the Status objects to process"""
+        return ()
+
+    @staticmethod
+    def prepare(status):
+        """Detach and return the media attachments of a status object"""
+        return status.pop("media_attachments")
+
+
+class MastodonUserExtractor(MastodonExtractor):
+    """Extractor for all images of an account/user"""
+    subcategory = "user"
+
+    def __init__(self, match):
+        MastodonExtractor.__init__(self, match)
+        # the second pattern group holds the account name
+        self.account_name = match.group(2)
+
+    def statuses(self):
+        """Return the statuses of the account named 'account_name'
+
+        Raises NotFoundError if the search results do not contain
+        an account with exactly this username.
+        """
+        handle = "@" + self.account_name
+        for account in self.api.account_search(handle, 1):
+            if account["username"] == self.account_name:
+                return self.api.account_statuses(account["id"])
+        raise exception.NotFoundError("account")
+
+
+class MastodonStatusExtractor(MastodonExtractor):
+    """Extractor for images from a status"""
+    subcategory = "status"
+
+    def __init__(self, match):
+        MastodonExtractor.__init__(self, match)
+        # the second pattern group holds the numeric status ID
+        self.status_id = match.group(2)
+
+    def statuses(self):
+        """Return a 1-tuple holding the status with ID 'status_id'"""
+        status = self.api.status(self.status_id)
+        return (status,)
+
+
+class MastodonAPI():
+    """Minimal interface for the Mastodon API
+
+    https://github.com/tootsuite/mastodon
+    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
+    """
+
+    def __init__(self, extractor, instance, access_token=None):
+        """Bind the API to 'instance' and build the bearer-token header
+
+        The token is the extractor's 'access-token' config value,
+        with 'access_token' as its default.
+        """
+        self.instance = instance
+        self.extractor = extractor
+        self.headers = {"Authorization": "Bearer {}".format(
+            extractor.config("access-token", access_token))}
+
+    def account_search(self, query, limit=40):
+        """Search for accounts matching 'query'"""
+        params = {"q": query, "limit": limit}
+        return self._call("accounts/search", params)
+
+    def account_statuses(self, account_id):
+        """Iterate over an account's statuses (requested with only_media=1)"""
+        endpoint = "accounts/{}/statuses".format(account_id)
+        params = {"only_media": "1"}
+        return self._pagination(endpoint, params)
+
+    def status(self, status_id):
+        """Fetch a single Status"""
+        return self._call("statuses/" + status_id)
+
+    def _call(self, endpoint, params=None):
+        """Send one request to 'endpoint' and return its parsed response"""
+        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
+        response = self.extractor.request(
+            url, params=params, headers=self.headers)
+        return self._parse(response)
+
+    def _pagination(self, endpoint, params):
+        """Yield the results of 'endpoint' from all of its pages"""
+        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
+        while url:
+            response = self.extractor.request(
+                url, params=params, headers=self.headers)
+            yield from self._parse(response)
+            url = response.links.get("next", {}).get("url")
+            # the 'next' link is a complete URL including its query string;
+            # sending 'params' again would duplicate every parameter
+            params = None
+
+    @staticmethod
+    def _parse(response):
+        """Return the JSON payload of 'response'; raise on HTTP 404"""
+        if response.status_code == 404:
+            raise exception.NotFoundError()
+        return response.json()
+
+
+def generate_extractors():
+    """Create user and status extractor classes for each instance
+
+    Instances are the keys of the 'extractor.mastodon' config value;
+    an entry for 'pawoo.net' is added if none is present. Every
+    generated class is stored in this module's globals.
+    """
+
+    namespace = globals()
+    instances = config.get(("extractor", "mastodon")) or {}
+
+    pawoo_defaults = {
+        "access-token" : "286462927198d0cf3e24683e91c8259a"
+                         "ac4367233064e0570ca18df2ac65b226",
+        "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
+                         "cf9323cef81f13cb505415716dba7dac",
+        "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
+                         "75e7fb2532c31a026327a93549236481",
+    }
+    if "pawoo.net" not in instances:
+        instances["pawoo.net"] = pawoo_defaults
+
+    for instance, info in instances.items():
+
+        # only dict values are treated as instance entries
+        if isinstance(info, dict):
+            host = re.escape(instance)
+
+            class UserExtractor(MastodonUserExtractor):
+                pattern = [
+                    r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(
+                        host)]
+
+            class StatusExtractor(MastodonStatusExtractor):
+                pattern = [
+                    r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(host)]
+
+            # class-name prefix: the instance name reduced to its letters
+            prefix = re.sub(r"[^A-Za-z]+", "", instance).capitalize()
+
+            for cls in (UserExtractor, StatusExtractor):
+                cls.category = instance
+                cls.__name__ = prefix + cls.__name__
+                cls.__doc__ = "{} on {}".format(
+                    cls.__base__.__doc__, instance)
+                namespace[cls.__name__] = cls
+
+
+# build the extractor classes when this module gets imported
+generate_extractors()
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2018 Mike Fährmann
+# Copyright 2017-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,8 @@
 
 from .common import Extractor, Message
 from . import deviantart, flickr, reddit, smugmug, tumblr
-from .. import text, oauth, config
+from .. import text, oauth, config, exception
+from ..cache import cache
 import os
 import urllib.parse
 
@@ -82,7 +83,6 @@ def _oauth1_authorization_flow(
         data = self.open(authorize_url, params)
 
         # exchange the request token for an access token
-        # self.session.token = data["oauth_token"]
         data = self.session.get(access_token_url, params=data).text
 
         data = text.parse_query(data)
@@ -94,7 +94,8 @@ def _oauth1_authorization_flow(
 
     def _oauth2_authorization_code_grant(
             self, client_id, client_secret, auth_url, token_url,
-            scope="read", key="refresh_token", auth=True):
+            scope="read", key="refresh_token", auth=True,
+            message_template=None):
         """Perform an OAuth2 authorization code grant"""
 
         state = "gallery-dl_{}_{}".format(
@@ -147,11 +148,15 @@ def _oauth2_authorization_code_grant(
 
         # display token
         part = key.partition("_")[0]
-        self.send(OAUTH2_MSG_TEMPLATE.format(
+        template = message_template or OAUTH2_MSG_TEMPLATE
+        self.send(template.format(
             category=self.subcategory,
             key=part,
             Key=part.capitalize(),
             token=data[key],
+            instance=getattr(self, "instance", ""),
+            client_id=client_id,
+            client_secret=client_secret,
         ))
 
 
@@ -254,6 +259,55 @@ def items(self):
         )
 
 
+class OAuthMastodon(OAuthBase):
+    # Obtains an access token for the instance named in the URL
+    subcategory = "mastodon"
+    pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"]
+
+    def __init__(self, match):
+        OAuthBase.__init__(self, match)
+        self.instance = match.group(1)
+
+    def items(self):
+        yield Message.Version, 1
+
+        # register a new application if oauth_config() returns nothing
+        application = (self.oauth_config(self.instance) or
+                       self._register(self.instance))
+
+        authorize_url = "https://{}/oauth/authorize".format(self.instance)
+        token_url = "https://{}/oauth/token".format(self.instance)
+        self._oauth2_authorization_code_grant(
+            application["client-id"],
+            application["client-secret"],
+            authorize_url,
+            token_url,
+            key="access_token",
+            message_template=MASTODON_MSG_TEMPLATE,
+        )
+
+    @cache(maxage=10*365*24*60*60, keyarg=1)
+    def _register(self, instance):
+        """Register an application on 'instance' and return its data
+
+        The returned dict is the server's response with 'client_id' and
+        'client_secret' renamed to 'client-id' and 'client-secret'.
+        Raises StopExtraction if the response lacks either value.
+        """
+        self.log.info("Registering application for '%s'", instance)
+
+        url = "https://{}/api/v1/apps".format(instance)
+        payload = {
+            "client_name": "gdl:" + oauth.nonce(8),
+            "redirect_uris": self.redirect_uri,
+            "scopes": "read",
+        }
+        app = self.session.post(url, data=payload).json()
+
+        if not ("client_id" in app and "client_secret" in app):
+            self.log.error("Failed to register new application: '%s'", app)
+            raise exception.StopExtraction()
+
+        # switch to the key names used by the config file
+        app["client-id"] = app.pop("client_id")
+        app["client-secret"] = app.pop("client_secret")
+
+        self.log.info("client-id:\n%s", app["client-id"])
+        self.log.info("client-secret:\n%s", app["client-secret"])
+
+        return app
+
+
 OAUTH1_MSG_TEMPLATE = """
 Your Access Token and Access Token Secret are
 
@@ -293,3 +347,29 @@ def items(self):
     }}
 }}
 """
+
+
+# Template for the message that displays the token obtained by
+# OAuthMastodon. It is filled in with str.format() using the fields
+# 'Key', 'key', 'token', 'instance', 'client_id' and 'client_secret';
+# doubled braces produce literal braces in the JSON example.
+MASTODON_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.mastodon.{instance}.{key}-token'.
+
+You can also add your 'client-id' and 'client-secret' values
+if you want to register another account in the future.
+
+Example:
+{{
+    "extractor": {{
+        "mastodon": {{
+            "{instance}": {{
+                "{key}-token": "{token}",
+                "client-id": "{client_id}",
+                "client-secret": "{client_secret}"
+            }}
+        }}
+    }}
+}}
+"""