From b8fed34548c30b3cecca4bda2b2fb24c90213fbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Sat, 19 Jan 2019 14:28:59 +0100
Subject: [PATCH] add generalized extractors for Mastodon instances (#144)

Extractors for Mastodon instances can now be dynamically generated,
based on the instance names in the 'extractor.mastodon.*' config path.

Example:
{
    "extractor": {
        "mastodon": {
            "pawoo.net": { ... },
            "mastodon.xyz": { ... },
            "tabletop.social": { ... },
            ...
        }
    }
}

Each entry requires an 'access-token' value, which can be generated with
'gallery-dl oauth:mastodon:<instance URL>'.
An 'access-token' (as well as a 'client-id' and 'client-secret') for
pawoo.net is always available, but can be overridden as necessary.
---
 CHANGELOG.md                     |   2 +
 gallery_dl/extractor/__init__.py |   4 +-
 gallery_dl/extractor/mastodon.py | 175 +++++++++++++++++++++++++++++++
 gallery_dl/extractor/oauth.py    |  90 +++++++++++++++-
 gallery_dl/extractor/pawoo.py    | 140 -------------------------
 gallery_dl/version.py            |   4 +-
 test/test_extractor.py           |   4 +-
 7 files changed, 269 insertions(+), 150 deletions(-)
 create mode 100644 gallery_dl/extractor/mastodon.py
 delete mode 100644 gallery_dl/extractor/pawoo.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a86fced8d9..39628d3eb6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 # Changelog
 
+## Unreleased
+
 ## 1.6.3 - 2019-01-18
 - Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
 - Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 48c4351d45..0247766df4 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -67,7 +67,6 @@
     "nijie",
     "nyafuu",
     "paheal",
-    "pawoo",
     "piczel",
     "pinterest",
     "pixiv",
@@ -95,6 +94,7 @@
     "yandere",
     "xvideos",
     "yuki",
+    "mastodon",
     "imagehosts",
     "directlink",
     "recursive",
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
new file mode 100644
index 0000000000..16b16304f6
--- /dev/null
+++ b/gallery_dl/extractor/mastodon.py
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for mastodon instances"""
+
+from .common import Extractor, Message
+from .. import text, config, exception
+import re
+
+
+class MastodonExtractor(Extractor):
+    """Base class for mastodon extractors"""
+    basecategory = "mastodon"
+    directory_fmt = ["mastodon", "{category}", "{account[username]}"]
+    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
+    archive_fmt = "{media[id]}"
+    instance = None
+
+    def __init__(self, match):
+        Extractor.__init__(self)
+        self.instance = match.group(1)
+        self.api = MastodonAPI(self, self.instance)
+
+    def config(self, key, default=None):
+        return config.interpolate(
+            ("extractor", "mastodon", self.category, self.subcategory, key),
+            default,
+        )
+
+    def items(self):
+        yield Message.Version, 1
+        for status in self.statuses():
+            attachments = self.prepare(status)
+            yield Message.Directory, status
+            for media in attachments:
+                status["media"] = media
+                url = media["url"]
+                yield Message.Url, url, text.nameext_from_url(url, status)
+
+    def statuses(self):
+        """Return an iterable containing all relevant Status-objects"""
+        return ()
+
+    @staticmethod
+    def prepare(status):
+        """Prepare a status object"""
+        attachments = status["media_attachments"]
+        del status["media_attachments"]
+        return attachments
+
+
+class MastodonUserExtractor(MastodonExtractor):
+    """Extractor for all images of an account/user"""
+    subcategory = "user"
+
+    def __init__(self, match):
+        MastodonExtractor.__init__(self, match)
+        self.account_name = match.group(2)
+
+    def statuses(self):
+        results = self.api.account_search("@" + self.account_name, 1)
+        for account in results:
+            if account["username"] == self.account_name:
+                break
+        else:
+            raise exception.NotFoundError("account")
+        return self.api.account_statuses(account["id"])
+
+
+class MastodonStatusExtractor(MastodonExtractor):
+    """Extractor for images from a status"""
+    subcategory = "status"
+
+    def __init__(self, match):
+        MastodonExtractor.__init__(self, match)
+        self.status_id = match.group(2)
+
+    def statuses(self):
+        return (self.api.status(self.status_id),)
+
+
+class MastodonAPI():
+    """Minimal interface for the Mastodon API
+
+    https://github.com/tootsuite/mastodon
+    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
+    """
+    def __init__(self, extractor, instance, access_token=None):
+        self.instance = instance
+        self.extractor = extractor
+        self.headers = {"Authorization": "Bearer {}".format(
+            extractor.config("access-token", access_token))}
+
+    def account_search(self, query, limit=40):
+        """Search for accounts"""
+        params = {"q": query, "limit": limit}
+        return self._call("accounts/search", params)
+
+    def account_statuses(self, account_id):
+        """Get an account's statuses"""
+        endpoint = "accounts/{}/statuses".format(account_id)
+        params = {"only_media": "1"}
+        return self._pagination(endpoint, params)
+
+    def status(self, status_id):
+        """Fetch a Status"""
+        return self._call("statuses/" + status_id)
+
+    def _call(self, endpoint, params=None):
+        """Call an endpoint; 404s are left for _parse() to report"""
+        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
+        return self._parse(self.extractor.request(
+            url, params=params, headers=self.headers, expect=(404,)))
+
+    def _pagination(self, endpoint, params):
+        """Yield results of all pages; 'next' URLs carry their own query"""
+        url = "https://{}/api/v1/{}".format(self.instance, endpoint)
+        while url:
+            response = self.extractor.request(
+                url, params=params, headers=self.headers)
+            yield from self._parse(response)
+            url, params = response.links.get("next", {}).get("url"), None
+
+    @staticmethod
+    def _parse(response):
+        """Parse an API response"""
+        if response.status_code == 404:
+            raise exception.NotFoundError()
+        return response.json()
+
+
+def generate_extractors():
+    """Dynamically generate Extractor classes for Mastodon instances"""
+
+    symtable = globals()
+    mastodon = config.get(("extractor", "mastodon")) or {}
+    if "pawoo.net" not in mastodon:
+        mastodon["pawoo.net"] = {
+            "access-token" : "286462927198d0cf3e24683e91c8259a"
+                             "ac4367233064e0570ca18df2ac65b226",
+            "client-id"    : "97b142b6904abf97a1068d51a7bc2f2f"
+                             "cf9323cef81f13cb505415716dba7dac",
+            "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
+                             "75e7fb2532c31a026327a93549236481",
+        }
+    # store the result, since a newly created dict is not part of the config
+    config.set(("extractor", "mastodon"), mastodon)
+
+    for instance, info in mastodon.items():
+        if not isinstance(info, dict):
+            continue
+
+        class UserExtractor(MastodonUserExtractor):
+            pattern = [r"(?:https?://)?({})/@([^/?&#]+)(?:/media)?/?$".format(
+                re.escape(instance))]
+
+        class StatusExtractor(MastodonStatusExtractor):
+            pattern = [r"(?:https?://)?({})/@[^/?&#]+/(\d+)".format(
+                re.escape(instance))]
+
+        name = re.sub(r"[^A-Za-z]+", "", instance).capitalize()
+
+        for extr in (UserExtractor, StatusExtractor):
+            extr.category = instance
+            extr.__name__ = name + extr.__name__
+            extr.__doc__ = "{} on {}".format(extr.__base__.__doc__, instance)
+            symtable[extr.__name__] = extr
+
+
+generate_extractors()
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 7c68254129..0c3a360618 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2018 Mike Fährmann
+# Copyright 2017-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,8 @@
 
 from .common import Extractor, Message
 from . import deviantart, flickr, reddit, smugmug, tumblr
-from .. import text, oauth, config
+from .. import text, oauth, config, exception
+from ..cache import cache
 import os
 import urllib.parse
 
@@ -82,7 +83,6 @@ def _oauth1_authorization_flow(
         data = self.open(authorize_url, params)
 
         # exchange the request token for an access token
-        # self.session.token = data["oauth_token"]
         data = self.session.get(access_token_url, params=data).text
 
         data = text.parse_query(data)
@@ -94,7 +94,8 @@ def _oauth1_authorization_flow(
 
     def _oauth2_authorization_code_grant(
             self, client_id, client_secret, auth_url, token_url,
-            scope="read", key="refresh_token", auth=True):
+            scope="read", key="refresh_token", auth=True,
+            message_template=None):
         """Perform an OAuth2 authorization code grant"""
 
         state = "gallery-dl_{}_{}".format(
@@ -147,11 +148,15 @@ def _oauth2_authorization_code_grant(
 
         # display token
         part = key.partition("_")[0]
-        self.send(OAUTH2_MSG_TEMPLATE.format(
+        template = message_template or OAUTH2_MSG_TEMPLATE
+        self.send(template.format(
             category=self.subcategory,
             key=part,
             Key=part.capitalize(),
             token=data[key],
+            instance=getattr(self, "instance", ""),
+            client_id=client_id,
+            client_secret=client_secret,
         ))
 
 
@@ -254,6 +259,55 @@ def items(self):
         )
 
 
+class OAuthMastodon(OAuthBase):
+    subcategory = "mastodon"
+    pattern = ["oauth:mastodon:(?:https?://)?([^/?&#]+)"]
+
+    def __init__(self, match):
+        OAuthBase.__init__(self, match)
+        self.instance = match.group(1)
+
+    def items(self):
+        yield Message.Version, 1
+        # a configured instance may only provide an 'access-token'
+        application = self.oauth_config(self.instance)
+        if not {"client-id", "client-secret"}.issubset(application or ()):
+            application = self._register(self.instance)
+
+        self._oauth2_authorization_code_grant(
+            application["client-id"],
+            application["client-secret"],
+            "https://{}/oauth/authorize".format(self.instance),
+            "https://{}/oauth/token".format(self.instance),
+            key="access_token",
+            message_template=MASTODON_MSG_TEMPLATE,
+        )
+
+    @cache(maxage=10*365*24*60*60, keyarg=1)
+    def _register(self, instance):
+        self.log.info("Registering application for '%s'", instance)
+
+        url = "https://{}/api/v1/apps".format(instance)
+        data = {
+            "client_name": "gdl:" + oauth.nonce(8),
+            "redirect_uris": self.redirect_uri,
+            "scopes": "read",
+        }
+        data = self.session.post(url, data=data).json()
+
+        if "client_id" not in data or "client_secret" not in data:
+            self.log.error("Failed to register new application: '%s'", data)
+            raise exception.StopExtraction()
+
+        data["client-id"] = data.pop("client_id")
+        data["client-secret"] = data.pop("client_secret")
+
+        self.log.info("client-id:\n%s", data["client-id"])
+        self.log.info("client-secret:\n%s", data["client-secret"])
+
+        return data
+
+
 OAUTH1_MSG_TEMPLATE = """
 Your Access Token and Access Token Secret are
 
@@ -293,3 +347,29 @@ def items(self):
     }}
 }}
 """
+
+
+MASTODON_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.mastodon.{instance}.{key}-token'.
+
+You can also add your 'client-id' and 'client-secret' values
+if you want to register another account in the future.
+
+Example:
+{{
+    "extractor": {{
+        "mastodon": {{
+            "{instance}": {{
+                "{key}-token": "{token}",
+                "client-id": "{client_id}",
+                "client-secret": "{client_secret}"
+            }}
+        }}
+    }}
+}}
+"""
diff --git a/gallery_dl/extractor/pawoo.py b/gallery_dl/extractor/pawoo.py
deleted file mode 100644
index 23f8af9a2c..0000000000
--- a/gallery_dl/extractor/pawoo.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017-2018 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://pawoo.net"""
-
-from .common import Extractor, Message
-from .. import text, exception
-
-
-class PawooExtractor(Extractor):
-    """Base class for pawoo extractors"""
-    category = "pawoo"
-    directory_fmt = ["{category}", "{account[username]}"]
-    filename_fmt = "{category}_{id}_{media[id]}.{extension}"
-    archive_fmt = "{media[id]}"
-
-    def __init__(self):
-        Extractor.__init__(self)
-        self.api = MastodonAPI(self)
-
-    def items(self):
-        yield Message.Version, 1
-        for status in self.statuses():
-            attachments = self.prepare(status)
-            yield Message.Directory, status
-            for media in attachments:
-                status["media"] = media
-                url = media["url"]
-                yield Message.Url, url, text.nameext_from_url(url, status)
-
-    def statuses(self):
-        """Return an iterable containing all relevant Status-objects"""
-        return []
-
-    @staticmethod
-    def prepare(status):
-        """Prepare a status object"""
-        attachments = status["media_attachments"]
-        del status["media_attachments"]
-        return attachments
-
-
-class PawooUserExtractor(PawooExtractor):
-    """Extractor for all images of an account/user on pawoo.net"""
-    subcategory = "user"
-    pattern = [r"(?:https?://)?pawoo\.net/@([^/?&#]+)(?:/media)?/?$"]
-    test = [
-        ("https://pawoo.net/@kuroda", {
-            "url": "a3f9e7555f2b024554c0e9b6cbcc7991af13cf99",
-        }),
-        ("https://pawoo.net/@zZzZz/", {
-            "exception": exception.NotFoundError,
-        }),
-        ("https://pawoo.net/@kuroda/media", None),
-    ]
-
-    def __init__(self, match):
-        PawooExtractor.__init__(self)
-        self.account_name = match.group(1)
-
-    def statuses(self):
-        results = self.api.account_search("@" + self.account_name, 1)
-        for account in results:
-            if account["username"] == self.account_name:
-                break
-        else:
-            raise exception.NotFoundError("account")
-        return self.api.account_statuses(account["id"])
-
-
-class PawooStatusExtractor(PawooExtractor):
-    """Extractor for images from a status on pawoo.net"""
-    subcategory = "status"
-    pattern = [r"(?:https?://)?pawoo\.net/@[^/?&#]+/(\d+)"]
-    test = [
-        ("https://pawoo.net/@takehana_note/559043", {
-            "url": "f95cc8c0274c4143e7e21dbdc693b90c65b596e3",
-            "content": "3b148cf90174173355fe34179741ce476921b2fc",
-        }),
-        ("https://pawoo.net/@zZzZz/12346", {
-            "exception": exception.NotFoundError,
-        }),
-    ]
-
-    def __init__(self, match):
-        PawooExtractor.__init__(self)
-        self.status_id = match.group(1)
-
-    def statuses(self):
-        return (self.api.status(self.status_id),)
-
-
-class MastodonAPI():
-    """Minimal interface for the Mastodon API on pawoo.net
-
-    https://github.com/tootsuite/mastodon
-    https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
-    """
-
-    def __init__(self, extractor, root="https://pawoo.net",
-                 access_token=("286462927198d0cf3e24683e91c8259a"
-                               "ac4367233064e0570ca18df2ac65b226")):
-        self.root = root
-        self.extractor = extractor
-        extractor.session.headers["Authorization"] = "Bearer {}".format(
-            extractor.config("access-token", access_token))
-
-    def account_search(self, query, limit=40):
-        """Search for content"""
-        url = "{}/api/v1/accounts/search".format(self.root)
-        params = {"q": query, "limit": limit}
-        response = self.extractor.request(url, params=params)
-        return self._parse(response)
-
-    def account_statuses(self, account_id):
-        """Get an account's statuses"""
-        url = "{}/api/v1/accounts/{}/statuses?only_media=1".format(
-            self.root, account_id)
-        while url:
-            response = self.extractor.request(url)
-            yield from self._parse(response)
-            url = response.links.get("next", {}).get("url")
-
-    def status(self, status_id):
-        """Fetch a Status"""
-        url = "{}/api/v1/statuses/{}".format(self.root, status_id)
-        response = self.extractor.request(url, expect=(404,))
-        return self._parse(response)
-
-    @staticmethod
-    def _parse(response):
-        """Parse an API response"""
-        if response.status_code == 404:
-            raise exception.NotFoundError()
-        return response.json()
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 21d07e5752..d7e8ea2960 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2018 Mike Fährmann
+# Copyright 2016-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.6.3"
+__version__ = "1.7.0-dev"
diff --git a/test/test_extractor.py b/test/test_extractor.py
index cc9a93cdb7..d15c23323b 100644
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -149,6 +149,8 @@ def test_names(self):
         def capitalize(c):
             if "-" in c:
                 return string.capwords(c.replace("-", " ")).replace(" ", "")
+            if "." in c:
+                c = c.replace(".", "")
             return c.capitalize()
 
         mapping = {