[unsplash] add extractors (#1197)

Adds extractors for: single photos (/photos/ID), user profiles (/@USER), user likes (/@USER/likes), and search results (/s/photos/SEARCH).
mikf · Jan 19, 2021 · 534194b · 534194b
1 parent 1fc16cb
commit 534194b
Show file tree

Hide file tree

Showing 3 changed files with 186 additions and 0 deletions.
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
@@ -127,6 +127,7 @@ The /b/ Archive      https://thebarchive.com/            Boards, Search Results,
 Tsumino              https://www.tsumino.com/            Galleries, Search Results                          Supported
 Tumblr               https://www.tumblr.com/             Likes, Posts, Tag Searches, User Profiles          `OAuth <https://github.com/mikf/gallery-dl#oauth>`__
 Twitter              https://twitter.com/                |twitter-C|                                        Supported
+Unsplash             https://unsplash.com/               |unsplash-C|
 VSCO                 https://vsco.co/                    Collections, individual Images, User Profiles
 Wallhaven            https://wallhaven.cc/               individual Images, Search Results                  `API Key <configuration.rst#extractorwallhavenapi-key>`__
 Warosu               https://warosu.org/                 Threads
@@ -166,5 +167,6 @@ Turboimagehost       https://www.turboimagehost.com/     individual Images
 .. |reddit-C| replace:: individual Images, Submissions, Subreddits, User Profiles
 .. |smugmug-C| replace:: Albums, individual Images, Images from Users and Folders
 .. |twitter-C| replace:: Bookmarks, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets
+.. |unsplash-C| replace:: Favorites, individual Images, Search Results, User Profiles
 .. |wikiart-C| replace:: Artists, Artist Listings, Artworks, individual Images
 .. |yuki-S| replace:: yuki.la 4chan archive
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -109,6 +109,7 @@
     "tsumino",
     "tumblr",
     "twitter",
+    "unsplash",
     "vanillarock",
     "vsco",
     "wallhaven",

diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://unsplash.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?unsplash\.com"  # URL prefix; scheme optional
+
+
+class UnsplashExtractor(Extractor):
+    """Base class for unsplash extractors"""
+    category = "unsplash"
+    directory_fmt = ("{category}", "{user[username]}")
+    filename_fmt = "{id}.{extension}"
+    archive_fmt = "{id}"
+    root = "https://unsplash.com"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.item = match.group(1)  # first capture group of the match
+
+    def items(self):  # yields a Directory and a Url message per photo
+        for photo in self.photos():  # photos() is supplied by subclasses
+            util.delete_items(
+                photo, ("", "related_collections"))  # NOTE: "" key? verify
+            url = photo["urls"]["raw"]
+            text.nameext_from_url(url, photo)
+
+            photo["extension"] = "jpg"  # forced to "jpg" for every photo
+            photo["date"] = text.parse_datetime(photo["created_at"])
+            if "tags" in photo:  # reduce tag objects to their titles
+                photo["tags"] = [t["title"] for t in photo["tags"]]
+
+            yield Message.Directory, photo
+            yield Message.Url, url, photo
+
+    def _pagination(self, url, params, results=False):  # page generator
+        params["per_page"] = "20"  # must match the 20 in the check below
+        params["page"] = 1  # NOTE: mutates the caller's params dict
+
+        while True:
+            photos = self.request(url, params=params).json()
+            if results:  # list is nested under the "results" key
+                photos = photos["results"]
+            yield from photos
+
+            if len(photos) < 20:  # short page: nothing left to fetch
+                return
+            params["page"] += 1
+
+
+class UnsplashImageExtractor(UnsplashExtractor):
+    """Extractor for a single unsplash photo"""
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"/photos/(\w+)"  # group 1: photo ID
+    test = ("https://unsplash.com/photos/lsoogGC_5dg", {
+        "url": "00accb0a64d5a0df0db911f8b425892718dce524",
+        "keyword": {
+            "alt_description": "re:silhouette of trees near body of water ",
+            "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
+            "categories": list,  # type, not value (presumably type-checked)
+            "color": "#f3c08c",
+            "created_at": "2020-04-08T08:29:42-04:00",
+            "date": "dt:2020-04-08 12:29:42",  # created_at instant in UTC
+            "description": "The Island",
+            "downloads": int,
+            "exif": {
+                "aperture": "11",
+                "exposure_time": "30",
+                "focal_length": "70.0",
+                "iso": 200,
+                "make": "Canon",
+                "model": "Canon EOS 5D Mark IV"
+            },
+            "extension": "jpg",
+            "filename": "photo-1586348943529-beaae6c28db9",
+            "height": 6272,
+            "id": "lsoogGC_5dg",  # same ID as in the test URL above
+            "liked_by_user": False,
+            "likes": int,
+            "location": {
+                "city": "Beaver Dam",
+                "country": "United States",
+                "name": "Beaver Dam, WI 53916, USA",
+                "position": {
+                    "latitude": 43.457769,
+                    "longitude": -88.837329
+                },
+                "title": "Beaver Dam, WI 53916, USA"
+            },
+            "promoted_at": "2020-04-08T11:12:03-04:00",
+            "sponsorship": None,
+            "tags": list,
+            "updated_at": "2021-01-13T07:15:42-05:00",
+            "user": {
+                "accepted_tos": True,
+                "bio": str,
+                "first_name": "Dave",
+                "id": "uMJXuywXLiU",
+                "instagram_username": "just_midwest_rock",
+                "last_name": "Hoefler",
+                "location": "Madison, WI",
+                "name": "Dave Hoefler",
+                "portfolio_url": str,
+                "total_collections": 1,
+                "total_likes": 178,
+                "total_photos": 687,
+                "twitter_username": None,
+                "updated_at": "2021-01-13T21:50:35-05:00",
+                "username": "johnwestrock"
+            },
+            "views": int,
+            "width": 4480,
+        },
+    })
+
+    def photos(self):  # one-element tuple holding the photo's JSON
+        url = "{}/napi/photos/{}".format(self.root, self.item)
+        return (self.request(url).json(),)  # tuple so items() can iterate
+
+
+class UnsplashUserExtractor(UnsplashExtractor):
+    """Extractor for all photos of an unsplash user"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"/@(\w+)/?$"  # '$' keeps /likes URLs out
+    test = ("https://unsplash.com/@johnwestrock", {
+        "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
+                   r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
+        "range": "1-30",
+        "count": 30,  # > one 20-item page, presumably to test paging
+    })
+
+    def photos(self):  # paginated photos from the user's /photos endpoint
+        url = "{}/napi/users/{}/photos".format(self.root, self.item)
+        params = {"order_by": "latest"}
+        return self._pagination(url, params)  # results=False: JSON is list
+
+
+class UnsplashFavoriteExtractor(UnsplashExtractor):
+    """Extractor for all likes of an unsplash user"""
+    subcategory = "favorite"
+    pattern = BASE_PATTERN + r"/@(\w+)/likes"  # group 1: username
+    test = ("https://unsplash.com/@johnwestrock/likes", {
+        "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
+                   r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
+        "range": "1-30",
+        "count": 30,  # > one 20-item page, presumably to test paging
+    })
+
+    def photos(self):  # paginated photos from the user's /likes endpoint
+        url = "{}/napi/users/{}/likes".format(self.root, self.item)
+        params = {"order_by": "latest"}
+        return self._pagination(url, params)  # results=False: JSON is list
+
+
+class UnsplashSearchExtractor(UnsplashExtractor):
+    """Extractor for unsplash search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^/?#]+))?"
+    test = ("https://unsplash.com/s/photos/nature", {
+        "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
+                   r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
+        "range": "1-30",
+        "count": 30,  # > one 20-item page, presumably to test paging
+    })
+
+    def __init__(self, match):
+        UnsplashExtractor.__init__(self, match)  # sets self.item (group 1)
+        self.query = match.group(2)  # optional query string, may be None
+
+    def photos(self):  # paginated photos matching the search term
+        url = self.root + "/napi/search/photos"
+        params = {"query": text.unquote(self.item)}  # search term from path
+        if self.query:  # merge parameters from the URL's query string
+            params.update(text.parse_query(self.query))
+        return self._pagination(url, params, True)  # photos in "results"