Add search downloading to twitter.py (#448)

Adds the ability to download search results from twitter.com/search. Since Twitter only allows downloading up to 3,200 of a user's most recent tweets, you are unable to download old images from users with a lot of tweets. To work around this, you can use the Twitter search to get the tweets from the time period at which you were stopped. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use looks something like this: <https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en>. The _tweets_from_api function had to be changed because, for search results, it would not get the next page using the last "data-tweet-id" as the position. Such a request would return the same JSON, but with a "min_position" string added. Using this string as the "max_position" param from the second page onwards correctly returns the following pages. As far as I know, this change does not interfere with how the other extractors work. The two regex patterns in the timeline and media extractors had to be changed so that they do not match the search URL.
mikf · Oct 16, 2019 · bcddcca · bcddcca
1 parent 1693d97
commit bcddcca
Showing 1 changed file with 22 additions and 6 deletions.
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
@@ -140,18 +140,23 @@ def _tweets_from_api(self, url):
             if not data["has_more_items"]:
                 return
 
-            position = text.parse_int(text.extract(
-                tweet, 'data-tweet-id="', '"')[0])
-            if "max_position" in params and position >= params["max_position"]:
-                return
+            if "min_position" in data:
+                position = data["min_position"]
+                if "max_position" in params and position == params["max_position"]:
+                    return
+            else:
+                position = text.parse_int(text.extract(
+                    tweet, 'data-tweet-id="', '"')[0])
+                if "max_position" in params and position >= params["max_position"]:
+                    return
             params["max_position"] = position
 
 
 class TwitterTimelineExtractor(TwitterExtractor):
     """Extractor for all images from a user's timeline"""
     subcategory = "timeline"
     pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-               r"/([^/?&#]+)/?(?:$|[?#])")
+               r"/((?!search)[^/?&#]+)/?(?:$|[?#])")
     test = (
         ("https://twitter.com/supernaturepics", {
             "range": "1-40",
@@ -171,7 +176,7 @@ class TwitterMediaExtractor(TwitterExtractor):
     """Extractor for all images from a user's Media Tweets"""
     subcategory = "media"
     pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
-               r"/([^/?&#]+)/media(?!\w)")
+               r"/((?!search)[^/?&#]+)/media(?!\w)")
     test = (
         ("https://twitter.com/supernaturepics/media", {
             "range": "1-40",
@@ -185,6 +190,17 @@ def tweets(self):
             self.root, self.user)
         return self._tweets_from_api(url)
 
+class TwitterSearchExtractor(TwitterExtractor):
+    """Extractor for all images from a search timeline"""
+    subcategory = "search"
+    pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+               r"/search[^q]+q=([^/?&#]+)(?:$|&)")
+    test = ()
+
+    def tweets(self):
+        url = "{}/i/search/timeline?f=tweets&q={}".format(
+            self.root, self.user)
+        return self._tweets_from_api(url)
 
 class TwitterTweetExtractor(TwitterExtractor):
     """Extractor for images from individual tweets"""