From bcddcca6db680510392bdae9e59295740152fb6d Mon Sep 17 00:00:00 2001 From: Alice <38675581+alice945@users.noreply.github.com> Date: Wed, 16 Oct 2019 09:23:10 -0700 Subject: [PATCH] Add search downloading to twitter.py (#448) Adds the functionality to download search results on twitter.com/search. Since Twitter only allows downloading of up to 3,200 of a user's most recent tweets, you will be unable to download old images from users with a lot of tweets. To bypass this, you can use the Twitter search to get the tweets from the period of time at which you were stopped. An example search would be "from:user since:2015-01-01 until:2016-01-01 filter:images". The URL you would use will look something like this: https://twitter.com/search?f=tweets&q=from%3Asupernaturepics%20since%3A2015-01-01%20until%3A2016-01-01%20filter%3Aimages&src=typd&lang=en The _tweets_from_api function had to be changed because, for search results, it would not get the next page of results using the last "data-tweet-id". Such a request would return the same JSON but with a "min_position" string added. Using this string for the "max_position" param from the second page onwards correctly returned the next pages. As far as I know, this change does not interfere with how the other extractors work. The two regex patterns in the timeline and media extractors had to be changed so that they do not match the search URL. 
--- gallery_dl/extractor/twitter.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 8105edeb80..4c9843bc9c 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -140,10 +140,15 @@ def _tweets_from_api(self, url): if not data["has_more_items"]: return - position = text.parse_int(text.extract( - tweet, 'data-tweet-id="', '"')[0]) - if "max_position" in params and position >= params["max_position"]: - return + if "min_position" in data: + position = data["min_position"] + if "max_position" in params and position == params["max_position"]: + return + else: + position = text.parse_int(text.extract( + tweet, 'data-tweet-id="', '"')[0]) + if "max_position" in params and position >= params["max_position"]: + return params["max_position"] = position @@ -151,7 +156,7 @@ class TwitterTimelineExtractor(TwitterExtractor): """Extractor for all images from a user's timeline""" subcategory = "timeline" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" - r"/([^/?&#]+)/?(?:$|[?#])") + r"/((?!search)[^/?&#]+)/?(?:$|[?#])") test = ( ("https://twitter.com/supernaturepics", { "range": "1-40", @@ -171,7 +176,7 @@ class TwitterMediaExtractor(TwitterExtractor): """Extractor for all images from a user's Media Tweets""" subcategory = "media" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" - r"/([^/?&#]+)/media(?!\w)") + r"/((?!search)[^/?&#]+)/media(?!\w)") test = ( ("https://twitter.com/supernaturepics/media", { "range": "1-40", @@ -185,6 +190,17 @@ def tweets(self): self.root, self.user) return self._tweets_from_api(url) +class TwitterSearchExtractor(TwitterExtractor): + """Extractor for all images from a search timeline""" + subcategory = "search" + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/search[^q]+q=([^/?&#]+)(?:$|&)") + test = () + + def tweets(self): + url = 
"{}/i/search/timeline?f=tweets&q={}".format( + self.root, self.user) + return self._tweets_from_api(url) class TwitterTweetExtractor(TwitterExtractor): """Extractor for images from individual tweets"""