From feb98cf196a27e9425259f96d65a9bc9a5f6351c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 17 Jul 2019 15:35:42 +0200 Subject: [PATCH] [twitter] improve 'content' formatting; add option (#338) - include emoticons - leave newlines intact - remove pic.twitter.com/ links at the end --- docs/configuration.rst | 9 +++++++++ docs/gallery-dl.conf | 1 + gallery_dl/extractor/shopify.py | 2 +- gallery_dl/extractor/twitter.py | 33 ++++++++++++++++++++++++--------- 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 95e47b4be1..3406baad6a 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -922,6 +922,15 @@ Description A (comma-separated) list of post types to extract images, etc. from. =========== ===== +extractor.twitter.content +------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Extract tweet text as ``content`` metadata. +=========== ===== + + extractor.twitter.retweets -------------------------- =========== ===== diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index d67b7fc7a1..6732028de7 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -132,6 +132,7 @@ }, "twitter": { + "content": false, "retweets": true, "videos": false }, diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 5925319ee1..b2498a0e07 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -107,7 +107,7 @@ def products(self): "pattern": r"(?:www\.)?fashionnova\.com", "test-product": ( ("https://www.fashionnova.com/products/essential-slide-red", { - "pattern": r"https?://cdn\.shopify.com/", + "pattern": r"https?://cdn\d*\.shopify.com/", "count": 3, }), ("https://www.fashionnova.com/collections/flats/products/name"), diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c206ec5338..ccba6406c6 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache +import re class TwitterExtractor(Extractor): @@ -26,8 +27,13 @@ def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) self.retweets = self.config("retweets", True) + self.content = self.config("content", False) self.videos = self.config("videos", False) + if self.content: + self._emoji_sub = re.compile( + r']*>').sub + def items(self): self.login() yield Message.Version, 1 @@ -88,10 +94,9 @@ def _login_impl(self, username, password): raise exception.AuthenticationError() return self.session.cookies - @staticmethod - def _data_from_tweet(tweet): + def _data_from_tweet(self, tweet): extr = text.extract_from(tweet) - return { + data = { "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), "retweeter" : extr('data-retweeter="' , '"'), @@ -99,10 +104,15 @@ def _data_from_tweet(tweet): "username" : extr('data-name="' , '"'), "user_id" : text.parse_int(extr('data-user-id="' , '"')), "date" : text.parse_timestamp(extr('data-time="', '"')), - "content" : text.unescape(text.remove_html(extr( - '
', '\n
' - ))).replace(" @ ", " @").replace(" # ", " #"), } + if self.content: + content = extr('
', '\n
') + if '