From e425243b1e01c90010f1fe28f32654996ac481be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 23 May 2017 11:48:00 +0200 Subject: [PATCH] [reddit] some small fixes - filter or complete some URLs - remove the 'nofollow:' scheme before printing URLs - (#15) --- gallery_dl/extractor/recursive.py | 6 +++--- gallery_dl/extractor/reddit.py | 9 +++++++-- gallery_dl/job.py | 11 +++++++++-- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py index 358b4dabe5..01c54e2c21 100644 --- a/gallery_dl/extractor/recursive.py +++ b/gallery_dl/extractor/recursive.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015, 2016 Mike Fährmann +# Copyright 2015-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,9 +14,9 @@ class RecursiveExtractor(Extractor): - + """Extractor that fetches URLs from a remote or local source""" category = "recursive" - pattern = ["r(?:ecursive)?:(.+)"] + pattern = [r"r(?:ecursive)?:(.+)"] test = [("recursive:https://pastebin.com/raw/FLwrCYsT", { "url": "eee86d65c346361b818e8f4b2b307d9429f136a2", })] diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index c48b85411a..c1c387d623 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -34,7 +34,11 @@ def items(self): ) ) for url in urls: - if regex.match(url): + if url[0] == "#": + continue + elif url[0] == "/": + url = "nofollow:https://www.reddit.com" + url + elif regex.match(url): url = "nofollow:" + url yield Message.Queue, url @@ -61,7 +65,8 @@ class RedditSubmissionExtractor(RedditExtractor): """Extractor for images from a submission on reddit.com""" subcategory = "subreddit" pattern = [(r"(?:https?://)?(?:m\.|www\.)?reddit\.com/r/[^/]+" - r"/comments/([^/]+)")] + r"/comments/([a-z0-9]+)"), + (r"(?:https?://)?redd\.it/([a-z0-9]+)")] def __init__(self, match): RedditExtractor.__init__(self) diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 9caa5f084a..16576cefb6 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -214,9 +214,10 @@ def __init__(self, url, depth=1): Job.__init__(self, url) self.depth = depth if depth == self.maxdepth: - self.handle_queue = print + self.handle_queue = self._print - def handle_url(self, url, _): + @staticmethod + def handle_url(url, _): print(url) def handle_queue(self, url): @@ -225,6 +226,12 @@ def handle_queue(self, url): except exception.NoExtractorError: pass + @staticmethod + def _print(url): + if url.startswith("nofollow:"): + url = url[9:] + print(url) + class TestJob(DownloadJob): """Generate test-results for extractor runs"""