From 63ae5e391b56ea45a4e3960eb9795c6c9c9b1fac Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 6 Sep 2022 18:59:32 +0800 Subject: [PATCH 1/4] basic implementation of ResponseUrlProvider --- scrapy_poet/downloadermiddlewares.py | 2 + scrapy_poet/page_input_providers.py | 13 +++++- tests/test_middleware.py | 61 +++++++++++++++++++++++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py index a35cb3c4..ed303ec4 100644 --- a/scrapy_poet/downloadermiddlewares.py +++ b/scrapy_poet/downloadermiddlewares.py @@ -19,6 +19,7 @@ HttpResponseProvider, PageParamsProvider, RequestUrlProvider, + ResponseUrlProvider, ) logger = logging.getLogger(__name__) @@ -29,6 +30,7 @@ HttpClientProvider: 600, PageParamsProvider: 700, RequestUrlProvider: 800, + ResponseUrlProvider: 900, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 145ec9c4..e4ad49c8 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -23,6 +23,7 @@ HttpResponseHeaders, PageParams, RequestUrl, + ResponseUrl, ) from scrapy_poet.downloader import create_scrapy_downloader @@ -238,5 +239,15 @@ class RequestUrlProvider(PageObjectInputProvider): name = "request_url" def __call__(self, to_provide: Set[Callable], request: Request): - """Builds a ``RequestUrl`` instance using a Scrapy ``Request``""" + """Builds a ``RequestUrl`` instance using a Scrapy ``Request``.""" return [RequestUrl(url=request.url)] + + +class ResponseUrlProvider(PageObjectInputProvider): + + provided_classes = {ResponseUrl} + name = "response_url" + + def __call__(self, to_provide: Set[Callable], response: Response): + """Builds a ``ResponseUrl`` instance using a Scrapy ``Response``.""" + return [ResponseUrl(url=response.url)] diff --git a/tests/test_middleware.py 
b/tests/test_middleware.py index b8fe03ae..ba2dabeb 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -13,7 +13,7 @@ from twisted.internet.threads import deferToThread from url_matcher.util import get_domain from web_poet import default_registry -from web_poet.page_inputs import HttpResponse, RequestUrl +from web_poet.page_inputs import HttpResponse, RequestUrl, ResponseUrl from web_poet.pages import ItemPage, ItemWebPage, WebPage from scrapy_poet import DummyResponse, InjectionMiddleware, callback_for @@ -349,6 +349,65 @@ def test_skip_download_request_url(settings): assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 +class ResponseUrlSpider(scrapy.Spider): + url = None + + def start_requests(self): + yield Request(url=self.url, callback=self.parse) + + def parse(self, response: DummyResponse, url: ResponseUrl): + return { + "response": response, + "url": url, + } + + +@inlineCallbacks +def test_skip_download_response_url(settings): + item, url, crawler = yield crawl_single_item( + ResponseUrlSpider, ProductHtml, settings + ) + assert isinstance(item["response"], Response) is True + # Even if the spider marked the response with DummyResponse, the response + # is still needed since ResponseUrl depends on it. 
+ assert isinstance(item["response"], DummyResponse) is False + assert isinstance(item["url"], ResponseUrl) + assert str(item["url"]) == url + assert crawler.stats.get_stats().get("downloader/request_count", 0) == 1 + assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 0 + assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1 + + +@attr.s(auto_attribs=True) +class ResponseUrlPage(ItemPage): + url: ResponseUrl + + def to_item(self): + return {'url': self.url} + + +class ResponseUrlPageSpider(scrapy.Spider): + url = None + + def start_requests(self): + yield Request(url=self.url, callback=self.parse) + + def parse(self, response: DummyResponse, page: ResponseUrlPage): + return page.to_item() + + +@inlineCallbacks +def test_skip_download_response_url_page(settings): + item, url, crawler = yield crawl_single_item( + RequestUrlPageSpider, ProductHtml, settings) + assert tuple(item.keys()) == ('url',) + assert str(item['url']) == url + # Compared to the earlier test in test_skip_download_response_url(), the + # DummyResponse here worked since the response was handled in the PO. 
+ assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0 + assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1 + + @attr.s(auto_attribs=True) class RequestUrlPage(ItemPage): url: RequestUrl From 46c3f5c2cecd800c7814e0f6bcade682e1e25c33 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 6 Sep 2022 19:37:22 +0800 Subject: [PATCH 2/4] fix incorrect test_skip_download_response_url_page test case --- tests/test_middleware.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index ba2dabeb..fe7ba3b6 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -383,7 +383,7 @@ class ResponseUrlPage(ItemPage): url: ResponseUrl def to_item(self): - return {'url': self.url} + return {"url": self.url} class ResponseUrlPageSpider(scrapy.Spider): @@ -399,13 +399,14 @@ def parse(self, response: DummyResponse, page: ResponseUrlPage): @inlineCallbacks def test_skip_download_response_url_page(settings): item, url, crawler = yield crawl_single_item( - RequestUrlPageSpider, ProductHtml, settings) - assert tuple(item.keys()) == ('url',) - assert str(item['url']) == url - # Compared to the earlier test in test_skip_download_response_url(), the - # DummyResponse here worked since the response was handled in the PO. - assert crawler.stats.get_stats().get('downloader/request_count', 0) == 0 - assert crawler.stats.get_stats().get('scrapy_poet/dummy_response_count', 0) == 1 + ResponseUrlPageSpider, ProductHtml, settings + ) + assert tuple(item.keys()) == ("url",) + assert str(item["url"]) == url + # Even if the spider marked the response with DummyResponse, the response + # is still needed since ResponseUrl depends on it. 
+ assert crawler.stats.get_stats().get("downloader/request_count", 0) == 1 + assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 0 @attr.s(auto_attribs=True) From ed1847f5a7816c8af14a691c4b032cea04130c4f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 7 Sep 2022 13:52:03 +0800 Subject: [PATCH 3/4] testing ResponseUrlProvider should use WebPage instead of ItemPage --- tests/test_middleware.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index fe7ba3b6..b90c1b78 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -379,11 +379,11 @@ def test_skip_download_response_url(settings): @attr.s(auto_attribs=True) -class ResponseUrlPage(ItemPage): - url: ResponseUrl +class ResponseUrlPage(WebPage): + response_url: ResponseUrl def to_item(self): - return {"url": self.url} + return {"response_url": self.response_url} class ResponseUrlPageSpider(scrapy.Spider): @@ -401,8 +401,8 @@ def test_skip_download_response_url_page(settings): item, url, crawler = yield crawl_single_item( ResponseUrlPageSpider, ProductHtml, settings ) - assert tuple(item.keys()) == ("url",) - assert str(item["url"]) == url + assert tuple(item.keys()) == ("response_url",) + assert str(item["response_url"]) == url # Even if the spider marked the response with DummyResponse, the response # is still needed since ResponseUrl depends on it. 
assert crawler.stats.get_stats().get("downloader/request_count", 0) == 1 From 405538195b8c1ed8490e4cd14caa384f93def452 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 7 Sep 2022 13:59:03 +0800 Subject: [PATCH 4/4] update CHANGELOG with the new ResponseUrlProvider --- CHANGELOG.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 161efe0d..43e01502 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,13 @@ Changelog ========= +TBR +--- + +* Provider for ``web_poet.ResponseUrl`` is added, which allows access to the + response URL in the page object. This triggers a download unlike the provider + for ``web_poet.RequestUrl``. + 0.5.1 (2022-07-28) ------------------