diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 161efe0d..43e01502 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -2,6 +2,13 @@
 Changelog
 =========

+TBR
+---
+
+* Provider for ``web_poet.ResponseUrl`` is added, which allows accessing the
+  response URL in the page object. Unlike the provider for
+  ``web_poet.RequestUrl``, it triggers a download.
+
 0.5.1 (2022-07-28)
 ------------------

diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py
index a35cb3c4..ed303ec4 100644
--- a/scrapy_poet/downloadermiddlewares.py
+++ b/scrapy_poet/downloadermiddlewares.py
@@ -19,6 +19,7 @@
     HttpResponseProvider,
     PageParamsProvider,
     RequestUrlProvider,
+    ResponseUrlProvider,
 )

 logger = logging.getLogger(__name__)
@@ -29,6 +30,7 @@
     HttpClientProvider: 600,
     PageParamsProvider: 700,
     RequestUrlProvider: 800,
+    ResponseUrlProvider: 900,
 }

 InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")
diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py
index 145ec9c4..e4ad49c8 100644
--- a/scrapy_poet/page_input_providers.py
+++ b/scrapy_poet/page_input_providers.py
@@ -23,6 +23,7 @@
     HttpResponseHeaders,
     PageParams,
     RequestUrl,
+    ResponseUrl,
 )

 from scrapy_poet.downloader import create_scrapy_downloader
@@ -238,5 +239,15 @@ class RequestUrlProvider(PageObjectInputProvider):
     name = "request_url"

     def __call__(self, to_provide: Set[Callable], request: Request):
-        """Builds a ``RequestUrl`` instance using a Scrapy ``Request``"""
+        """Builds a ``RequestUrl`` instance using a Scrapy ``Request``."""
         return [RequestUrl(url=request.url)]
+
+
+class ResponseUrlProvider(PageObjectInputProvider):
+
+    provided_classes = {ResponseUrl}
+    name = "response_url"
+
+    def __call__(self, to_provide: Set[Callable], response: Response):
+        """Builds a ``ResponseUrl`` instance using a Scrapy ``Response``."""
+        return [ResponseUrl(url=response.url)]
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index b8fe03ae..b90c1b78 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -13,7 +13,7 @@
 from twisted.internet.threads import deferToThread
 from url_matcher.util import get_domain
 from web_poet import default_registry
-from web_poet.page_inputs import HttpResponse, RequestUrl
+from web_poet.page_inputs import HttpResponse, RequestUrl, ResponseUrl
 from web_poet.pages import ItemPage, ItemWebPage, WebPage

 from scrapy_poet import DummyResponse, InjectionMiddleware, callback_for
@@ -349,6 +349,66 @@
     assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1


+class ResponseUrlSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, url: ResponseUrl):
+        return {
+            "response": response,
+            "url": url,
+        }
+
+
+@inlineCallbacks
+def test_skip_download_response_url(settings):
+    item, url, crawler = yield crawl_single_item(
+        ResponseUrlSpider, ProductHtml, settings
+    )
+    assert isinstance(item["response"], Response) is True
+    # Even though the spider annotated the response as DummyResponse, the
+    # response is still needed, since ResponseUrl depends on it.
+    assert isinstance(item["response"], DummyResponse) is False
+    assert isinstance(item["url"], ResponseUrl)
+    assert str(item["url"]) == url
+    assert crawler.stats.get_stats().get("downloader/request_count", 0) == 1
+    assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 0
+    assert crawler.stats.get_stats().get("downloader/response_count", 0) == 1
+
+
+@attr.s(auto_attribs=True)
+class ResponseUrlPage(WebPage):
+    response_url: ResponseUrl
+
+    def to_item(self):
+        return {"response_url": self.response_url}
+
+
+class ResponseUrlPageSpider(scrapy.Spider):
+    url = None
+
+    def start_requests(self):
+        yield Request(url=self.url, callback=self.parse)
+
+    def parse(self, response: DummyResponse, page: ResponseUrlPage):
+        return page.to_item()
+
+
+@inlineCallbacks
+def test_skip_download_response_url_page(settings):
+    item, url, crawler = yield crawl_single_item(
+        ResponseUrlPageSpider, ProductHtml, settings
+    )
+    assert tuple(item.keys()) == ("response_url",)
+    assert str(item["response_url"]) == url
+    # Even though the spider annotated the response as DummyResponse, the
+    # response is still needed, since ResponseUrl depends on it.
+    assert crawler.stats.get_stats().get("downloader/request_count", 0) == 1
+    assert crawler.stats.get_stats().get("scrapy_poet/dummy_response_count", 0) == 0
+
+
 @attr.s(auto_attribs=True)
 class RequestUrlPage(ItemPage):
     url: RequestUrl
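A minimal usage sketch for reviewers (not part of the diff): a page object that
declares ``web_poet.ResponseUrl`` as a dependency, mirroring ``ResponseUrlPage``
from the tests above. The ``ProductPage`` name and the CSS selector are
hypothetical; the point is that ``ResponseUrlProvider`` injects the URL of the
response that was actually downloaded (so redirects are reflected), whereas
``RequestUrl`` can be served without triggering a download::

    import attr
    from web_poet import WebPage
    from web_poet.page_inputs import ResponseUrl


    @attr.s(auto_attribs=True)
    class ProductPage(WebPage):
        # Injected by ResponseUrlProvider; unlike RequestUrl, obtaining this
        # value requires the response to be downloaded.
        response_url: ResponseUrl

        def to_item(self):
            return {
                "name": self.css("h1::text").get(),  # hypothetical selector
                "url": str(self.response_url),
            }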