From d8dea4c64cd998269d95c26982251ad9c068c68d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 6 Oct 2022 14:05:17 +0800 Subject: [PATCH 1/9] move away from the deprecated functionalities of web-poet==0.5.1 --- CHANGELOG.rst | 7 ++++ docs/intro/advanced-tutorial.rst | 4 +- docs/intro/basic-tutorial.rst | 38 +++++++++---------- docs/overrides.rst | 26 ++++++------- docs/providers.rst | 4 +- example/example/spiders/books_02.py | 4 +- example/example/spiders/books_02_1.py | 4 +- example/example/spiders/books_02_2.py | 4 +- example/example/spiders/books_02_3.py | 4 +- example/example/spiders/books_04.py | 4 +- .../example/spiders/books_04_overrides_01.py | 6 +-- .../example/spiders/books_04_overrides_02.py | 10 ++--- .../example/spiders/books_04_overrides_03.py | 6 +-- example/example/spiders/books_06.py | 4 +- scrapy_poet/api.py | 5 --- scrapy_poet/overrides.py | 26 ++++++------- setup.py | 2 +- tests/po_lib/__init__.py | 10 ++--- tests/test_callback_for.py | 19 ++-------- tests/test_downloader.py | 14 +++---- tests/test_injection.py | 4 +- tests/test_middleware.py | 14 +++---- tests/test_retries.py | 12 +++--- tests/test_scrapy_dependencies.py | 6 +-- tox.ini | 2 +- 25 files changed, 114 insertions(+), 125 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 43e01502..e5a8f644 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,6 +8,13 @@ TBR * Provider for ``web_poet.ResponseUrl`` is added, which allows to access the response URL in the page object. This triggers a download unlike the provider for ``web_poet.RequestUrl``. +* Move from web-poet 0.5.0 to 0.6.0. + + * Updates all examples in the docs and tests from the deprecated + ``web_poet.ItemWebPage`` into ``web_poet.WebPage``. + * The Registry now uses ``web_poet.ApplyRule`` instead of + ``web_poet.OverrideRule``. + 0.5.1 (2022-07-28) ------------------ diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst index b893d7ea..9662d240 100644 --- a/docs/intro/advanced-tutorial.rst +++ b/docs/intro/advanced-tutorial.rst @@ -48,7 +48,7 @@ Suppose we have the following Page Object: @attr.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http: web_poet.HttpClient async def to_item(self): @@ -110,7 +110,7 @@ This basically acts as a switch to update the behavior of the Page Object: @attr.define - class ProductPage(web_poet.ItemWebPage): + class ProductPage(web_poet.WebPage): http: web_poet.HttpClient page_params: web_poet.PageParams diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 691f020c..8640f574 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -65,10 +65,10 @@ out of the spider class. .. code-block:: python - from web_poet.pages import ItemWebPage + from web_poet.pages import WebPage - class BookPage(ItemWebPage): + class BookPage(WebPage): """Individual book page on books.toscrape.com website, e.g. http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html """ @@ -93,10 +93,10 @@ extract a property from the ``to_item`` method: .. code-block:: python - from web_poet.pages import ItemWebPage + from web_poet.pages import WebPage - class BookPage(ItemWebPage): + class BookPage(WebPage): """Individual book page on books.toscrape.com website""" @property @@ -245,11 +245,11 @@ At the end of our job, the spider should look like this: .. 
code-block:: python import scrapy - from web_poet.pages import ItemWebPage + from web_poet.pages import WebPage from scrapy_poet import callback_for - class BookPage(ItemWebPage): + class BookPage(WebPage): """Individual book page on books.toscrape.com website, e.g. http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html """ @@ -353,7 +353,7 @@ existing Page Objects as subclasses of them: .. code-block:: python - from web_poet.pages import ItemWebPage, WebPage + from web_poet.pages import WebPage # ------ Base page objects ------ @@ -364,7 +364,7 @@ existing Page Objects as subclasses of them: return [] - class BookPage(ItemWebPage): + class BookPage(WebPage): def to_item(self): return None @@ -421,7 +421,7 @@ to implement new ones: .. code-block:: python - from web_poet.pages import ItemWebPage, WebPage + from web_poet.pages import WebPage class BPBookListPage(WebPage): @@ -430,7 +430,7 @@ to implement new ones: return self.css("article.post h4 a::attr(href)").getall() - class BPBookPage(ItemWebPage): + class BPBookPage(WebPage): def to_item(self): return { @@ -466,21 +466,21 @@ For example, the pattern ``books.toscrape.com/cataloge/category/`` is accepted and it would restrict the override only to category pages. It is even possible to configure more complex patterns by using the -:py:class:`web_poet.overrides.OverrideRule` class instead of a triplet in +:py:class:`web_poet.rules.ApplyRule` class instead of a triplet in the configuration. Another way of declaring the earlier config for ``SCRAPY_POET_OVERRIDES`` would be the following: .. code-block:: python from url_matcher import Patterns - from web_poet import OverrideRule + from web_poet import ApplyRule SCRAPY_POET_OVERRIDES = [ - OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage), - OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage), - OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), - OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), + ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage), + ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage), + ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), + ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), ] As you can see, this could get verbose. The earlier tuple config simply offers @@ -494,8 +494,8 @@ a shortcut to be more concise. Manually defining overrides like this would be inconvenient, most especially for larger projects. Fortunately, `web-poet`_ has a cool feature to annotate Page Objects like :py:func:`web_poet.handle_urls` that would define -and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the -:py:class:`web_poet.overrides.OverrideRule` rules could then be simply read as: +and store the :py:class:`web_poet.rules.ApplyRule` for you. All of the +:py:class:`web_poet.rules.ApplyRule` rules could then be simply read as: .. code:: python @@ -505,7 +505,7 @@ and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. 
consume_modules("external_package_A", "another_ext_package.lib") - SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + SCRAPY_POET_OVERRIDES = default_registry.get_rules() For more info on this, you can refer to these docs: diff --git a/docs/overrides.rst b/docs/overrides.rst index e278693d..6b3b5d96 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -15,10 +15,10 @@ page. - `Example 1 `_: rules using tuples - `Example 2 `_: - rules using tuples and :py:class:`web_poet.overrides.OverrideRule` + rules using tuples and :py:class:`web_poet.ApplyRule` - `Example 3 `_: rules using :py:func:`web_poet.handle_urls` decorator and retrieving them - via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` + via :py:meth:`web_poet.PageObjectRegistry.get_rules` Page Objects refinement ======================= @@ -44,7 +44,7 @@ using the following Page Object: .. code-block:: python - class ISBNBookPage(ItemWebPage): + class ISBNBookPage(WebPage): def __init__(self, response: HttpResponse, book_page: BookPage): super().__init__(response) @@ -81,7 +81,7 @@ the obtained item with the ISBN from the page HTML. .. code-block:: python @attr.define - class ISBNBookPage(ItemWebPage): + class ISBNBookPage(WebPage): book_page: BookPage def to_item(self): @@ -95,17 +95,17 @@ Overrides rules The default way of configuring the override rules is using triplets of the form (``url pattern``, ``override_type``, ``overridden_type``). But more -complex rules can be introduced if the class :py:class:`web_poet.overrides.OverrideRule` +complex rules can be introduced if the class :py:class:`web_poet.ApplyRule` is used. The following example configures an override that is only applied for book pages from ``books.toscrape.com``: .. code-block:: python - from web_poet import OverrideRule + from web_poet import ApplyRule SCRAPY_POET_OVERRIDES = [ - OverrideRule( + ApplyRule( for_patterns=Patterns( include=["books.toscrape.com/cataloge/*index.html|"], exclude=["/catalogue/category/"]), @@ -155,7 +155,7 @@ for the domain ``toscrape.com``. In order to configure the ``scrapy-poet`` overrides automatically using these annotations, you can directly interact with `web-poet`_'s -``default_registry`` (an instance of :py:class:`web_poet.overrides.PageObjectRegistry`). +``default_registry`` (an instance of :py:class:`web_poet.PageObjectRegistry`). For example: @@ -169,15 +169,15 @@ For example: consume_modules("external_package_A", "another_ext_package.lib") # To get all of the Override Rules that were declared via annotations. - SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + SCRAPY_POET_OVERRIDES = default_registry.get_rules() -The :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` method of the -``default_registry`` above returns ``List[OverrideRule]`` that were declared +The :py:meth:`web_poet.PageObjectRegistry.get_rules` method of the +``default_registry`` above returns ``List[ApplyRule]`` that were declared using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much -more convenient that manually defining all of the :py:class:`web_poet.overrides.OverrideRule`. +more convenient that manually defining all of the :py:class:`web_poet.ApplyRule`. Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as -``List[OverrideRule]``, you can easily modify it later on if needed. +``List[ApplyRule]``, you can easily modify it later on if needed. .. 
note:: diff --git a/docs/providers.rst b/docs/providers.rst index 0f581fc9..927a44e7 100644 --- a/docs/providers.rst +++ b/docs/providers.rst @@ -22,7 +22,7 @@ Creating providers Providers are responsible for building dependencies needed by Injectable objects. A good example would be the ``HttpResponseProvider``, which builds and provides a ``web_poet.HttpResponse`` instance for Injectables -that need it, like the ``web_poet.ItemWebPage``. +that need it, like the ``web_poet.WebPage``. .. code-block:: python @@ -271,7 +271,7 @@ Page Object uses it, the request is not ignored, for example: .. note:: The code above is just for example purposes. If you need to use ``Response`` - instances in your Page Objects, use built-in ``ItemWebPage`` - it has + instances in your Page Objects, use built-in ``WebPage`` - it has ``response`` attribute with ``HttpResponse``; no additional configuration is needed, as there is ``HttpResponseProvider`` enabled in ``scrapy-poet`` by default. diff --git a/example/example/spiders/books_02.py b/example/example/spiders/books_02.py index a1f52c34..06867076 100644 --- a/example/example/spiders/books_02.py +++ b/example/example/spiders/books_02.py @@ -3,10 +3,10 @@ BookPage is now independent of Scrapy. """ import scrapy -from web_poet import ItemWebPage +from web_poet import WebPage -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return { "url": self.url, diff --git a/example/example/spiders/books_02_1.py b/example/example/spiders/books_02_1.py index 42eac332..b6835512 100644 --- a/example/example/spiders/books_02_1.py +++ b/example/example/spiders/books_02_1.py @@ -4,12 +4,12 @@ boilerplate. """ import scrapy -from web_poet import ItemWebPage +from web_poet import WebPage from scrapy_poet import callback_for -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return { "url": self.url, diff --git a/example/example/spiders/books_02_2.py b/example/example/spiders/books_02_2.py index b9eccc41..a81960c5 100644 --- a/example/example/spiders/books_02_2.py +++ b/example/example/spiders/books_02_2.py @@ -11,12 +11,12 @@ it is better than defining callback explicitly. """ import scrapy -from web_poet import ItemWebPage +from web_poet import WebPage from scrapy_poet import callback_for -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return { "url": self.url, diff --git a/example/example/spiders/books_02_3.py b/example/example/spiders/books_02_3.py index 14cf53a3..66bc7e76 100644 --- a/example/example/spiders/books_02_3.py +++ b/example/example/spiders/books_02_3.py @@ -8,10 +8,10 @@ but it can be implemented, with Scrapy support. """ import scrapy -from web_poet import ItemWebPage +from web_poet import WebPage -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return { "url": self.url, diff --git a/example/example/spiders/books_04.py b/example/example/spiders/books_04.py index 2b94dbd7..08daf7fc 100644 --- a/example/example/spiders/books_04.py +++ b/example/example/spiders/books_04.py @@ -2,7 +2,7 @@ Scrapy spider which uses Page Objects both for crawling and extraction. 
""" import scrapy -from web_poet import ItemWebPage, WebPage +from web_poet import WebPage from scrapy_poet import callback_for @@ -12,7 +12,7 @@ def book_urls(self): return self.css(".image_container a::attr(href)").getall() -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return { "url": self.url, diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py index 268c6e6d..6d27b0b7 100644 --- a/example/example/spiders/books_04_overrides_01.py +++ b/example/example/spiders/books_04_overrides_01.py @@ -6,7 +6,7 @@ The default configured PO logic contains the logic for books.toscrape.com """ import scrapy -from web_poet import ItemWebPage, WebPage +from web_poet import WebPage from scrapy_poet import callback_for @@ -18,7 +18,7 @@ def book_urls(self): return self.css(".image_container a::attr(href)").getall() -class BookPage(ItemWebPage): +class BookPage(WebPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" def to_item(self): @@ -35,7 +35,7 @@ def book_urls(self): return self.css("article.post h4 a::attr(href)").getall() -class BPBookPage(ItemWebPage): +class BPBookPage(WebPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" def to_item(self): diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index f707c2b2..78d6c80e 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -8,8 +8,8 @@ """ import scrapy from url_matcher import Patterns -from web_poet import ItemWebPage, WebPage -from web_poet.overrides import OverrideRule +from web_poet import WebPage +from web_poet.rules import ApplyRule from scrapy_poet import callback_for @@ -19,7 +19,7 @@ def book_urls(self): return [] -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return None @@ -67,12 +67,12 @@ class BooksSpider(scrapy.Spider): ("toscrape.com", BTSBookListPage, BookListPage), ("toscrape.com", BTSBookPage, BookPage), # We could also use the long-form version if we want to. - OverrideRule( + ApplyRule( for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage, ), - OverrideRule( + ApplyRule( for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage, diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py index 525c75e6..db9a0dc6 100644 --- a/example/example/spiders/books_04_overrides_03.py +++ b/example/example/spiders/books_04_overrides_03.py @@ -11,7 +11,7 @@ store the rules in web-poet's registry. 
""" import scrapy -from web_poet import ItemWebPage, WebPage, default_registry, handle_urls +from web_poet import WebPage, default_registry, handle_urls from scrapy_poet import callback_for @@ -21,7 +21,7 @@ def book_urls(self): return [] -class BookPage(ItemWebPage): +class BookPage(WebPage): def to_item(self): return None @@ -68,7 +68,7 @@ class BooksSpider(scrapy.Spider): name = "books_04_overrides_03" start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages for different domains - custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_overrides()} + custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_rules()} def parse(self, response, page: BookListPage): yield from response.follow_all(page.book_urls(), callback_for(BookPage)) diff --git a/example/example/spiders/books_06.py b/example/example/spiders/books_06.py index 4ab91897..4668e5a2 100644 --- a/example/example/spiders/books_06.py +++ b/example/example/spiders/books_06.py @@ -12,7 +12,7 @@ import attr import scrapy -from web_poet import Injectable, ItemWebPage, WebPage +from web_poet import Injectable, WebPage class ListingsExtractor(WebPage): @@ -37,7 +37,7 @@ class ListingsPage(Injectable): @attr.s(auto_attribs=True) -class BookPage(ItemWebPage): +class BookPage(WebPage): breadcrumbs: BreadcrumbsExtractor def recently_viewed_urls(self): diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index fa2af323..a4eb15a9 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -107,11 +107,6 @@ def parse(self, response): if not issubclass(page_cls, ItemPage): raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.") - if getattr(page_cls.to_item, "__isabstractmethod__", False): - raise NotImplementedError( - f"{page_cls.__name__} should implement to_item method." - ) - # When the callback is used as an instance method of the spider, it expects # to receive 'self' as its first argument. When used as a simple inline # function, it expects to receive a response as its first argument. diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index a404ca00..66c904d5 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -6,12 +6,12 @@ from scrapy import Request from scrapy.crawler import Crawler from url_matcher import Patterns, URLMatcher -from web_poet.overrides import OverrideRule +from web_poet.rules import ApplyRule logger = logging.getLogger(__name__) RuleAsTuple = Union[Tuple[str, Callable, Callable], List] -RuleFromUser = Union[RuleAsTuple, OverrideRule] +RuleFromUser = Union[RuleAsTuple, ApplyRule] class OverridesRegistryBase(ABC): @@ -29,7 +29,7 @@ class OverridesRegistry(OverridesRegistryBase): """ Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES`` in the spider settings. It is a list and each rule can be a tuple or an - instance of the class :py:class:`web_poet.overrides.OverrideRule`. + instance of the class :py:class:`web_poet.rules.ApplyRule`. If a tuple is provided: @@ -45,7 +45,7 @@ class OverridesRegistry(OverridesRegistryBase): .. 
code-block:: python from url_matcher import Patterns - from scrapy_poet.overrides import OverrideRule + from web_poet.rules import ApplyRule SCRAPY_POET_OVERRIDES = [ @@ -53,7 +53,7 @@ class OverridesRegistry(OverridesRegistryBase): ("books.toscrape.com", ISBNBookPage, BookPage), # Option 2 - OverrideRule( + ApplyRule( for_patterns=Patterns(["books.toscrape.com"]), use=MyBookListPage, instead_of=BookListPage, @@ -63,12 +63,12 @@ class OverridesRegistry(OverridesRegistryBase): .. _web-poet: https://web-poet.readthedocs.io Now, if you've used web-poet_'s built-in functionality to directly create - the :py:class:`web_poet.overrides.OverrideRule` in the Page Object via the + the :py:class:`web_poet.rules.ApplyRule` in the Page Object via the :py:func:`web_poet.handle_urls` annotation, you can quickly import them via the following code below. It finds all the rules annotated using web-poet_'s :py:func:`web_poet.handle_urls` as a decorator that were registered into ``web_poet.default_registry`` (an instance of - :py:class:`web_poet.overrides.PageObjectRegistry`). + :py:class:`web_poet.rules.PageObjectRegistry`). .. code-block:: python @@ -78,9 +78,9 @@ class OverridesRegistry(OverridesRegistryBase): # import rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. consume_modules("external_package_A.po", "another_ext_package.lib") - SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + SCRAPY_POET_OVERRIDES = default_registry.get_rules() - Make sure to call :py:func:`web_poet.overrides.consume_modules` beforehand. + Make sure to call :py:func:`web_poet.rules.consume_modules` beforehand. More info on this at web-poet_. """ @@ -89,22 +89,22 @@ def from_crawler(cls, crawler: Crawler) -> Crawler: return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", [])) def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None: - self.rules: List[OverrideRule] = [] + self.rules: List[ApplyRule] = [] self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher) for rule in rules or []: self.add_rule(rule) - logger.debug(f"List of parsed OverrideRules:\n{self.rules}") + logger.debug(f"List of parsed ApplyRules:\n{self.rules}") def add_rule(self, rule: RuleFromUser) -> None: if isinstance(rule, (tuple, list)): if len(rule) != 3: raise ValueError( - f"Invalid overrides rule: {rule}. Rules as tuples must have " + f"Invalid rule: {rule}. Rules as tuples must have " f"3 elements: (1) the pattern, (2) the PO class used as a " f"replacement and (3) the PO class to be replaced." 
) pattern, use, instead_of = rule - rule = OverrideRule( + rule = ApplyRule( for_patterns=Patterns([pattern]), use=use, instead_of=instead_of ) self.rules.append(rule) diff --git a/setup.py b/setup.py index ba573ce9..8856e6f8 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "sqlitedict >= 1.5.0", "twisted >= 18.9.0", "url-matcher >= 0.2.0", - "web-poet >= 0.4.0", + "web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls-with-item#egg=web-poet", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index e3db57e3..01baf584 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -4,7 +4,7 @@ import socket from url_matcher.util import get_domain -from web_poet import ItemWebPage, handle_urls +from web_poet import WebPage, handle_urls from tests.mockserver import get_ephemeral_port @@ -13,12 +13,12 @@ PORT = get_ephemeral_port() -class POOverriden(ItemWebPage): +class POOverriden(WebPage): def to_item(self): - return {"msg": "PO that will be replace"} + return {"msg": "PO that will be replaced"} -@handle_urls(f"{DOMAIN}:{PORT}", overrides=POOverriden) -class POIntegration(ItemWebPage): +@handle_urls(f"{DOMAIN}:{PORT}", instead_of=POOverriden) +class POIntegration(WebPage): def to_item(self): return {"msg": "PO replacement"} diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 61805b4a..752afe75 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -1,7 +1,7 @@ import pytest import scrapy from pytest_twisted import ensureDeferred -from web_poet.pages import ItemPage, ItemWebPage +from web_poet.pages import ItemPage, WebPage from scrapy_poet import DummyResponse, callback_for @@ -16,7 +16,7 @@ async def to_item(self): return "fake item page" -class FakeItemWebPage(ItemWebPage): +class FakeWebPage(WebPage): def to_item(self): return "fake item web page" @@ -25,7 +25,7 @@ class MySpider(scrapy.Spider): name = "my_spider" parse_item = callback_for(FakeItemPage) - parse_web = callback_for(FakeItemWebPage) + parse_web = callback_for(FakeWebPage) class MySpiderAsync(scrapy.Spider): @@ -140,16 +140,3 @@ class MyClass(object): msg = "MyClass should be a subclass of ItemPage." assert str(exc.value) == msg - - -def test_not_implemented_method(): - """Classes should implement to_item method.""" - - class MyClass(ItemPage): - pass - - with pytest.raises(NotImplementedError) as exc: - callback_for(MyClass) - - msg = "MyClass should implement to_item method." 
- assert str(exc.value) == msg diff --git a/tests/test_downloader.py b/tests/test_downloader.py index d0c76e08..adc36954 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -11,7 +11,7 @@ from scrapy.exceptions import IgnoreRequest from web_poet import HttpClient from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError -from web_poet.pages import ItemWebPage +from web_poet.pages import WebPage from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.utils import http_request_to_scrapy_request @@ -133,7 +133,7 @@ def test_additional_requests_success(): with MockServer(EchoResource) as server: @attr.define - class ItemPage(ItemWebPage): + class ItemPage(WebPage): http: HttpClient async def to_item(self): @@ -170,7 +170,7 @@ def test_additional_requests_bad_response(): with MockServer(StatusResource) as server: @attr.define - class ItemPage(ItemWebPage): + class ItemPage(WebPage): http: HttpClient async def to_item(self): @@ -217,7 +217,7 @@ def test_additional_requests_connection_issue(): with MockServer(DelayedResource) as server: @attr.define - class ItemPage(ItemWebPage): + class ItemPage(WebPage): http: HttpClient async def to_item(self): @@ -256,7 +256,7 @@ def test_additional_requests_ignored_request(): with MockServer(EchoResource) as server: @attr.define - class ItemPage(ItemWebPage): + class ItemPage(WebPage): http: HttpClient async def to_item(self): @@ -313,7 +313,7 @@ def test_additional_requests_unhandled_downloader_middleware_exception(): with MockServer(EchoResource) as server: @attr.define - class ItemPage(ItemWebPage): + class ItemPage(WebPage): http: HttpClient async def to_item(self): @@ -367,7 +367,7 @@ def test_additional_requests_dont_filter(): with MockServer(EchoResource) as server: @attr.define - class ItemPage(ItemWebPage): + class ItemPage(WebPage): http: HttpClient async def to_item(self): diff --git a/tests/test_injection.py b/tests/test_injection.py index e2925de2..07ed895d 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -11,7 +11,7 @@ from url_matcher.util import get_domain from web_poet import Injectable, ItemPage from web_poet.mixins import ResponseShortcutsMixin -from web_poet.overrides import OverrideRule +from web_poet.rules import ApplyRule from scrapy_poet import ( CacheDataProviderMixin, @@ -325,7 +325,7 @@ def test_overrides(self, providers, override_should_happen): # when we configure them for domain other-example.com overrides = [ (domain, PriceInDollarsPO, PricePO), - OverrideRule( + ApplyRule( Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate ), ] diff --git a/tests/test_middleware.py b/tests/test_middleware.py index b90c1b78..f36873c7 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -14,7 +14,7 @@ from url_matcher.util import get_domain from web_poet import default_registry from web_poet.page_inputs import HttpResponse, RequestUrl, ResponseUrl -from web_poet.pages import ItemPage, ItemWebPage, WebPage +from web_poet.pages import ItemPage, WebPage from scrapy_poet import DummyResponse, InjectionMiddleware, callback_for from scrapy_poet.cache import SqlitedictCache @@ -64,7 +64,7 @@ def get(self): @attr.s(auto_attribs=True) -class ProductPage(ItemWebPage): +class ProductPage(WebPage): breadcrumbs: BreadcrumbsExtraction def to_item(self): @@ -118,7 +118,7 @@ def test_overrides(settings): @attr.s(auto_attribs=True) -class OptionalAndUnionPage(ItemWebPage): +class OptionalAndUnionPage(WebPage): breadcrumbs: BreadcrumbsExtraction 
opt_check_1: Optional[BreadcrumbsExtraction] opt_check_2: Optional[str] # str is not Injectable, so None expected here @@ -201,7 +201,7 @@ def __call__(self, to_provide): @attr.s(auto_attribs=True) -class ProvidedWithDeferredPage(ItemWebPage): +class ProvidedWithDeferredPage(WebPage): provided: ProvidedWithDeferred def to_item(self): @@ -472,7 +472,7 @@ def test_web_poet_integration(settings): from web_poet import default_registry - SCRAPY_POET_OVERRIDES = default_registry.get_overrides() + SCRAPY_POET_OVERRIDES = default_registry.get_rules() """ # Only import them in this test scope since they need to be synced with @@ -480,9 +480,9 @@ def test_web_poet_integration(settings): from tests.po_lib import PORT, POOverriden # Override rules are defined in `tests/po_lib/__init__.py`. - rules = default_registry.get_overrides() + rules = default_registry.get_rules() - # Converting it to a set removes potential duplicate OverrideRules + # Converting it to a set removes potential duplicate ApplyRules settings["SCRAPY_POET_OVERRIDES"] = set(rules) item, url, _ = yield crawl_single_item( diff --git a/tests/test_retries.py b/tests/test_retries.py index 2a72a90c..65b2d09f 100644 --- a/tests/test_retries.py +++ b/tests/test_retries.py @@ -3,7 +3,7 @@ from pytest_twisted import inlineCallbacks from scrapy import Spider from web_poet.exceptions import Retry -from web_poet.pages import ItemWebPage +from web_poet.pages import WebPage from tests.utils import EchoResource, MockServer, make_crawler @@ -28,7 +28,7 @@ def test_retry_once(): with MockServer(EchoResource) as server: - class ItemPage(ItemWebPage): + class ItemPage(WebPage): def to_item(self): if retries.popleft(): raise Retry @@ -58,7 +58,7 @@ def test_retry_max(): with MockServer(EchoResource) as server: - class ItemPage(ItemWebPage): + class ItemPage(WebPage): def to_item(self): if retries.popleft(): raise Retry @@ -86,7 +86,7 @@ def test_retry_exceeded(): with MockServer(EchoResource) as server: - class ItemPage(ItemWebPage): + class ItemPage(WebPage): def to_item(self): raise Retry @@ -113,7 +113,7 @@ def test_retry_max_configuration(): with MockServer(EchoResource) as server: - class ItemPage(ItemWebPage): + class ItemPage(WebPage): def to_item(self): if retries.popleft(): raise Retry @@ -146,7 +146,7 @@ def test_non_retry_exception(): with MockServer(EchoResource) as server: - class ItemPage(ItemWebPage): + class ItemPage(WebPage): def to_item(self): raise RuntimeError diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py index 6e5db371..a82f077d 100644 --- a/tests/test_scrapy_dependencies.py +++ b/tests/test_scrapy_dependencies.py @@ -3,7 +3,7 @@ from pytest_twisted import inlineCallbacks from scrapy import Spider from scrapy.http import Request -from web_poet.pages import ItemWebPage +from web_poet.pages import WebPage from scrapy_poet.injection import SCRAPY_PROVIDED_CLASSES from scrapy_poet.page_input_providers import ( @@ -45,7 +45,7 @@ def __call__(self, to_provide, obj: scrapy_class): return [PageData(scrapy_class=scrapy_class.__name__)] @attr.s(auto_attribs=True) - class Page(ItemWebPage): + class Page(WebPage): page_data: PageData @@ -81,7 +81,7 @@ def test_scrapy_dependencies_on_page_objects(scrapy_class, settings): """Scrapy dependencies should not be injected into Page Objects.""" @attr.s(auto_attribs=True) - class Page(ItemWebPage): + class Page(WebPage): scrapy_obj: scrapy_class diff --git a/tox.ini b/tox.ini index bb992243..4e692a3e 100644 --- a/tox.ini +++ b/tox.ini @@ -27,7 +27,7 @@ deps = 
scrapy==2.6.0 sqlitedict==1.5.0 url-matcher==0.2.0 - web-poet==0.4.0 + web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls-with-item#egg=web-poet [testenv:asyncio-min] basepython = python3.7 From b7047a89a26f329ca4b061a3aa33c5d4f4338fa7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 6 Oct 2022 15:32:48 +0800 Subject: [PATCH 2/9] fix failing mypy tests --- scrapy_poet/overrides.py | 11 +++++++---- tests/test_middleware.py | 2 +- tox.ini | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index 66c904d5..a3387742 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -1,16 +1,18 @@ import logging from abc import ABC, abstractmethod from collections import defaultdict -from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union from scrapy import Request from scrapy.crawler import Crawler from url_matcher import Patterns, URLMatcher +from web_poet import ItemPage from web_poet.rules import ApplyRule logger = logging.getLogger(__name__) -RuleAsTuple = Union[Tuple[str, Callable, Callable], List] +PageObject = Type[ItemPage] +RuleAsTuple = Union[Tuple[str, PageObject, PageObject], List] RuleFromUser = Union[RuleAsTuple, ApplyRule] @@ -90,7 +92,7 @@ def from_crawler(cls, crawler: Crawler) -> Crawler: def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None: self.rules: List[ApplyRule] = [] - self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher) + self.matcher: Dict[PageObject, URLMatcher] = defaultdict(URLMatcher) for rule in rules or []: self.add_rule(rule) logger.debug(f"List of parsed ApplyRules:\n{self.rules}") @@ -108,7 +110,8 @@ def add_rule(self, rule: RuleFromUser) -> None: for_patterns=Patterns([pattern]), use=use, instead_of=instead_of ) self.rules.append(rule) - self.matcher[rule.instead_of].add_or_update( + # FIXME: This key will change with the new rule.to_return + self.matcher[rule.instead_of].add_or_update( # type: ignore len(self.rules) - 1, rule.for_patterns ) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index f36873c7..4c40fdc9 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -149,7 +149,7 @@ def test_optional_and_unions(settings): @attr.s(auto_attribs=True) class ProvidedWithDeferred: msg: str - response: HttpResponse # it should be None because this class is provided + response: Optional[HttpResponse] # it should be None because this class is provided @attr.s(auto_attribs=True) diff --git a/tox.ini b/tox.ini index 4e692a3e..738d770a 100644 --- a/tox.ini +++ b/tox.ini @@ -38,7 +38,7 @@ deps = [testenv:mypy] deps = - mypy==0.790 + mypy==0.982 commands = mypy --ignore-missing-imports --no-warn-no-return scrapy_poet tests From 2d2ee3610363937ac07b3f60196cf09f9ce0c215 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 17 Oct 2022 17:59:06 +0800 Subject: [PATCH 3/9] remove to_item in some examples --- example/example/spiders/books_04_overrides_02.py | 3 +-- example/example/spiders/books_04_overrides_03.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index 78d6c80e..c98e41ff 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -20,8 +20,7 @@ def book_urls(self): class 
BookPage(WebPage):
-    def to_item(self):
-        return None
+    ...
 
 
 class BTSBookListPage(BookListPage):
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py
index db9a0dc6..237354cd 100644
--- a/example/example/spiders/books_04_overrides_03.py
+++ b/example/example/spiders/books_04_overrides_03.py
@@ -22,8 +22,7 @@ def book_urls(self):
 
 
 class BookPage(WebPage):
-    def to_item(self):
-        return None
+    ...
 
 
 @handle_urls("toscrape.com", overrides=BookListPage)

From bfb237b434a840dfdedcc1e2c2d8c7c35e610b57 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Tue, 18 Oct 2022 13:52:52 +0800
Subject: [PATCH 4/9] Convert '...' to pass
---
 example/example/spiders/books_04_overrides_02.py | 2 +-
 example/example/spiders/books_04_overrides_03.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py
index c98e41ff..99dd3802 100644
--- a/example/example/spiders/books_04_overrides_02.py
+++ b/example/example/spiders/books_04_overrides_02.py
@@ -20,7 +20,7 @@ def book_urls(self):
 
 
 class BookPage(WebPage):
-    ...
+    pass
 
 
 class BTSBookListPage(BookListPage):
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py
index 237354cd..6d0020c2 100644
--- a/example/example/spiders/books_04_overrides_03.py
+++ b/example/example/spiders/books_04_overrides_03.py
@@ -22,7 +22,7 @@ def book_urls(self):
 
 
 class BookPage(WebPage):
-    ...
+    pass
 
 
 @handle_urls("toscrape.com", overrides=BookListPage)

From c952c736bb3d8619b714f96a47ad6a410512025b Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Mon, 21 Nov 2022 17:33:51 +0800
Subject: [PATCH 5/9] use new web-poet==0.6.0
---
 CHANGELOG.rst | 2 +-
 setup.py      | 2 +-
 tox.ini       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e5a8f644..2dab38f5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,7 +8,7 @@ TBR
 * Provider for ``web_poet.ResponseUrl`` is added, which allows to access the
   response URL in the page object. This triggers a download unlike the provider
   for ``web_poet.RequestUrl``.
-* Move from web-poet 0.5.0 to 0.6.0.
+* Now requires web-poet >= 0.6.0.
 
   * Updates all examples in the docs and tests from the deprecated
     ``web_poet.ItemWebPage`` into ``web_poet.WebPage``.
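
For context, the bulk of this series is the ``ItemWebPage`` →
``WebPage`` rename summarized in the changelog entry above. A minimal
sketch of what that rename looks like in user code (``BookPage`` and its
item dict mirror ``example/example/spiders/books_02.py``; assumes
web-poet >= 0.6.0 is installed):

.. code-block:: python

    # Before this series, with web-poet < 0.6.0 (now deprecated):
    #
    #     from web_poet import ItemWebPage
    #
    #     class BookPage(ItemWebPage):
    #         def to_item(self):
    #             return {"url": self.url}

    # After this series -- WebPage keeps the same response shortcuts
    # (self.url, self.css, ...), so to_item() bodies stay unchanged:
    from web_poet import WebPage


    class BookPage(WebPage):
        def to_item(self):
            return {"url": self.url}
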
diff --git a/setup.py b/setup.py index 8856e6f8..02c127e1 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "sqlitedict >= 1.5.0", "twisted >= 18.9.0", "url-matcher >= 0.2.0", - "web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls-with-item#egg=web-poet", + "web-poet >= 0.6.0", ], classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tox.ini b/tox.ini index 738d770a..d2654859 100644 --- a/tox.ini +++ b/tox.ini @@ -27,7 +27,7 @@ deps = scrapy==2.6.0 sqlitedict==1.5.0 url-matcher==0.2.0 - web-poet @ git+https://git@github.com/scrapinghub/web-poet@handle_urls-with-item#egg=web-poet + web-poet==0.6.0 [testenv:asyncio-min] basepython = python3.7 From c952c736bb3d8619b714f96a47ad6a410512025b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 21 Nov 2022 17:45:07 +0800 Subject: [PATCH 6/9] fix failing tests --- docs/intro/basic-tutorial.rst | 2 +- docs/overrides.rst | 4 ++-- scrapy_poet/cache.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 8640f574..e026b928 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -512,7 +512,7 @@ For more info on this, you can refer to these docs: * ``scrapy-poet``'s :ref:`overrides` Tutorial section. * External `web-poet`_ docs. - * Specifically, the :external:ref:`intro-overrides` Tutorial section. + * Specifically, the :external:ref:`rules-intro` Tutorial section. Next steps ========== diff --git a/docs/overrides.rst b/docs/overrides.rst index 6b3b5d96..bc5ae3f7 100644 --- a/docs/overrides.rst +++ b/docs/overrides.rst @@ -129,7 +129,7 @@ along with where it is applied. This can be done by decorating the Page Objects with :py:func:`web_poet.handle_urls` provided by `web-poet`_. .. tip:: - Make sure to read the :external:ref:`intro-overrides` Tutorial section of + Make sure to read the :external:ref:`rules-intro` Tutorial section of `web-poet`_ to learn all of its other functionalities that is not covered in this section. @@ -183,7 +183,7 @@ Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as For more info and advanced features of `web-poet`_'s :py:func:`web_poet.handle_urls` and its registry, kindly read the `web-poet `_ - documentation, specifically its :external:ref:`intro-overrides` tutorial + documentation, specifically its :external:ref:`rules-intro` tutorial section. diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py index dcae642b..eebc8897 100644 --- a/scrapy_poet/cache.py +++ b/scrapy_poet/cache.py @@ -16,7 +16,8 @@ def __getitem__(self, fingerprint: str) -> Any: def __setitem__(self, fingerprint: str, value) -> None: pass - def close(self): + @abc.abstractmethod + def close(self) -> None: pass From 04bcb9a0badf0bcea2a5facc11627f9218b11ac9 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 21 Nov 2022 19:15:46 +0800 Subject: [PATCH 7/9] updates after web-poet==0.6.0 has been released --- CHANGELOG.rst | 8 ++++++-- docs/overrides.rst | 8 ++++---- example/example/spiders/books_04_overrides_03.py | 8 ++++---- scrapy_poet/overrides.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2dab38f5..4df1a5f8 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -10,8 +10,12 @@ TBR for ``web_poet.RequestUrl``. * Now requires web-poet >= 0.6.0. - * Updates all examples in the docs and tests from the deprecated - ``web_poet.ItemWebPage`` into ``web_poet.WebPage``. 
+  * All examples in the docs and tests now use ``web_poet.WebPage``
+    instead of ``web_poet.ItemWebPage``.
+  * The new ``instead_of`` parameter of the ``@handle_urls`` decorator
+    is now preferred instead of the deprecated ``overrides`` parameter.
+  * ``web_poet.callback_for`` doesn't require an implemented ``to_item``
+    method anymore.
   * The Registry now uses ``web_poet.ApplyRule`` instead of
     ``web_poet.OverrideRule``.
 
diff --git a/docs/overrides.rst b/docs/overrides.rst
index bc5ae3f7..f9f759db 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -18,7 +18,7 @@ page.
   rules using tuples and :py:class:`web_poet.ApplyRule`
 - `Example 3 `_:
   rules using :py:func:`web_poet.handle_urls` decorator and retrieving them
-  via :py:meth:`web_poet.PageObjectRegistry.get_rules`
+  via :py:meth:`web_poet.rules.RulesRegistry.get_rules`
 
 Page Objects refinement
 =======================
@@ -140,7 +140,7 @@ Let's see an example:
 
     from web_poet import handle_urls
 
-    @handle_urls("toscrape.com", BookPage)
+    @handle_urls("toscrape.com", instead_of=BookPage)
     class BTSBookPage(BookPage):
 
         def to_item(self):
@@ -155,7 +155,7 @@ for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically
 using these annotations, you can directly interact with `web-poet`_'s
-``default_registry`` (an instance of :py:class:`web_poet.PageObjectRegistry`).
+``default_registry`` (an instance of :py:class:`web_poet.rules.RulesRegistry`).
 
 For example:
 
@@ -171,7 +171,7 @@ For example:
     # To get all of the Override Rules that were declared via annotations.
     SCRAPY_POET_OVERRIDES = default_registry.get_rules()
 
-The :py:meth:`web_poet.PageObjectRegistry.get_rules` method of the
+The :py:meth:`web_poet.rules.RulesRegistry.get_rules` method of the
 ``default_registry`` above returns ``List[ApplyRule]`` that were declared
 using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much
 more convenient than manually defining all of the :py:class:`web_poet.ApplyRule`.
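
The example spiders below apply the same renames. As a condensed sketch
of the new spellings combined (``BookPage``/``BTSBookPage`` follow
``example/example/spiders/books_04_overrides_03.py``; the returned item
dict is illustrative):

.. code-block:: python

    from web_poet import WebPage, default_registry, handle_urls


    class BookPage(WebPage):
        pass


    # ``instead_of=`` replaces the deprecated ``overrides=`` keyword.
    @handle_urls("toscrape.com", instead_of=BookPage)
    class BTSBookPage(BookPage):
        def to_item(self):
            return {"url": self.url}


    # ``get_rules()`` replaces the deprecated ``get_overrides()`` and
    # returns a list of ``ApplyRule`` objects.
    SCRAPY_POET_OVERRIDES = default_registry.get_rules()

Either form can also be set per spider via ``custom_settings``, as
``books_04_overrides_03.py`` does.
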
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py index 6d0020c2..32c1f151 100644 --- a/example/example/spiders/books_04_overrides_03.py +++ b/example/example/spiders/books_04_overrides_03.py @@ -25,7 +25,7 @@ class BookPage(WebPage): pass -@handle_urls("toscrape.com", overrides=BookListPage) +@handle_urls("toscrape.com", instead_of=BookListPage) class BTSBookListPage(BookListPage): """Logic to extract listings from pages like https://books.toscrape.com""" @@ -33,7 +33,7 @@ def book_urls(self): return self.css(".image_container a::attr(href)").getall() -@handle_urls("toscrape.com", overrides=BookPage) +@handle_urls("toscrape.com", instead_of=BookPage) class BTSBookPage(BookPage): """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html""" @@ -44,7 +44,7 @@ def to_item(self): } -@handle_urls("bookpage.com", overrides=BookListPage) +@handle_urls("bookpage.com", instead_of=BookListPage) class BPBookListPage(BookListPage): """Logic to extract listings from pages like https://bookpage.com/reviews""" @@ -52,7 +52,7 @@ def book_urls(self): return self.css("article.post h4 a::attr(href)").getall() -@handle_urls("bookpage.com", overrides=BookPage) +@handle_urls("bookpage.com", instead_of=BookPage) class BPBookPage(BookPage): """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction""" diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py index a3387742..f13e37da 100644 --- a/scrapy_poet/overrides.py +++ b/scrapy_poet/overrides.py @@ -70,7 +70,7 @@ class OverridesRegistry(OverridesRegistryBase): the following code below. It finds all the rules annotated using web-poet_'s :py:func:`web_poet.handle_urls` as a decorator that were registered into ``web_poet.default_registry`` (an instance of - :py:class:`web_poet.rules.PageObjectRegistry`). + :py:class:`web_poet.rules.RulesRegistry`). .. 
code-block:: python

From 4a8807abbddc5f84f039a726ac22631d5b760d52 Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Mon, 21 Nov 2022 19:58:48 +0800
Subject: [PATCH 8/9] revert turning close() into abstractmethod; avoid
 PageObject type alias
---
 scrapy_poet/cache.py     | 3 +--
 scrapy_poet/overrides.py | 5 ++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py
index eebc8897..bb172b7d 100644
--- a/scrapy_poet/cache.py
+++ b/scrapy_poet/cache.py
@@ -16,8 +16,7 @@ def __getitem__(self, fingerprint: str) -> Any:
     def __setitem__(self, fingerprint: str, value) -> None:
         pass
 
-    @abc.abstractmethod
-    def close(self) -> None:
+    def close(self) -> None:  # noqa: B027
         pass
 
 
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index f13e37da..6aca416e 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -11,8 +11,7 @@
 
 logger = logging.getLogger(__name__)
 
-PageObject = Type[ItemPage]
-RuleAsTuple = Union[Tuple[str, PageObject, PageObject], List]
+RuleAsTuple = Union[Tuple[str, Type[ItemPage], Type[ItemPage]], List]
 RuleFromUser = Union[RuleAsTuple, ApplyRule]
 
 
@@ -92,7 +91,7 @@ def from_crawler(cls, crawler: Crawler) -> Crawler:
 
     def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None:
         self.rules: List[ApplyRule] = []
-        self.matcher: Dict[PageObject, URLMatcher] = defaultdict(URLMatcher)
+        self.matcher: Dict[Type[ItemPage], URLMatcher] = defaultdict(URLMatcher)
         for rule in rules or []:
             self.add_rule(rule)
         logger.debug(f"List of parsed ApplyRules:\n{self.rules}")

From 4881d6fa4b8163abd96cc1b97d9fde22f1066b9a Mon Sep 17 00:00:00 2001
From: Kevin Lloyd Bernal
Date: Mon, 21 Nov 2022 20:08:10 +0800
Subject: [PATCH 9/9] =?UTF-8?q?bump=20mypy=200.982=20=E2=86=92=200.991;=20?=
 =?UTF-8?q?fix=20typing=20issues?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 pyproject.toml                    | 10 ++++++++++
 scrapy_poet/injection.py          |  2 +-
 tests/test_downloader.py          | 27 ++++++++++++++-------------
 tests/test_scrapy_dependencies.py |  8 ++++----
 tox.ini                           |  2 +-
 5 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cec60096..ec5bcbc2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,3 +4,13 @@ line-length = 88
 [tool.isort]
 profile = "black"
 multi_line_output = 3
+
+[[tool.mypy.overrides]]
+module = [
+    "tests.test_downloader.*",
+    "tests.test_scrapy_dependencies.*",
+]
+# Ignore this type of error since mypy expects an Iterable return
+# when test cases are decorated with @inlineCallbacks. However, the
+# tests don't return anything at all.
+
+disable_error_code = "misc" diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index d44d445e..50f9ad8d 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -359,7 +359,7 @@ def is_provider_requiring_scrapy_response(provider): def get_injector_for_testing( providers: Mapping, - additional_settings: Dict = None, + additional_settings: Optional[Dict] = None, overrides_registry: Optional[OverridesRegistryBase] = None, ) -> Injector: """ diff --git a/tests/test_downloader.py b/tests/test_downloader.py index adc36954..da3c68af 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -1,4 +1,5 @@ from functools import partial +from typing import Callable from unittest import mock import attr @@ -26,13 +27,13 @@ @pytest.fixture -def scrapy_downloader(): +def scrapy_downloader() -> Callable: mock_downloader = AsyncMock() return create_scrapy_downloader(mock_downloader) @ensureDeferred -async def test_incompatible_scrapy_request(scrapy_downloader): +async def test_incompatible_scrapy_request(scrapy_downloader) -> None: """The Request must be web_poet.HttpRequest and not anything else.""" req = scrapy.Request("https://example.com") @@ -42,7 +43,7 @@ async def test_incompatible_scrapy_request(scrapy_downloader): @pytest.fixture -def fake_http_response(): +def fake_http_response() -> web_poet.HttpResponse: return web_poet.HttpResponse( "https://example.com", b"some content", @@ -52,7 +53,7 @@ def fake_http_response(): @ensureDeferred -async def test_scrapy_poet_downloader(fake_http_response): +async def test_scrapy_poet_downloader(fake_http_response) -> None: req = web_poet.HttpRequest("https://example.com") with mock.patch( @@ -77,7 +78,7 @@ async def test_scrapy_poet_downloader(fake_http_response): @ensureDeferred -async def test_scrapy_poet_downloader_ignored_request(): +async def test_scrapy_poet_downloader_ignored_request() -> None: """It should handle IgnoreRequest from Scrapy according to the web poet standard on additional request error handling.""" req = web_poet.HttpRequest("https://example.com") @@ -94,7 +95,7 @@ async def test_scrapy_poet_downloader_ignored_request(): @ensureDeferred -async def test_scrapy_poet_downloader_twisted_error(): +async def test_scrapy_poet_downloader_twisted_error() -> None: req = web_poet.HttpRequest("https://example.com") with mock.patch( @@ -109,7 +110,7 @@ async def test_scrapy_poet_downloader_twisted_error(): @ensureDeferred -async def test_scrapy_poet_downloader_head_redirect(fake_http_response): +async def test_scrapy_poet_downloader_head_redirect(fake_http_response) -> None: req = web_poet.HttpRequest("https://example.com", method="HEAD") with mock.patch( @@ -127,7 +128,7 @@ async def test_scrapy_poet_downloader_head_redirect(fake_http_response): @inlineCallbacks -def test_additional_requests_success(): +def test_additional_requests_success() -> None: items = [] with MockServer(EchoResource) as server: @@ -164,7 +165,7 @@ async def parse(self, response, page: ItemPage): @inlineCallbacks -def test_additional_requests_bad_response(): +def test_additional_requests_bad_response() -> None: items = [] with MockServer(StatusResource) as server: @@ -203,7 +204,7 @@ async def parse(self, response, page: ItemPage): @inlineCallbacks -def test_additional_requests_connection_issue(): +def test_additional_requests_connection_issue() -> None: items = [] with mock.patch( @@ -250,7 +251,7 @@ async def parse(self, response, page: ItemPage): @inlineCallbacks -def test_additional_requests_ignored_request(): +def 
test_additional_requests_ignored_request() -> None: items = [] with MockServer(EchoResource) as server: @@ -307,7 +308,7 @@ async def parse(self, response, page: ItemPage): strict=True, ) @inlineCallbacks -def test_additional_requests_unhandled_downloader_middleware_exception(): +def test_additional_requests_unhandled_downloader_middleware_exception() -> None: items = [] with MockServer(EchoResource) as server: @@ -353,7 +354,7 @@ async def parse(self, response, page: ItemPage): @inlineCallbacks -def test_additional_requests_dont_filter(): +def test_additional_requests_dont_filter() -> None: """Verify that while duplicate regular requests are filtered out, additional requests are not (neither relative to the main requests not relative to each other). diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py index a82f077d..660c075a 100644 --- a/tests/test_scrapy_dependencies.py +++ b/tests/test_scrapy_dependencies.py @@ -30,7 +30,7 @@ class ProductHtml(HtmlResource): @inlineCallbacks @pytest.mark.parametrize("scrapy_class", SCRAPY_PROVIDED_CLASSES) -def test_scrapy_dependencies_on_providers(scrapy_class, settings): +def test_scrapy_dependencies_on_providers(scrapy_class, settings) -> None: """Scrapy dependencies should be injected into Providers.""" @attr.s(auto_attribs=True) @@ -41,7 +41,7 @@ class PageDataProvider(PageObjectInputProvider): provided_classes = {PageData} - def __call__(self, to_provide, obj: scrapy_class): + def __call__(self, to_provide, obj: scrapy_class): # type: ignore[valid-type] return [PageData(scrapy_class=scrapy_class.__name__)] @attr.s(auto_attribs=True) @@ -77,13 +77,13 @@ def parse(self, response, page: Page): @inlineCallbacks @pytest.mark.parametrize("scrapy_class", SCRAPY_PROVIDED_CLASSES) -def test_scrapy_dependencies_on_page_objects(scrapy_class, settings): +def test_scrapy_dependencies_on_page_objects(scrapy_class, settings) -> None: """Scrapy dependencies should not be injected into Page Objects.""" @attr.s(auto_attribs=True) class Page(WebPage): - scrapy_obj: scrapy_class + scrapy_obj: scrapy_class # type: ignore[valid-type] def to_item(self): return { diff --git a/tox.ini b/tox.ini index d2654859..a798b740 100644 --- a/tox.ini +++ b/tox.ini @@ -38,7 +38,7 @@ deps = [testenv:mypy] deps = - mypy==0.982 + mypy==0.991 commands = mypy --ignore-missing-imports --no-warn-no-return scrapy_poet tests
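
To recap the user-facing contract these nine patches preserve:
``SCRAPY_POET_OVERRIDES`` still accepts both rule spellings, and
``OverridesRegistry.add_rule()`` normalizes the tuple form into an
``ApplyRule``. A sketch of the equivalence (``my_project.pages`` and its
import are hypothetical stand-ins; ``ISBNBookPage``/``BookPage`` echo
the page objects in docs/overrides.rst):

.. code-block:: python

    from url_matcher import Patterns
    from web_poet.rules import ApplyRule

    from my_project.pages import BookPage, ISBNBookPage  # hypothetical

    # The two entries below declare the same override: add_rule() turns
    # the 3-tuple into the ApplyRule shown after it. In practice you
    # would keep only one of the two forms.
    SCRAPY_POET_OVERRIDES = [
        ("books.toscrape.com", ISBNBookPage, BookPage),
        ApplyRule(
            for_patterns=Patterns(["books.toscrape.com"]),
            use=ISBNBookPage,
            instead_of=BookPage,
        ),
    ]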