diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 43e01502..4df1a5f8 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,6 +8,17 @@ TBR
 * Provider for ``web_poet.ResponseUrl`` is added, which allows to access the
   response URL in the page object. This triggers a download unlike the provider
   for ``web_poet.RequestUrl``.
+* Now requires web-poet >= 0.6.0.
+
+    * All examples in the docs and tests now use ``web_poet.WebPage``
+      instead of ``web_poet.ItemWebPage``.
+    * The new ``instead_of`` parameter of the ``@handle_urls`` decorator
+      is now preferred over the deprecated ``overrides`` parameter.
+    * ``web_poet.callback_for`` doesn't require an implemented ``to_item``
+      method anymore.
+    * The Registry now uses ``web_poet.ApplyRule`` instead of
+      ``web_poet.OverrideRule``.
+
 
 0.5.1 (2022-07-28)
 ------------------
diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst
index b893d7ea..9662d240 100644
--- a/docs/intro/advanced-tutorial.rst
+++ b/docs/intro/advanced-tutorial.rst
@@ -48,7 +48,7 @@ Suppose we have the following Page Object:
 
     @attr.define
-    class ProductPage(web_poet.ItemWebPage):
+    class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
 
         async def to_item(self):
@@ -110,7 +110,7 @@ This basically acts as a switch to update the behavior of the Page Object:
 
     @attr.define
-    class ProductPage(web_poet.ItemWebPage):
+    class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
         page_params: web_poet.PageParams
diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst
index 691f020c..e026b928 100644
--- a/docs/intro/basic-tutorial.rst
+++ b/docs/intro/basic-tutorial.rst
@@ -65,10 +65,10 @@ out of the spider class.
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage
+    from web_poet.pages import WebPage
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         """Individual book page on books.toscrape.com website, e.g.
         http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
         """
@@ -93,10 +93,10 @@ extract a property from the ``to_item`` method:
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage
+    from web_poet.pages import WebPage
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         """Individual book page on books.toscrape.com website"""
 
         @property
@@ -245,11 +245,11 @@ At the end of our job, the spider should look like this:
 .. code-block:: python
 
     import scrapy
-    from web_poet.pages import ItemWebPage
+    from web_poet.pages import WebPage
 
     from scrapy_poet import callback_for
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         """Individual book page on books.toscrape.com website, e.g.
         http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
         """
@@ -353,7 +353,7 @@ existing Page Objects as subclasses of them:
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage, WebPage
+    from web_poet.pages import WebPage
 
 
     # ------ Base page objects ------
@@ -364,7 +364,7 @@ existing Page Objects as subclasses of them:
             return []
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         def to_item(self):
             return None
@@ -421,7 +421,7 @@ to implement new ones:
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage, WebPage
+    from web_poet.pages import WebPage
 
 
     class BPBookListPage(WebPage):
@@ -430,7 +430,7 @@ to implement new ones:
             return self.css("article.post h4 a::attr(href)").getall()
 
 
-    class BPBookPage(ItemWebPage):
+    class BPBookPage(WebPage):
 
         def to_item(self):
             return {
@@ -466,21 +466,21 @@ For example, the pattern ``books.toscrape.com/cataloge/category/`` is accepted
 and it would restrict the override only to category pages.
 
 It is even possible to configure more complex patterns by using the
-:py:class:`web_poet.overrides.OverrideRule` class instead of a triplet in
+:py:class:`web_poet.rules.ApplyRule` class instead of a triplet in
 the configuration. Another way of declaring the earlier config for
 ``SCRAPY_POET_OVERRIDES`` would be the following:
 
 .. code-block:: python
 
     from url_matcher import Patterns
-    from web_poet import OverrideRule
+    from web_poet import ApplyRule
 
 
     SCRAPY_POET_OVERRIDES = [
-        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
-        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
-        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
-        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
+        ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
+        ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
+        ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
+        ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
     ]
 
 As you can see, this could get verbose. The earlier tuple config simply offers
@@ -494,8 +494,8 @@ a shortcut to be more concise.
 Manually defining overrides like this would be inconvenient, most especially
 for larger projects. Fortunately, `web-poet`_ has a cool feature to annotate
 Page Objects like :py:func:`web_poet.handle_urls` that would define
-and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the
-:py:class:`web_poet.overrides.OverrideRule` rules could then be simply read as:
+and store the :py:class:`web_poet.rules.ApplyRule` for you. All of the
+:py:class:`web_poet.rules.ApplyRule` rules could then be simply read as:
 
 .. code:: python
@@ -505,14 +505,14 @@ and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
     consume_modules("external_package_A", "another_ext_package.lib")
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+    SCRAPY_POET_OVERRIDES = default_registry.get_rules()
 
 For more info on this, you can refer to these docs:
 
     * ``scrapy-poet``'s :ref:`overrides` Tutorial section.
     * External `web-poet`_ docs.
 
-        * Specifically, the :external:ref:`intro-overrides` Tutorial section.
+        * Specifically, the :external:ref:`rules-intro` Tutorial section.
 
 Next steps
 ==========
diff --git a/docs/overrides.rst b/docs/overrides.rst
index e278693d..f9f759db 100644
--- a/docs/overrides.rst
+++ b/docs/overrides.rst
@@ -15,10 +15,10 @@ page.
 
 - `Example 1 `_: rules using tuples
 - `Example 2 `_:
-  rules using tuples and :py:class:`web_poet.overrides.OverrideRule`
+  rules using tuples and :py:class:`web_poet.ApplyRule`
 - `Example 3 `_:
   rules using :py:func:`web_poet.handle_urls` decorator and retrieving them
-  via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides`
+  via :py:meth:`web_poet.rules.RulesRegistry.get_rules`
 
 Page Objects refinement
 =======================
@@ -44,7 +44,7 @@ using the following Page Object:
 
 .. code-block:: python
 
-    class ISBNBookPage(ItemWebPage):
+    class ISBNBookPage(WebPage):
 
         def __init__(self, response: HttpResponse, book_page: BookPage):
             super().__init__(response)
@@ -81,7 +81,7 @@ the obtained item with the ISBN from the page HTML.
 .. code-block:: python
 
     @attr.define
-    class ISBNBookPage(ItemWebPage):
+    class ISBNBookPage(WebPage):
         book_page: BookPage
 
         def to_item(self):
@@ -95,17 +95,17 @@ Overrides rules
 
 The default way of configuring the override rules is using triplets of the
 form (``url pattern``, ``override_type``, ``overridden_type``). But more
-complex rules can be introduced if the class :py:class:`web_poet.overrides.OverrideRule`
+complex rules can be introduced if the class :py:class:`web_poet.ApplyRule`
 is used. The following example configures an override that is only applied for
 book pages from ``books.toscrape.com``:
 
 .. code-block:: python
 
-    from web_poet import OverrideRule
+    from web_poet import ApplyRule
 
 
     SCRAPY_POET_OVERRIDES = [
-        OverrideRule(
+        ApplyRule(
             for_patterns=Patterns(
                 include=["books.toscrape.com/cataloge/*index.html|"],
                 exclude=["/catalogue/category/"]),
@@ -129,7 +129,7 @@ along with where it is applied. This can be done by decorating the Page Objects
 with :py:func:`web_poet.handle_urls` provided by `web-poet`_.
 
 .. tip::
-    Make sure to read the :external:ref:`intro-overrides` Tutorial section of
+    Make sure to read the :external:ref:`rules-intro` Tutorial section of
     `web-poet`_ to learn all of its other functionalities that is not covered
     in this section.
@@ -140,7 +140,7 @@ Let's see an example:
 
     from web_poet import handle_urls
 
-    @handle_urls("toscrape.com", BookPage)
+    @handle_urls("toscrape.com", instead_of=BookPage)
     class BTSBookPage(BookPage):
 
         def to_item(self):
@@ -155,7 +155,7 @@ for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically using these
 annotations, you can directly interact with `web-poet`_'s
-``default_registry`` (an instance of :py:class:`web_poet.overrides.PageObjectRegistry`).
+``default_registry`` (an instance of :py:class:`web_poet.rules.RulesRegistry`).
 
 For example:
@@ -169,21 +169,21 @@ For example:
     consume_modules("external_package_A", "another_ext_package.lib")
 
     # To get all of the Override Rules that were declared via annotations.
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+    SCRAPY_POET_OVERRIDES = default_registry.get_rules()
 
-The :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` method of the
-``default_registry`` above returns ``List[OverrideRule]`` that were declared
+The :py:meth:`web_poet.rules.RulesRegistry.get_rules` method of the
+``default_registry`` above returns ``List[ApplyRule]`` that were declared
 using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much
-more convenient that manually defining all of the :py:class:`web_poet.overrides.OverrideRule`.
+more convenient than manually defining all of the :py:class:`web_poet.ApplyRule`.
 
 Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
-``List[OverrideRule]``, you can easily modify it later on if needed.
+``List[ApplyRule]``, you can easily modify it later on if needed.
 
 .. note::
 
     For more info and advanced features of `web-poet`_'s
     :py:func:`web_poet.handle_urls` and its registry, kindly read the
     `web-poet `_
-    documentation, specifically its :external:ref:`intro-overrides` tutorial
+    documentation, specifically its :external:ref:`rules-intro` tutorial
     section.
 
diff --git a/docs/providers.rst b/docs/providers.rst
index 0f581fc9..927a44e7 100644
--- a/docs/providers.rst
+++ b/docs/providers.rst
@@ -22,7 +22,7 @@ Creating providers
 Providers are responsible for building dependencies needed by Injectable
 objects. A good example would be the ``HttpResponseProvider``,
 which builds and provides a ``web_poet.HttpResponse`` instance for Injectables
-that need it, like the ``web_poet.ItemWebPage``.
+that need it, like the ``web_poet.WebPage``.
 
 .. code-block:: python
@@ -271,7 +271,7 @@ Page Object uses it, the request is not ignored, for example:
 .. note::
 
     The code above is just for example purposes. If you need to use ``Response``
-    instances in your Page Objects, use built-in ``ItemWebPage`` - it has
+    instances in your Page Objects, use built-in ``WebPage`` - it has
     ``response`` attribute with ``HttpResponse``; no additional configuration
     is needed, as there is ``HttpResponseProvider`` enabled in ``scrapy-poet``
     by default.
diff --git a/example/example/spiders/books_02.py b/example/example/spiders/books_02.py
index a1f52c34..06867076 100644
--- a/example/example/spiders/books_02.py
+++ b/example/example/spiders/books_02.py
@@ -3,10 +3,10 @@
 BookPage is now independent of Scrapy.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,
diff --git a/example/example/spiders/books_02_1.py b/example/example/spiders/books_02_1.py
index 42eac332..b6835512 100644
--- a/example/example/spiders/books_02_1.py
+++ b/example/example/spiders/books_02_1.py
@@ -4,12 +4,12 @@
 boilerplate.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,
diff --git a/example/example/spiders/books_02_2.py b/example/example/spiders/books_02_2.py
index b9eccc41..a81960c5 100644
--- a/example/example/spiders/books_02_2.py
+++ b/example/example/spiders/books_02_2.py
@@ -11,12 +11,12 @@ it is better than defining callback explicitly.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,
diff --git a/example/example/spiders/books_02_3.py b/example/example/spiders/books_02_3.py
index 14cf53a3..66bc7e76 100644
--- a/example/example/spiders/books_02_3.py
+++ b/example/example/spiders/books_02_3.py
@@ -8,10 +8,10 @@ but it can be implemented, with Scrapy support.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,
diff --git a/example/example/spiders/books_04.py b/example/example/spiders/books_04.py
index 2b94dbd7..08daf7fc 100644
--- a/example/example/spiders/books_04.py
+++ b/example/example/spiders/books_04.py
@@ -2,7 +2,7 @@
 Scrapy spider which uses Page Objects both for crawling and extraction.
 """
 import scrapy
-from web_poet import ItemWebPage, WebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
@@ -12,7 +12,7 @@ def book_urls(self):
         return self.css(".image_container a::attr(href)").getall()
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,
diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py
index 268c6e6d..6d27b0b7 100644
--- a/example/example/spiders/books_04_overrides_01.py
+++ b/example/example/spiders/books_04_overrides_01.py
@@ -6,7 +6,7 @@ The default configured PO logic contains the logic for books.toscrape.com
 """
 import scrapy
-from web_poet import ItemWebPage, WebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
@@ -18,7 +18,7 @@ def book_urls(self):
         return self.css(".image_container a::attr(href)").getall()
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     """Logic to extract book info from pages like
     https://books.toscrape.com/catalogue/soumission_998/index.html"""
 
     def to_item(self):
@@ -35,7 +35,7 @@ def book_urls(self):
         return self.css("article.post h4 a::attr(href)").getall()
 
 
-class BPBookPage(ItemWebPage):
+class BPBookPage(WebPage):
     """Logic to extract from pages like
     https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
 
     def to_item(self):
diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py
index f707c2b2..99dd3802 100644
--- a/example/example/spiders/books_04_overrides_02.py
+++ b/example/example/spiders/books_04_overrides_02.py
@@ -8,8 +8,8 @@
 """
 import scrapy
 from url_matcher import Patterns
-from web_poet import ItemWebPage, WebPage
-from web_poet.overrides import OverrideRule
+from web_poet import WebPage
+from web_poet.rules import ApplyRule
 
 from scrapy_poet import callback_for
 
@@ -19,9 +19,8 @@ def book_urls(self):
         return []
 
 
-class BookPage(ItemWebPage):
-    def to_item(self):
-        return None
+class BookPage(WebPage):
+    pass
 
 
 class BTSBookListPage(BookListPage):
@@ -67,12 +66,12 @@ class BooksSpider(scrapy.Spider):
         ("toscrape.com", BTSBookListPage, BookListPage),
         ("toscrape.com", BTSBookPage, BookPage),
         # We could also use the long-form version if we want to.
-        OverrideRule(
+        ApplyRule(
            for_patterns=Patterns(["bookpage.com"]),
            use=BPBookListPage,
            instead_of=BookListPage,
        ),
-        OverrideRule(
+        ApplyRule(
            for_patterns=Patterns(["bookpage.com"]),
            use=BPBookPage,
            instead_of=BookPage,
diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py
index 525c75e6..32c1f151 100644
--- a/example/example/spiders/books_04_overrides_03.py
+++ b/example/example/spiders/books_04_overrides_03.py
@@ -11,7 +11,7 @@ store the rules in web-poet's registry.
 """
 import scrapy
-from web_poet import ItemWebPage, WebPage, default_registry, handle_urls
+from web_poet import WebPage, default_registry, handle_urls
 
 from scrapy_poet import callback_for
 
@@ -21,12 +21,11 @@ def book_urls(self):
         return []
 
 
-class BookPage(ItemWebPage):
-    def to_item(self):
-        return None
+class BookPage(WebPage):
+    pass
 
 
-@handle_urls("toscrape.com", overrides=BookListPage)
+@handle_urls("toscrape.com", instead_of=BookListPage)
 class BTSBookListPage(BookListPage):
     """Logic to extract listings from pages like https://books.toscrape.com"""
 
@@ -34,7 +33,7 @@ def book_urls(self):
         return self.css(".image_container a::attr(href)").getall()
 
 
-@handle_urls("toscrape.com", overrides=BookPage)
+@handle_urls("toscrape.com", instead_of=BookPage)
 class BTSBookPage(BookPage):
     """Logic to extract book info from pages like
     https://books.toscrape.com/catalogue/soumission_998/index.html"""
 
     def to_item(self):
@@ -45,7 +44,7 @@ def to_item(self):
         }
 
 
-@handle_urls("bookpage.com", overrides=BookListPage)
+@handle_urls("bookpage.com", instead_of=BookListPage)
 class BPBookListPage(BookListPage):
     """Logic to extract listings from pages like https://bookpage.com/reviews"""
 
@@ -53,7 +52,7 @@ def book_urls(self):
         return self.css("article.post h4 a::attr(href)").getall()
 
 
-@handle_urls("bookpage.com", overrides=BookPage)
+@handle_urls("bookpage.com", instead_of=BookPage)
 class BPBookPage(BookPage):
     """Logic to extract from pages like
     https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
 
@@ -68,7 +67,7 @@ class BooksSpider(scrapy.Spider):
     name = "books_04_overrides_03"
     start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"]
     # Configuring different page objects pages for different domains
-    custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_overrides()}
+    custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_rules()}
 
     def parse(self, response, page: BookListPage):
         yield from response.follow_all(page.book_urls(), callback_for(BookPage))
diff --git a/example/example/spiders/books_06.py b/example/example/spiders/books_06.py
index 4ab91897..4668e5a2 100644
--- a/example/example/spiders/books_06.py
+++ b/example/example/spiders/books_06.py
@@ -12,7 +12,7 @@
 import attr
 import scrapy
-from web_poet import Injectable, ItemWebPage, WebPage
+from web_poet import Injectable, WebPage
 
 
 class ListingsExtractor(WebPage):
@@ -37,7 +37,7 @@ class ListingsPage(Injectable):
 
 
 @attr.s(auto_attribs=True)
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     breadcrumbs: BreadcrumbsExtractor
 
     def recently_viewed_urls(self):
diff --git a/pyproject.toml b/pyproject.toml
index cec60096..ec5bcbc2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,3 +4,13 @@ line-length = 88
 [tool.isort]
 profile = "black"
 multi_line_output = 3
+
+[[tool.mypy.overrides]]
+module = [
+    "tests.test_downloader.*",
+    "tests.test_scrapy_dependencies.*",
+]
+# Ignore this type of error since mypy expects an Iterable return
+# when test cases are decorated with @inlineCallbacks. However, the
+# tests don't return anything at all.
+disable_error_code = "misc"
diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py
index fa2af323..a4eb15a9 100644
--- a/scrapy_poet/api.py
+++ b/scrapy_poet/api.py
@@ -107,11 +107,6 @@ def parse(self, response):
     if not issubclass(page_cls, ItemPage):
         raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.")
 
-    if getattr(page_cls.to_item, "__isabstractmethod__", False):
-        raise NotImplementedError(
-            f"{page_cls.__name__} should implement to_item method."
-        )
-
     # When the callback is used as an instance method of the spider, it expects
     # to receive 'self' as its first argument. When used as a simple inline
     # function, it expects to receive a response as its first argument.
diff --git a/scrapy_poet/cache.py b/scrapy_poet/cache.py
index dcae642b..bb172b7d 100644
--- a/scrapy_poet/cache.py
+++ b/scrapy_poet/cache.py
@@ -16,7 +16,7 @@ def __getitem__(self, fingerprint: str) -> Any:
     def __setitem__(self, fingerprint: str, value) -> None:
         pass
 
-    def close(self):
+    def close(self) -> None:  # noqa: B027
         pass
diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py
index d44d445e..50f9ad8d 100644
--- a/scrapy_poet/injection.py
+++ b/scrapy_poet/injection.py
@@ -359,7 +359,7 @@ def is_provider_requiring_scrapy_response(provider):
 
 def get_injector_for_testing(
     providers: Mapping,
-    additional_settings: Dict = None,
+    additional_settings: Optional[Dict] = None,
     overrides_registry: Optional[OverridesRegistryBase] = None,
 ) -> Injector:
     """
diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py
index a404ca00..6aca416e 100644
--- a/scrapy_poet/overrides.py
+++ b/scrapy_poet/overrides.py
@@ -1,17 +1,18 @@
 import logging
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Union
+from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union
 
 from scrapy import Request
 from scrapy.crawler import Crawler
 from url_matcher import Patterns, URLMatcher
-from web_poet.overrides import OverrideRule
+from web_poet import ItemPage
+from web_poet.rules import ApplyRule
 
 logger = logging.getLogger(__name__)
 
-RuleAsTuple = Union[Tuple[str, Callable, Callable], List]
-RuleFromUser = Union[RuleAsTuple, OverrideRule]
+RuleAsTuple = Union[Tuple[str, Type[ItemPage], Type[ItemPage]], List]
+RuleFromUser = Union[RuleAsTuple, ApplyRule]
 
 
 class OverridesRegistryBase(ABC):
@@ -29,7 +30,7 @@ class OverridesRegistry(OverridesRegistryBase):
     """
     Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES``
     in the spider settings. It is a list and each rule can be a tuple or an
-    instance of the class :py:class:`web_poet.overrides.OverrideRule`.
+    instance of the class :py:class:`web_poet.rules.ApplyRule`.
 
     If a tuple is provided:
@@ -45,7 +46,7 @@ class OverridesRegistry(OverridesRegistryBase):
     .. code-block:: python
 
         from url_matcher import Patterns
-        from scrapy_poet.overrides import OverrideRule
+        from web_poet.rules import ApplyRule
 
 
         SCRAPY_POET_OVERRIDES = [
@@ -53,7 +54,7 @@ class OverridesRegistry(OverridesRegistryBase):
             ("books.toscrape.com", ISBNBookPage, BookPage),
 
             # Option 2
-            OverrideRule(
+            ApplyRule(
                 for_patterns=Patterns(["books.toscrape.com"]),
                 use=MyBookListPage,
                 instead_of=BookListPage,
@@ -63,12 +64,12 @@ class OverridesRegistry(OverridesRegistryBase):
     .. _web-poet: https://web-poet.readthedocs.io
 
     Now, if you've used web-poet_'s built-in functionality to directly create
-    the :py:class:`web_poet.overrides.OverrideRule` in the Page Object via the
+    the :py:class:`web_poet.rules.ApplyRule` in the Page Object via the
     :py:func:`web_poet.handle_urls` annotation, you can quickly import them via
     the following code below. It finds all the rules annotated using web-poet_'s
     :py:func:`web_poet.handle_urls` as a decorator that were registered into
     ``web_poet.default_registry`` (an instance of
-    :py:class:`web_poet.overrides.PageObjectRegistry`).
+    :py:class:`web_poet.rules.RulesRegistry`).
 
     .. code-block:: python
@@ -78,9 +79,9 @@ class OverridesRegistry(OverridesRegistryBase):
         # import rules from other packages. Otherwise, it can be omitted.
         # More info about this caveat on web-poet docs.
         consume_modules("external_package_A.po", "another_ext_package.lib")
-        SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+        SCRAPY_POET_OVERRIDES = default_registry.get_rules()
 
-    Make sure to call :py:func:`web_poet.overrides.consume_modules` beforehand.
+    Make sure to call :py:func:`web_poet.rules.consume_modules` beforehand.
     More info on this at web-poet_.
     """
@@ -89,26 +90,27 @@ def from_crawler(cls, crawler: Crawler) -> Crawler:
         return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", []))
 
     def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None:
-        self.rules: List[OverrideRule] = []
-        self.matcher: Dict[Callable, URLMatcher] = defaultdict(URLMatcher)
+        self.rules: List[ApplyRule] = []
+        self.matcher: Dict[Type[ItemPage], URLMatcher] = defaultdict(URLMatcher)
         for rule in rules or []:
             self.add_rule(rule)
-        logger.debug(f"List of parsed OverrideRules:\n{self.rules}")
+        logger.debug(f"List of parsed ApplyRules:\n{self.rules}")
 
     def add_rule(self, rule: RuleFromUser) -> None:
         if isinstance(rule, (tuple, list)):
             if len(rule) != 3:
                 raise ValueError(
-                    f"Invalid overrides rule: {rule}. Rules as tuples must have "
+                    f"Invalid rule: {rule}. Rules as tuples must have "
                     f"3 elements: (1) the pattern, (2) the PO class used as a "
                     f"replacement and (3) the PO class to be replaced."
                 )
             pattern, use, instead_of = rule
-            rule = OverrideRule(
+            rule = ApplyRule(
                 for_patterns=Patterns([pattern]), use=use, instead_of=instead_of
             )
         self.rules.append(rule)
-        self.matcher[rule.instead_of].add_or_update(
+        # FIXME: This key will change with the new rule.to_return
+        self.matcher[rule.instead_of].add_or_update(  # type: ignore
             len(self.rules) - 1, rule.for_patterns
         )
diff --git a/setup.py b/setup.py
index ba573ce9..02c127e1 100755
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
         "sqlitedict >= 1.5.0",
         "twisted >= 18.9.0",
         "url-matcher >= 0.2.0",
-        "web-poet >= 0.4.0",
+        "web-poet >= 0.6.0",
     ],
     classifiers=[
         "Development Status :: 3 - Alpha",
diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py
index e3db57e3..01baf584 100644
--- a/tests/po_lib/__init__.py
+++ b/tests/po_lib/__init__.py
@@ -4,7 +4,7 @@
 import socket
 
 from url_matcher.util import get_domain
-from web_poet import ItemWebPage, handle_urls
+from web_poet import WebPage, handle_urls
 
 from tests.mockserver import get_ephemeral_port
 
@@ -13,12 +13,12 @@
 PORT = get_ephemeral_port()
 
 
-class POOverriden(ItemWebPage):
+class POOverriden(WebPage):
     def to_item(self):
-        return {"msg": "PO that will be replace"}
+        return {"msg": "PO that will be replaced"}
 
 
-@handle_urls(f"{DOMAIN}:{PORT}", overrides=POOverriden)
-class POIntegration(ItemWebPage):
+@handle_urls(f"{DOMAIN}:{PORT}", instead_of=POOverriden)
+class POIntegration(WebPage):
     def to_item(self):
         return {"msg": "PO replacement"}
diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py
index 61805b4a..752afe75 100644
--- a/tests/test_callback_for.py
+++ b/tests/test_callback_for.py
@@ -1,7 +1,7 @@
 import pytest
 import scrapy
 from pytest_twisted import ensureDeferred
-from web_poet.pages import ItemPage, ItemWebPage
+from web_poet.pages import ItemPage, WebPage
 
 from scrapy_poet import DummyResponse, callback_for
 
@@ -16,7 +16,7 @@ async def to_item(self):
         return "fake item page"
 
 
-class FakeItemWebPage(ItemWebPage):
+class FakeWebPage(WebPage):
     def to_item(self):
         return "fake item web page"
 
@@ -25,7 +25,7 @@ class MySpider(scrapy.Spider):
 
     name = "my_spider"
     parse_item = callback_for(FakeItemPage)
-    parse_web = callback_for(FakeItemWebPage)
+    parse_web = callback_for(FakeWebPage)
 
 
 class MySpiderAsync(scrapy.Spider):
@@ -140,16 +140,3 @@ class MyClass(object):
 
     msg = "MyClass should be a subclass of ItemPage."
     assert str(exc.value) == msg
-
-
-def test_not_implemented_method():
-    """Classes should implement to_item method."""
-
-    class MyClass(ItemPage):
-        pass
-
-    with pytest.raises(NotImplementedError) as exc:
-        callback_for(MyClass)
-
-    msg = "MyClass should implement to_item method."
-    assert str(exc.value) == msg
diff --git a/tests/test_downloader.py b/tests/test_downloader.py
index d0c76e08..da3c68af 100644
--- a/tests/test_downloader.py
+++ b/tests/test_downloader.py
@@ -1,4 +1,5 @@
 from functools import partial
+from typing import Callable
 from unittest import mock
 
 import attr
@@ -11,7 +12,7 @@
 from scrapy.exceptions import IgnoreRequest
 from web_poet import HttpClient
 from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError
-from web_poet.pages import ItemWebPage
+from web_poet.pages import WebPage
 
 from scrapy_poet.downloader import create_scrapy_downloader
 from scrapy_poet.utils import http_request_to_scrapy_request
@@ -26,13 +27,13 @@
 
 
 @pytest.fixture
-def scrapy_downloader():
+def scrapy_downloader() -> Callable:
     mock_downloader = AsyncMock()
     return create_scrapy_downloader(mock_downloader)
 
 
 @ensureDeferred
-async def test_incompatible_scrapy_request(scrapy_downloader):
+async def test_incompatible_scrapy_request(scrapy_downloader) -> None:
     """The Request must be web_poet.HttpRequest and not anything else."""
 
     req = scrapy.Request("https://example.com")
@@ -42,7 +43,7 @@ async def test_incompatible_scrapy_request(scrapy_downloader):
 
 
 @pytest.fixture
-def fake_http_response():
+def fake_http_response() -> web_poet.HttpResponse:
     return web_poet.HttpResponse(
         "https://example.com",
         b"some content",
@@ -52,7 +53,7 @@ def fake_http_response():
 
 
 @ensureDeferred
-async def test_scrapy_poet_downloader(fake_http_response):
+async def test_scrapy_poet_downloader(fake_http_response) -> None:
     req = web_poet.HttpRequest("https://example.com")
 
     with mock.patch(
@@ -77,7 +78,7 @@ async def test_scrapy_poet_downloader(fake_http_response):
 
 
 @ensureDeferred
-async def test_scrapy_poet_downloader_ignored_request():
+async def test_scrapy_poet_downloader_ignored_request() -> None:
     """It should handle IgnoreRequest from Scrapy according
     to the web poet standard on additional request error handling."""
     req = web_poet.HttpRequest("https://example.com")
@@ -94,7 +95,7 @@ async def test_scrapy_poet_downloader_ignored_request():
 
 
 @ensureDeferred
-async def test_scrapy_poet_downloader_twisted_error():
+async def test_scrapy_poet_downloader_twisted_error() -> None:
     req = web_poet.HttpRequest("https://example.com")
 
     with mock.patch(
@@ -109,7 +110,7 @@ async def test_scrapy_poet_downloader_twisted_error():
 
 
 @ensureDeferred
-async def test_scrapy_poet_downloader_head_redirect(fake_http_response):
+async def test_scrapy_poet_downloader_head_redirect(fake_http_response) -> None:
     req = web_poet.HttpRequest("https://example.com", method="HEAD")
 
     with mock.patch(
@@ -127,13 +128,13 @@ async def test_scrapy_poet_downloader_head_redirect(fake_http_response):
 
 
 @inlineCallbacks
-def test_additional_requests_success():
+def test_additional_requests_success() -> None:
     items = []
 
     with MockServer(EchoResource) as server:
 
         @attr.define
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             http: HttpClient
 
             async def to_item(self):
@@ -164,13 +165,13 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_bad_response():
+def test_additional_requests_bad_response() -> None:
     items = []
 
     with MockServer(StatusResource) as server:
 
         @attr.define
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             http: HttpClient
 
             async def to_item(self):
@@ -203,7 +204,7 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_connection_issue():
+def test_additional_requests_connection_issue() -> None:
     items = []
 
     with mock.patch(
@@ -217,7 +218,7 @@ def test_additional_requests_connection_issue():
         with MockServer(DelayedResource) as server:
 
             @attr.define
-            class ItemPage(ItemWebPage):
+            class ItemPage(WebPage):
                 http: HttpClient
 
                 async def to_item(self):
@@ -250,13 +251,13 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_ignored_request():
+def test_additional_requests_ignored_request() -> None:
     items = []
 
     with MockServer(EchoResource) as server:
 
         @attr.define
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             http: HttpClient
 
             async def to_item(self):
@@ -307,13 +308,13 @@ async def parse(self, response, page: ItemPage):
     strict=True,
 )
 @inlineCallbacks
-def test_additional_requests_unhandled_downloader_middleware_exception():
+def test_additional_requests_unhandled_downloader_middleware_exception() -> None:
     items = []
 
     with MockServer(EchoResource) as server:
 
         @attr.define
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             http: HttpClient
 
             async def to_item(self):
@@ -353,7 +354,7 @@ async def parse(self, response, page: ItemPage):
 
 
 @inlineCallbacks
-def test_additional_requests_dont_filter():
+def test_additional_requests_dont_filter() -> None:
     """Verify that while duplicate regular requests are filtered out,
     additional requests are not (neither relative to the main requests not
     relative to each other).
@@ -367,7 +368,7 @@ def test_additional_requests_dont_filter():
     with MockServer(EchoResource) as server:
 
         @attr.define
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             http: HttpClient
 
             async def to_item(self):
diff --git a/tests/test_injection.py b/tests/test_injection.py
index e2925de2..07ed895d 100644
--- a/tests/test_injection.py
+++ b/tests/test_injection.py
@@ -11,7 +11,7 @@
 from url_matcher.util import get_domain
 from web_poet import Injectable, ItemPage
 from web_poet.mixins import ResponseShortcutsMixin
-from web_poet.overrides import OverrideRule
+from web_poet.rules import ApplyRule
 
 from scrapy_poet import (
     CacheDataProviderMixin,
@@ -325,7 +325,7 @@ def test_overrides(self, providers, override_should_happen):
         # when we configure them for domain other-example.com
         overrides = [
             (domain, PriceInDollarsPO, PricePO),
-            OverrideRule(
+            ApplyRule(
                 Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate
             ),
         ]
diff --git a/tests/test_middleware.py b/tests/test_middleware.py
index b90c1b78..4c40fdc9 100644
--- a/tests/test_middleware.py
+++ b/tests/test_middleware.py
@@ -14,7 +14,7 @@
 from url_matcher.util import get_domain
 from web_poet import default_registry
 from web_poet.page_inputs import HttpResponse, RequestUrl, ResponseUrl
-from web_poet.pages import ItemPage, ItemWebPage, WebPage
+from web_poet.pages import ItemPage, WebPage
 
 from scrapy_poet import DummyResponse, InjectionMiddleware, callback_for
 from scrapy_poet.cache import SqlitedictCache
@@ -64,7 +64,7 @@ def get(self):
 
 
 @attr.s(auto_attribs=True)
-class ProductPage(ItemWebPage):
+class ProductPage(WebPage):
     breadcrumbs: BreadcrumbsExtraction
 
     def to_item(self):
@@ -118,7 +118,7 @@ def test_overrides(settings):
 
 
 @attr.s(auto_attribs=True)
-class OptionalAndUnionPage(ItemWebPage):
+class OptionalAndUnionPage(WebPage):
     breadcrumbs: BreadcrumbsExtraction
     opt_check_1: Optional[BreadcrumbsExtraction]
     opt_check_2: Optional[str]  # str is not Injectable, so None expected here
@@ -149,7 +149,7 @@ def test_optional_and_unions(settings):
 @attr.s(auto_attribs=True)
 class ProvidedWithDeferred:
     msg: str
-    response: HttpResponse  # it should be None because this class is provided
+    response: Optional[HttpResponse]  # it should be None because this class is provided
 
 
 @attr.s(auto_attribs=True)
@@ -201,7 +201,7 @@ def __call__(self, to_provide):
 
 
 @attr.s(auto_attribs=True)
-class ProvidedWithDeferredPage(ItemWebPage):
+class ProvidedWithDeferredPage(WebPage):
     provided: ProvidedWithDeferred
 
     def to_item(self):
@@ -472,7 +472,7 @@ def test_web_poet_integration(settings):
 
         from web_poet import default_registry
 
-        SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+        SCRAPY_POET_OVERRIDES = default_registry.get_rules()
     """
 
     # Only import them in this test scope since they need to be synced with
@@ -480,9 +480,9 @@ def test_web_poet_integration(settings):
     from tests.po_lib import PORT, POOverriden
 
     # Override rules are defined in `tests/po_lib/__init__.py`.
-    rules = default_registry.get_overrides()
+    rules = default_registry.get_rules()
 
-    # Converting it to a set removes potential duplicate OverrideRules
+    # Converting it to a set removes potential duplicate ApplyRules
     settings["SCRAPY_POET_OVERRIDES"] = set(rules)
 
     item, url, _ = yield crawl_single_item(
diff --git a/tests/test_retries.py b/tests/test_retries.py
index 2a72a90c..65b2d09f 100644
--- a/tests/test_retries.py
+++ b/tests/test_retries.py
@@ -3,7 +3,7 @@
 from pytest_twisted import inlineCallbacks
 from scrapy import Spider
 from web_poet.exceptions import Retry
-from web_poet.pages import ItemWebPage
+from web_poet.pages import WebPage
 
 from tests.utils import EchoResource, MockServer, make_crawler
 
@@ -28,7 +28,7 @@ def test_retry_once():
 
     with MockServer(EchoResource) as server:
 
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             def to_item(self):
                 if retries.popleft():
                     raise Retry
@@ -58,7 +58,7 @@ def test_retry_max():
 
     with MockServer(EchoResource) as server:
 
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             def to_item(self):
                 if retries.popleft():
                     raise Retry
@@ -86,7 +86,7 @@ def test_retry_exceeded():
 
     with MockServer(EchoResource) as server:
 
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             def to_item(self):
                 raise Retry
 
@@ -113,7 +113,7 @@ def test_retry_max_configuration():
 
     with MockServer(EchoResource) as server:
 
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             def to_item(self):
                 if retries.popleft():
                     raise Retry
@@ -146,7 +146,7 @@ def test_non_retry_exception():
 
     with MockServer(EchoResource) as server:
 
-        class ItemPage(ItemWebPage):
+        class ItemPage(WebPage):
             def to_item(self):
                 raise RuntimeError
diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py
index 6e5db371..660c075a 100644
--- a/tests/test_scrapy_dependencies.py
+++ b/tests/test_scrapy_dependencies.py
@@ -3,7 +3,7 @@
 from pytest_twisted import inlineCallbacks
 from scrapy import Spider
 from scrapy.http import Request
-from web_poet.pages import ItemWebPage
+from web_poet.pages import WebPage
 
 from scrapy_poet.injection import SCRAPY_PROVIDED_CLASSES
 from scrapy_poet.page_input_providers import (
@@ -30,7 +30,7 @@ class ProductHtml(HtmlResource):
 
 @inlineCallbacks
 @pytest.mark.parametrize("scrapy_class", SCRAPY_PROVIDED_CLASSES)
-def test_scrapy_dependencies_on_providers(scrapy_class, settings):
+def test_scrapy_dependencies_on_providers(scrapy_class, settings) -> None:
     """Scrapy dependencies should be injected into Providers."""
 
     @attr.s(auto_attribs=True)
@@ -41,11 +41,11 @@ class PageDataProvider(PageObjectInputProvider):
 
         provided_classes = {PageData}
 
-        def __call__(self, to_provide, obj: scrapy_class):
+        def __call__(self, to_provide, obj: scrapy_class):  # type: ignore[valid-type]
             return [PageData(scrapy_class=scrapy_class.__name__)]
 
 
     @attr.s(auto_attribs=True)
-    class Page(ItemWebPage):
+    class Page(WebPage):
 
         page_data: PageData
@@ -77,13 +77,13 @@ def parse(self, response, page: Page):
 
 @inlineCallbacks
 @pytest.mark.parametrize("scrapy_class", SCRAPY_PROVIDED_CLASSES)
-def test_scrapy_dependencies_on_page_objects(scrapy_class, settings):
+def test_scrapy_dependencies_on_page_objects(scrapy_class, settings) -> None:
     """Scrapy dependencies should not be injected into Page Objects."""
 
     @attr.s(auto_attribs=True)
-    class Page(ItemWebPage):
+    class Page(WebPage):
 
-        scrapy_obj: scrapy_class
+        scrapy_obj: scrapy_class  # type: ignore[valid-type]
 
         def to_item(self):
             return {
diff --git a/tox.ini b/tox.ini
index bb992243..a798b740 100644
--- a/tox.ini
+++ b/tox.ini
@@ -27,7 +27,7 @@ deps =
     scrapy==2.6.0
     sqlitedict==1.5.0
     url-matcher==0.2.0
-    web-poet==0.4.0
+    web-poet==0.6.0
 
 [testenv:asyncio-min]
 basepython = python3.7
@@ -38,7 +38,7 @@ deps =
 
 [testenv:mypy]
 deps =
-    mypy==0.790
+    mypy==0.991
 
 commands = mypy --ignore-missing-imports --no-warn-no-return scrapy_poet tests
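
For reference, a minimal sketch of how the migrated APIs in this patch fit together, assuming web-poet >= 0.6.0 and a Scrapy project with scrapy-poet's ``InjectionMiddleware`` enabled; the spider and page object names below are illustrative only, not part of the patch:

.. code-block:: python

    import scrapy
    from web_poet import WebPage, default_registry, handle_urls

    from scrapy_poet import callback_for


    class BookPage(WebPage):
        """Base page object. As of this patch, callback_for() accepts page
        objects without an implemented to_item() method."""


    # The instead_of= parameter replaces the deprecated overrides= parameter.
    @handle_urls("books.toscrape.com", instead_of=BookPage)
    class BTSBookPage(BookPage):
        def to_item(self):
            return {"url": self.url, "name": self.css("title::text").get()}


    class BooksSpider(scrapy.Spider):
        name = "books"
        custom_settings = {
            "DOWNLOADER_MIDDLEWARES": {"scrapy_poet.InjectionMiddleware": 543},
            # get_rules() supersedes get_overrides() and returns ApplyRule
            # objects instead of OverrideRules.
            "SCRAPY_POET_OVERRIDES": default_registry.get_rules(),
        }

        def start_requests(self):
            yield scrapy.Request(
                "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html",
                callback=self.parse_book,
            )

        # The registry rule above makes BTSBookPage fill in for BookPage
        # on matching books.toscrape.com URLs.
        parse_book = callback_for(BookPage)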