diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d97c446d..e62a0efd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,128 @@ Changelog ========= +TBR +--- + +* Added support for item classes which are used as dependencies in page objects + and spider callbacks. The following is now possible: + + .. code-block:: python + + import attrs + import scrapy + from web_poet import WebPage, handle_urls, field + from scrapy_poet import DummyResponse + + @attrs.define + class Image: + url: str + + @handle_urls("example.com") + class ProductImagePage(WebPage[Image]): + @field + def url(self) -> str: + return self.css("#product img ::attr(href)").get("") + + @attrs.define + class Product: + name: str + image: Image + + @handle_urls("example.com") + @attrs.define + class ProductPage(WebPage[Product]): + # ✨ NEW: Notice that the page object can ask for items as dependencies. + # An instance of ``Image`` is injected behind the scenes by calling the + # ``.to_item()`` method of ``ProductImagePage``. + image_item: Image + + @field + def name(self) -> str: + return self.css("h1.name ::text").get("") + + @field + def image(self) -> Image: + return self.image_item + + class MySpider(scrapy.Spider): + name = "myspider" + + def start_requests(self): + yield scrapy.Request( + "https://example.com/products/some-product", self.parse + ) + + # ✨ NEW: Notice that we're directly using the item here and not the + # page object. + def parse(self, response: DummyResponse, item: Product): + return item + + + In line with this, the following new features were made: + + * Added a new :class:`scrapy_poet.page_input_providers.ItemProvider` which + makes the usage above possible. + + * An item class is now supported by :func:`scrapy_poet.callback_for` + alongside the usual page objects. This means that it won't raise a + :class:`TypeError` anymore when not passing a subclass of + :class:`web_poet.pages.ItemPage`. + + * New exception: :class:`scrapy_poet.injection_errors.ProviderDependencyDeadlockError`. + This is raised when it's not possible to create the dependencies due to + a deadlock in their sub-dependencies, e.g. due to a circular dependency + between page objects. + +* Moved some of the utility functions from the test module into + ``scrapy_poet.utils.testing``. + +* Documentation improvements. + +* Deprecations: + + * The ``SCRAPY_POET_OVERRIDES`` setting has been replaced by + ``SCRAPY_POET_RULES``. + +* Backward incompatible changes: + + * Overriding the default registry used via ``SCRAPY_POET_OVERRIDES_REGISTRY`` + is not possible anymore. + + * The following type aliases have been removed: + + * ``scrapy_poet.overrides.RuleAsTuple`` + * ``scrapy_poet.overrides.RuleFromUser`` + + * The :class:`scrapy_poet.page_input_providers.PageObjectInputProvider` base + class has these changes: + + * It now accepts an instance of :class:`scrapy_poet.injection.Injector` + in its constructor instead of :class:`scrapy.crawler.Crawler`. Although + you can still access the :class:`scrapy.crawler.Crawler` via the + ``Injector.crawler`` attribute. + + * :meth:`scrapy_poet.page_input_providers.PageObjectInputProvider.is_provided` + is now an instance method instead of a class method. + + * The :class:`scrapy_poet.injection.Injector`'s attribute and constructor + parameter called ``overrides_registry`` is now simply called ``registry``. + + * The ``scrapy_poet.overrides`` module which contained ``OverridesRegistryBase`` + and ``OverridesRegistry`` has now been removed. 
Instead, scrapy-poet directly + uses :class:`web_poet.rules.RulesRegistry`. + + Everything should pretty much the same except for + :meth:`web_poet.rules.RulesRegistry.overrides_for` now accepts :class:`str`, + :class:`web_poet.page_inputs.http.RequestUrl`, or + :class:`web_poet.page_inputs.http.ResponseUrl` instead of + :class:`scrapy.http.Request`. + + * This also means that the registry doesn't accept tuples as rules anymore. + Only :class:`web_poet.rules.ApplyRule` instances are allowed. The same goes + for ``SCRAPY_POET_RULES`` (and the deprecated ``SCRAPY_POET_OVERRIDES``). + + 0.8.0 (2023-01-24) ------------------ diff --git a/docs/api_reference.rst b/docs/api_reference.rst index fbdefd91..ce58b44e 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -14,7 +14,7 @@ API Injection Middleware ==================== -.. automodule:: scrapy_poet.middleware +.. automodule:: scrapy_poet.downloadermiddlewares :members: Page Input Providers @@ -43,9 +43,3 @@ Injection errors .. automodule:: scrapy_poet.injection_errors :members: - -Overrides -========= - -.. automodule:: scrapy_poet.overrides - :members: diff --git a/docs/index.rst b/docs/index.rst index a7083d88..7e8563f3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -43,7 +43,7 @@ To get started, see :ref:`intro-install` and :ref:`intro-tutorial`. :caption: Advanced :maxdepth: 1 - overrides + rules-from-web-poet providers testing diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst index 2324e9c8..fef4d164 100644 --- a/docs/intro/basic-tutorial.rst +++ b/docs/intro/basic-tutorial.rst @@ -414,17 +414,17 @@ The spider won't work anymore after the change. The reason is that it is using the new base Page Objects and they are empty. Let's fix it by instructing ``scrapy-poet`` to use the Books To Scrape (BTS) Page Objects for URLs belonging to the domain ``toscrape.com``. This must -be done by configuring ``SCRAPY_POET_OVERRIDES`` into ``settings.py``: +be done by configuring ``SCRAPY_POET_RULES`` into ``settings.py``: .. code-block:: python - "SCRAPY_POET_OVERRIDES": [ + "SCRAPY_POET_RULES": [ ("toscrape.com", BTSBookListPage, BookListPage), ("toscrape.com", BTSBookPage, BookPage) ] The spider is back to life! -``SCRAPY_POET_OVERRIDES`` contain rules that overrides the Page Objects +``SCRAPY_POET_RULES`` contain rules that overrides the Page Objects used for a particular domain. In this particular case, Page Objects ``BTSBookListPage`` and ``BTSBookPage`` will be used instead of ``BookListPage`` and ``BookPage`` for any request whose domain is @@ -465,16 +465,18 @@ to implement new ones: The last step is configuring the overrides so that these new Page Objects are used for the domain -``bookpage.com``. This is how ``SCRAPY_POET_OVERRIDES`` should look like into +``bookpage.com``. This is how ``SCRAPY_POET_RULES`` should look like into ``settings.py``: .. code-block:: python - "SCRAPY_POET_OVERRIDES": [ - ("toscrape.com", BTSBookListPage, BookListPage), - ("toscrape.com", BTSBookPage, BookPage), - ("bookpage.com", BPBookListPage, BookListPage), - ("bookpage.com", BPBookPage, BookPage) + from web_poet import ApplyRule + + "SCRAPY_POET_RULES": [ + ApplyRule("toscrape.com", use=BTSBookListPage, instead_of=BookListPage), + ApplyRule("toscrape.com", use=BTSBookPage, instead_of=BookPage), + ApplyRule("bookpage.com", use=BPBookListPage, instead_of=BookListPage), + ApplyRule("bookpage.com", use=BPBookPage, instead_of=BookPage) ] The spider is now ready to extract books from both sites 😀. 
@@ -490,27 +492,6 @@ for a particular domain, but more complex URL patterns are also possible. For example, the pattern ``books.toscrape.com/cataloge/category/`` is accepted and it would restrict the override only to category pages. -It is even possible to configure more complex patterns by using the -:py:class:`web_poet.rules.ApplyRule` class instead of a triplet in -the configuration. Another way of declaring the earlier config -for ``SCRAPY_POET_OVERRIDES`` would be the following: - -.. code-block:: python - - from url_matcher import Patterns - from web_poet import ApplyRule - - - SCRAPY_POET_OVERRIDES = [ - ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage), - ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage), - ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage), - ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage), - ] - -As you can see, this could get verbose. The earlier tuple config simply offers -a shortcut to be more concise. - .. note:: Also see the `url-matcher `_ @@ -530,11 +511,11 @@ and store the :py:class:`web_poet.rules.ApplyRule` for you. All of the # rules from other packages. Otherwise, it can be omitted. # More info about this caveat on web-poet docs. consume_modules("external_package_A", "another_ext_package.lib") - SCRAPY_POET_OVERRIDES = default_registry.get_rules() + SCRAPY_POET_RULES = default_registry.get_rules() For more info on this, you can refer to these docs: - * ``scrapy-poet``'s :ref:`overrides` Tutorial section. + * ``scrapy-poet``'s :ref:`rules-from-web-poet` Tutorial section. * External `web-poet`_ docs. * Specifically, the :external:ref:`rules-intro` Tutorial section. @@ -545,7 +526,8 @@ Next steps Now that you know how ``scrapy-poet`` is supposed to work, what about trying to apply it to an existing or new Scrapy project? -Also, please check the :ref:`overrides` and :ref:`providers` sections as well as -refer to spiders in the "example" folder: https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders +Also, please check the :ref:`rules-from-web-poet` and :ref:`providers` sections +as well as refer to spiders in the "example" folder: +https://github.com/scrapinghub/scrapy-poet/tree/master/example/example/spiders .. _Scrapy Tutorial: https://docs.scrapy.org/en/latest/intro/tutorial.html diff --git a/docs/overrides.rst b/docs/rules-from-web-poet.rst similarity index 59% rename from docs/overrides.rst rename to docs/rules-from-web-poet.rst index f9f759db..e9fcd8ab 100644 --- a/docs/overrides.rst +++ b/docs/rules-from-web-poet.rst @@ -1,6 +1,24 @@ -.. _overrides: +.. _rules-from-web-poet: + +=================== +Rules from web-poet +=================== + +scrapy-poet fully supports the functionalities of :class:`web_poet.rules.ApplyRule`. +It uses the registry from web_poet called :class:`web_poet.rules.RulesRegistry` +which provides functionalties for: + + * Returning the page object override if it exists for a given URL. + * Returning the page object capable of producing an item for a given URL. + +A list of :class:`web_poet.rules.ApplyRule` can be configured by passing it +to the ``SCRAPY_POET_RULES`` setting. + +In this section, we go over its ``instead_of`` parameter for overrides and +``to_return`` for item returns. However, please make sure you also read web-poet's +:ref:`rules-intro` tutorial to see all of the expected behaviors of the rules. 
+ -========= Overrides ========= This functionality opens the door to configure specific Page Objects depending @@ -13,15 +31,16 @@ page. Some real-world examples on this topic can be found in: - `Example 1 `_: - rules using tuples + shorter example - `Example 2 `_: - rules using tuples and :py:class:`web_poet.ApplyRule` + longer example - `Example 3 `_: rules using :py:func:`web_poet.handle_urls` decorator and retrieving them via :py:meth:`web_poet.rules.RulesRegistry.get_rules` + Page Objects refinement -======================= +----------------------- Any ``Injectable`` or page input can be overridden. But the overriding mechanism stops for the children of any already overridden type. This opens @@ -59,8 +78,8 @@ And then override it for a particular domain using ``settings.py``: .. code-block:: python - SCRAPY_POET_OVERRIDES = [ - ("example.com", ISBNBookPage, BookPage) + SCRAPY_POET_RULES = [ + ApplyRule("example.com", use=ISBNBookPage, instead_of=BookPage) ] This new Page Object gets the original ``BookPage`` as dependency and enrich @@ -91,20 +110,17 @@ the obtained item with the ISBN from the page HTML. Overrides rules -=============== +--------------- -The default way of configuring the override rules is using triplets -of the form (``url pattern``, ``override_type``, ``overridden_type``). But more -complex rules can be introduced if the class :py:class:`web_poet.ApplyRule` -is used. The following example configures an override that is only applied for -book pages from ``books.toscrape.com``: +The following example configures an override that is only applied for book pages +from ``books.toscrape.com``: .. code-block:: python from web_poet import ApplyRule - SCRAPY_POET_OVERRIDES = [ + SCRAPY_POET_RULES = [ ApplyRule( for_patterns=Patterns( include=["books.toscrape.com/cataloge/*index.html|"], @@ -121,7 +137,7 @@ documentation. Decorate Page Objects with the rules -==================================== +------------------------------------ Having the rules along with the Page Objects is a good idea, as you can identify with a single sight what the Page Object is doing @@ -169,14 +185,14 @@ For example: consume_modules("external_package_A", "another_ext_package.lib") # To get all of the Override Rules that were declared via annotations. - SCRAPY_POET_OVERRIDES = default_registry.get_rules() + SCRAPY_POET_RULES = default_registry.get_rules() The :py:meth:`web_poet.rules.RulesRegistry.get_rules` method of the ``default_registry`` above returns ``List[ApplyRule]`` that were declared using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much more convenient that manually defining all of the :py:class:`web_poet.ApplyRule`. -Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as +Take note that since ``SCRAPY_POET_RULES`` is structured as ``List[ApplyRule]``, you can easily modify it later on if needed. .. note:: @@ -187,21 +203,91 @@ Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as section. -Overrides registry -================== - -The overrides registry is responsible for informing whether there exists an -override for a particular type for a given request. The default overrides -registry allows to configure these rules using patterns that follow the -`url-matcher `_ syntax. These rules can be configured using the -``SCRAPY_POET_OVERRIDES`` setting, as it has been seen in the :ref:`intro-tutorial` -example. - -But the registry implementation can be changed at convenience. 
A different -registry implementation can be configured using the property -``SCRAPY_POET_OVERRIDES_REGISTRY`` in ``settings.py``. The new registry -must be a subclass of :class:`scrapy_poet.overrides.OverridesRegistryBase` and -must implement the method :meth:`scrapy_poet.overrides.OverridesRegistryBase.overrides_for`. -As other Scrapy components, it can be initialized from the ``from_crawler`` class -method if implemented. This might be handy to be able to access settings, stats, -request meta, etc. +Item Returns +============ + +scrapy-poet also supports a convenient way of asking for items directly. This +is made possible by the ``to_return`` parameter of :class:`web_poet.rules.ApplyRule`. +The ``to_return`` parameter specifies which item a page object is capable of +returning for a given URL. + +Let's check out an example: + +.. code-block:: python + + import attrs + import scrapy + from web_poet import WebPage, handle_urls, field + from scrapy_poet import DummyResponse + + + @attrs.define + class Product: + name: str + + + @handle_urls("example.com") + @attrs.define + class ProductPage(WebPage[Product]): + + @field + def name(self) -> str: + return self.css("h1.name ::text").get("") + + + class MySpider(scrapy.Spider): + name = "myspider" + + def start_requests(self): + yield scrapy.Request( + "https://example.com/products/some-product", self.parse + ) + + # We can directly ask for the item here instead of the page object. + def parse(self, response: DummyResponse, item: Product): + return item + +From this example, we can see that: + + * Spider callbacks can directly ask for items as dependencies. + * The ``Product`` item instance directly comes from ``ProductPage``. + * This is made possible by the ``ApplyRule("example.com", use=ProductPage, + to_return=Product)`` instance created from the ``@handle_urls`` decorator + on ``ProductPage``. + +.. note:: + + The slightly longer alternative way to do this is by declaring the page + object itself as the dependency and then calling its ``.to_item()`` method. + For example: + + .. code-block:: python + + @handle_urls("example.com") + @attrs.define + class ProductPage(WebPage[Product]): + product_image_page: ProductImagePage + + @field + def name(self) -> str: + return self.css("h1.name ::text").get("") + + @field + async def image(self) -> Image: + return await self.product_image_page.to_item() + + + class MySpider(scrapy.Spider): + name = "myspider" + + def start_requests(self): + yield scrapy.Request( + "https://example.com/products/some-product", self.parse + ) + + async def parse(self, response: DummyResponse, product_page: ProductPage): + return await product_page.to_item() + +For more information about all the expected behavior for the ``to_return`` +parameter in :class:`web_poet.rules.ApplyRule`, check out web-poet's tutorial +regarding :ref:`rules-item-class-example`. diff --git a/docs/settings.rst b/docs/settings.rst index c7993d52..67e1bdf9 100644 --- a/docs/settings.rst +++ b/docs/settings.rst @@ -21,26 +21,17 @@ More info on this at this section: :ref:`providers`. SCRAPY_POET_OVERRIDES --------------------- -Default: ``None`` - -Mapping of overrides for each domain. The format of the such ``dict`` mapping -depends on the currently set Registry. The default is currently -:class:`~.OverridesRegistry`. This can be overriden by the setting below: -``SCRAPY_POET_OVERRIDES_REGISTRY``. - -There are sections dedicated for this at :ref:`intro-tutorial` and :ref:`overrides`. +Deprecated. Use ``SCRAPY_POET_RULES`` instead. 
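+
+Since tuple-based rules are no longer accepted (only
+:class:`web_poet.rules.ApplyRule` instances are), migrating typically means both
+renaming the setting and rewriting each rule. A rough sketch, using the
+illustrative ``ISBNBookPage``/``BookPage`` page objects from
+:ref:`rules-from-web-poet`:
+
+.. code-block:: python
+
+    # Before: deprecated setting with a tuple rule. Using it now emits a
+    # DeprecationWarning, and tuple rules are no longer supported.
+    SCRAPY_POET_OVERRIDES = [
+        ("example.com", ISBNBookPage, BookPage),
+    ]
+
+    # After: the same rule expressed as an ApplyRule under SCRAPY_POET_RULES.
+    from web_poet import ApplyRule
+
+    SCRAPY_POET_RULES = [
+        ApplyRule("example.com", use=ISBNBookPage, instead_of=BookPage),
+    ]
+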
+SCRAPY_POET_RULES +----------------- -SCRAPY_POET_OVERRIDES_REGISTRY ------------------------------- - -Defaut: ``None`` +Default: ``None`` -Sets an alternative Registry to replace the default :class:`~.OverridesRegistry`. -To use this, set a ``str`` which denotes the absolute object path of the new -Registry. +Accepts an ``Iterable[ApplyRule]`` which sets the rules to use. -More info at :ref:`overrides`. +There are sections dedicated for this at :ref:`intro-tutorial` and +:ref:`rules-from-web-poet`. SCRAPY_POET_CACHE diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py index 0f701286..c59b80d9 100644 --- a/example/example/spiders/books_04_overrides_01.py +++ b/example/example/spiders/books_04_overrides_01.py @@ -6,7 +6,7 @@ The default configured PO logic contains the logic for books.toscrape.com """ import scrapy -from web_poet import WebPage +from web_poet import ApplyRule, WebPage from scrapy_poet import callback_for @@ -49,9 +49,9 @@ class BooksSpider(scrapy.Spider): name = "books_04_overrides_01" # Configuring different page objects pages from the bookpage.com domain custom_settings = { - "SCRAPY_POET_OVERRIDES": [ - ("bookpage.com", BPBookListPage, BookListPage), - ("bookpage.com", BPBookPage, BookPage), + "SCRAPY_POET_RULES": [ + ApplyRule("bookpage.com", use=BPBookListPage, instead_of=BookListPage), + ApplyRule("bookpage.com", use=BPBookPage, instead_of=BookPage), ] } diff --git a/example/example/spiders/books_04_overrides_02.py b/example/example/spiders/books_04_overrides_02.py index d612511d..7bd4cc0b 100644 --- a/example/example/spiders/books_04_overrides_02.py +++ b/example/example/spiders/books_04_overrides_02.py @@ -7,7 +7,6 @@ at all is applied. """ import scrapy -from url_matcher import Patterns from web_poet import WebPage from web_poet.rules import ApplyRule @@ -61,20 +60,11 @@ class BooksSpider(scrapy.Spider): name = "books_04_overrides_02" # Configuring different page objects pages for different domains custom_settings = { - "SCRAPY_POET_OVERRIDES": [ - ("toscrape.com", BTSBookListPage, BookListPage), - ("toscrape.com", BTSBookPage, BookPage), - # We could also use the long-form version if we want to. 
- ApplyRule( - for_patterns=Patterns(["bookpage.com"]), - use=BPBookListPage, - instead_of=BookListPage, - ), - ApplyRule( - for_patterns=Patterns(["bookpage.com"]), - use=BPBookPage, - instead_of=BookPage, - ), + "SCRAPY_POET_RULES": [ + ApplyRule("toscrape.com", use=BTSBookListPage, instead_of=BookListPage), + ApplyRule("toscrape.com", use=BTSBookPage, instead_of=BookPage), + ApplyRule("bookpage.com", use=BPBookListPage, instead_of=BookListPage), + ApplyRule("bookpage.com", use=BPBookPage, instead_of=BookPage), ] } diff --git a/example/example/spiders/books_04_overrides_03.py b/example/example/spiders/books_04_overrides_03.py index 6ee6498f..1b3c671d 100644 --- a/example/example/spiders/books_04_overrides_03.py +++ b/example/example/spiders/books_04_overrides_03.py @@ -67,7 +67,7 @@ class BooksSpider(scrapy.Spider): name = "books_04_overrides_03" start_urls = ["http://books.toscrape.com/", "https://bookpage.com/reviews"] # Configuring different page objects pages for different domains - custom_settings = {"SCRAPY_POET_OVERRIDES": default_registry.get_rules()} + custom_settings = {"SCRAPY_POET_RULES": default_registry.get_rules()} def start_requests(self): for url in ["http://books.toscrape.com/", "https://bookpage.com/reviews"]: diff --git a/pyproject.toml b/pyproject.toml index 53a42459..530b2f7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ multi_line_output = 3 module = [ "tests.test_cache.*", "tests.test_downloader.*", + "tests.test_web_poet_rules.*", "tests.test_scrapy_dependencies.*", ] # Ignore this type of error since mypy expects an Iterable return diff --git a/scrapy_poet/api.py b/scrapy_poet/api.py index a4eb15a9..ca2e6db0 100644 --- a/scrapy_poet/api.py +++ b/scrapy_poet/api.py @@ -29,8 +29,9 @@ def __init__(self, url: str, request=Optional[Request]): super().__init__(url=url, request=request) -def callback_for(page_cls: Type[ItemPage]) -> Callable: - """Create a callback for an :class:`web_poet.pages.ItemPage` subclass. +def callback_for(page_or_item_cls: Type) -> Callable: + """Create a callback for an :class:`web_poet.pages.ItemPage` subclass or an + item class. The generated callback returns the output of the ``ItemPage.to_item()`` method, i.e. extracts a single item @@ -104,24 +105,28 @@ def parse(self, response): disk queues, because in this case Scrapy is able to serialize your request object. """ - if not issubclass(page_cls, ItemPage): - raise TypeError(f"{page_cls.__name__} should be a subclass of ItemPage.") - # When the callback is used as an instance method of the spider, it expects # to receive 'self' as its first argument. When used as a simple inline # function, it expects to receive a response as its first argument. # # To avoid a TypeError, we need to receive a list of unnamed arguments and # a dict of named arguments after our injectable. 
- def parse(*args, page: page_cls, **kwargs): # type: ignore - yield page.to_item() # type: ignore + if issubclass(page_or_item_cls, ItemPage): + + def parse(*args, page: page_or_item_cls, **kwargs): # type: ignore + yield page.to_item() # type: ignore + + async def async_parse(*args, page: page_or_item_cls, **kwargs): # type: ignore + yield await page.to_item() # type: ignore + + if iscoroutinefunction(page_or_item_cls.to_item): + setattr(async_parse, _CALLBACK_FOR_MARKER, True) + return async_parse - async def async_parse(*args, page: page_cls, **kwargs): # type: ignore - yield await page.to_item() # type: ignore + else: - if iscoroutinefunction(page_cls.to_item): - setattr(async_parse, _CALLBACK_FOR_MARKER, True) - return async_parse + def parse(*args, item: page_or_item_cls, **kwargs): # type:ignore + yield item setattr(parse, _CALLBACK_FOR_MARKER, True) return parse diff --git a/scrapy_poet/commands.py b/scrapy_poet/commands.py index fd69cc54..3c47fb03 100644 --- a/scrapy_poet/commands.py +++ b/scrapy_poet/commands.py @@ -47,7 +47,7 @@ def __init__(self, crawler: Crawler) -> None: self.injector = SavingInjector( crawler, default_providers=DEFAULT_PROVIDERS, - overrides_registry=self.overrides_registry, + registry=self.registry, ) diff --git a/scrapy_poet/downloadermiddlewares.py b/scrapy_poet/downloadermiddlewares.py index 2059df42..bf9d08c8 100644 --- a/scrapy_poet/downloadermiddlewares.py +++ b/scrapy_poet/downloadermiddlewares.py @@ -10,19 +10,20 @@ from scrapy import Spider, signals from scrapy.crawler import Crawler from scrapy.http import Request, Response -from scrapy.utils.misc import create_instance, load_object from twisted.internet.defer import Deferred, inlineCallbacks +from web_poet import RulesRegistry from .api import DummyResponse from .injection import Injector -from .overrides import OverridesRegistry from .page_input_providers import ( HttpClientProvider, HttpResponseProvider, + ItemProvider, PageParamsProvider, RequestUrlProvider, ResponseUrlProvider, ) +from .utils import create_registry_instance logger = logging.getLogger(__name__) @@ -33,6 +34,7 @@ PageParamsProvider: 700, RequestUrlProvider: 800, ResponseUrlProvider: 900, + ItemProvider: 1000, } InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware") @@ -48,15 +50,11 @@ class InjectionMiddleware: def __init__(self, crawler: Crawler) -> None: """Initialize the middleware""" self.crawler = crawler - settings = self.crawler.settings - registry_cls = load_object( - settings.get("SCRAPY_POET_OVERRIDES_REGISTRY", OverridesRegistry) - ) - self.overrides_registry = create_instance(registry_cls, settings, crawler) + self.registry = create_registry_instance(RulesRegistry, crawler) self.injector = Injector( crawler, default_providers=DEFAULT_PROVIDERS, - overrides_registry=self.overrides_registry, + registry=self.registry, ) @classmethod diff --git a/scrapy_poet/injection.py b/scrapy_poet/injection.py index 12f77087..62b544d9 100644 --- a/scrapy_poet/injection.py +++ b/scrapy_poet/injection.py @@ -13,8 +13,9 @@ from scrapy.statscollectors import StatsCollector from scrapy.utils.conf import build_component_list from scrapy.utils.defer import maybeDeferred_coro -from scrapy.utils.misc import create_instance, load_object +from scrapy.utils.misc import load_object from twisted.internet.defer import inlineCallbacks +from web_poet import RulesRegistry from web_poet.pages import is_injectable from scrapy_poet.api import _CALLBACK_FOR_MARKER, DummyResponse @@ -24,10 +25,9 @@ NonCallableProviderError, 
UndeclaredProvidedTypeError, ) -from scrapy_poet.overrides import OverridesRegistry, OverridesRegistryBase from scrapy_poet.page_input_providers import PageObjectInputProvider -from .utils import get_scrapy_data_path +from .utils import create_registry_instance, get_scrapy_data_path logger = logging.getLogger(__name__) @@ -43,11 +43,11 @@ def __init__( crawler: Crawler, *, default_providers: Optional[Mapping] = None, - overrides_registry: Optional[OverridesRegistryBase] = None, + registry: Optional[RulesRegistry] = None, ): self.crawler = crawler self.spider = crawler.spider - self.overrides_registry = overrides_registry or OverridesRegistry() + self.registry = registry or RulesRegistry() self.load_providers(default_providers) self.init_cache() @@ -58,7 +58,7 @@ def load_providers(self, default_providers: Optional[Mapping] = None): # noqa: } provider_classes = build_component_list(providers_dict) logger.info(f"Loading providers:\n {pprint.pformat(provider_classes)}") - self.providers = [load_object(cls)(self.crawler) for cls in provider_classes] + self.providers = [load_object(cls)(self) for cls in provider_classes] check_all_providers_are_callable(self.providers) # Caching whether each provider requires the scrapy response self.is_provider_requiring_scrapy_response = { @@ -139,7 +139,10 @@ def build_plan(self, request: Request) -> andi.Plan: callback, is_injectable=is_injectable, externally_provided=self.is_class_provided_by_any_provider, - overrides=self.overrides_registry.overrides_for(request).get, + # Ignore the type since andi.plan expects overrides to be + # Callable[[Callable], Optional[Callable]] but the registry + # returns the typing for ``dict.get()`` method. + overrides=self.registry.overrides_for(request.url).get, # type: ignore[arg-type] ) @inlineCallbacks @@ -378,7 +381,7 @@ def is_provider_requiring_scrapy_response(provider): def get_injector_for_testing( providers: Mapping, additional_settings: Optional[Dict] = None, - overrides_registry: Optional[OverridesRegistryBase] = None, + registry: Optional[RulesRegistry] = None, ) -> Injector: """ Return an :class:`Injector` using a fake crawler. @@ -396,9 +399,9 @@ class MySpider(Spider): spider = MySpider() spider.settings = settings crawler.spider = spider - if not overrides_registry: - overrides_registry = create_instance(OverridesRegistry, settings, crawler) - return Injector(crawler, overrides_registry=overrides_registry) + if not registry: + registry = create_registry_instance(RulesRegistry, crawler) + return Injector(crawler, registry=registry) def get_response_for_testing(callback: Callable) -> Response: diff --git a/scrapy_poet/injection_errors.py b/scrapy_poet/injection_errors.py index 53169e33..01e4e926 100644 --- a/scrapy_poet/injection_errors.py +++ b/scrapy_poet/injection_errors.py @@ -12,3 +12,15 @@ class UndeclaredProvidedTypeError(InjectionError): class MalformedProvidedClassesError(InjectionError): pass + + +class ProviderDependencyDeadlockError(InjectionError): + """This is raised when it's not possible to create the dependencies due to + deadlock. + + For example: + - Page object named "ChickenPage" require "EggPage" as a dependency. + - Page object named "EggPage" require "ChickenPage" as a dependency. 
+ """ + + pass diff --git a/scrapy_poet/overrides.py b/scrapy_poet/overrides.py deleted file mode 100644 index 6aca416e..00000000 --- a/scrapy_poet/overrides.py +++ /dev/null @@ -1,123 +0,0 @@ -import logging -from abc import ABC, abstractmethod -from collections import defaultdict -from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union - -from scrapy import Request -from scrapy.crawler import Crawler -from url_matcher import Patterns, URLMatcher -from web_poet import ItemPage -from web_poet.rules import ApplyRule - -logger = logging.getLogger(__name__) - -RuleAsTuple = Union[Tuple[str, Type[ItemPage], Type[ItemPage]], List] -RuleFromUser = Union[RuleAsTuple, ApplyRule] - - -class OverridesRegistryBase(ABC): - @abstractmethod - def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: - """ - Return a ``Mapping`` (e.g. a ``dict``) with type translation rules. - The key is the source type that is wanted to be replaced by - value, which is also a type. - """ - pass - - -class OverridesRegistry(OverridesRegistryBase): - """ - Overrides registry that reads the overrides from the ``SCRAPY_POET_OVERRIDES`` - in the spider settings. It is a list and each rule can be a tuple or an - instance of the class :py:class:`web_poet.rules.ApplyRule`. - - If a tuple is provided: - - - the **first** element is the pattern to match the URL, - - the **second** element is the type to be used instead of the type in - the **third** element. - - Another way to see it for the URLs that match the pattern ``tuple[0]`` use - ``tuple[1]`` instead of ``tuple[2]``. - - Example of overrides configuration: - - .. code-block:: python - - from url_matcher import Patterns - from web_poet.rules import ApplyRule - - - SCRAPY_POET_OVERRIDES = [ - # Option 1 - ("books.toscrape.com", ISBNBookPage, BookPage), - - # Option 2 - ApplyRule( - for_patterns=Patterns(["books.toscrape.com"]), - use=MyBookListPage, - instead_of=BookListPage, - ), - ] - - .. _web-poet: https://web-poet.readthedocs.io - - Now, if you've used web-poet_'s built-in functionality to directly create - the :py:class:`web_poet.rules.ApplyRule` in the Page Object via the - :py:func:`web_poet.handle_urls` annotation, you can quickly import them via - the following code below. It finds all the rules annotated using web-poet_'s - :py:func:`web_poet.handle_urls` as a decorator that were registered into - ``web_poet.default_registry`` (an instance of - :py:class:`web_poet.rules.RulesRegistry`). - - .. code-block:: python - - from web_poet import default_registry, consume_modules - - # The consume_modules() must be called first if you need to properly - # import rules from other packages. Otherwise, it can be omitted. - # More info about this caveat on web-poet docs. - consume_modules("external_package_A.po", "another_ext_package.lib") - SCRAPY_POET_OVERRIDES = default_registry.get_rules() - - Make sure to call :py:func:`web_poet.rules.consume_modules` beforehand. - More info on this at web-poet_. 
- """ - - @classmethod - def from_crawler(cls, crawler: Crawler) -> Crawler: - return cls(crawler.settings.getlist("SCRAPY_POET_OVERRIDES", [])) - - def __init__(self, rules: Optional[Iterable[RuleFromUser]] = None) -> None: - self.rules: List[ApplyRule] = [] - self.matcher: Dict[Type[ItemPage], URLMatcher] = defaultdict(URLMatcher) - for rule in rules or []: - self.add_rule(rule) - logger.debug(f"List of parsed ApplyRules:\n{self.rules}") - - def add_rule(self, rule: RuleFromUser) -> None: - if isinstance(rule, (tuple, list)): - if len(rule) != 3: - raise ValueError( - f"Invalid rule: {rule}. Rules as tuples must have " - f"3 elements: (1) the pattern, (2) the PO class used as a " - f"replacement and (3) the PO class to be replaced." - ) - pattern, use, instead_of = rule - rule = ApplyRule( - for_patterns=Patterns([pattern]), use=use, instead_of=instead_of - ) - self.rules.append(rule) - # FIXME: This key will change with the new rule.to_return - self.matcher[rule.instead_of].add_or_update( # type: ignore - len(self.rules) - 1, rule.for_patterns - ) - - def overrides_for(self, request: Request) -> Mapping[Callable, Callable]: - overrides: Dict[Callable, Callable] = {} - for instead_of, matcher in self.matcher.items(): - rule_id = matcher.match(request.url) - if rule_id is not None: - overrides[instead_of] = self.rules[rule_id].use - return overrides diff --git a/scrapy_poet/page_input_providers.py b/scrapy_poet/page_input_providers.py index 2d5ed883..eb003e0b 100644 --- a/scrapy_poet/page_input_providers.py +++ b/scrapy_poet/page_input_providers.py @@ -10,8 +10,24 @@ """ import abc import json -from typing import Any, Callable, ClassVar, Sequence, Set, Union +from dataclasses import make_dataclass +from inspect import isclass +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + Optional, + Sequence, + Set, + Type, + Union, +) +from warnings import warn +from weakref import WeakKeyDictionary +import andi import attr from scrapy import Request from scrapy.crawler import Crawler @@ -24,9 +40,13 @@ RequestUrl, ResponseUrl, ) +from web_poet.pages import is_injectable from scrapy_poet.downloader import create_scrapy_downloader -from scrapy_poet.injection_errors import MalformedProvidedClassesError +from scrapy_poet.injection_errors import ( + MalformedProvidedClassesError, + ProviderDependencyDeadlockError, +) class PageObjectInputProvider: @@ -38,7 +58,7 @@ class PageObjectInputProvider: be declared in the class attribute ``provided_classes``. POIPs are initialized when the spider starts by invoking the ``__init__`` method, - which receives the crawler instance as argument. + which receives the ``scrapy_poet.injection.Injector`` instance as argument. The ``__call__`` method must be overridden, and it is inside this method where the actual instances must be build. The default ``__call__`` signature @@ -93,28 +113,28 @@ def __call__(self, to_provide, response: Response): is provided by this provider. """ - provided_classes: ClassVar[Union[Set[Callable], Callable[[Callable], bool]]] + provided_classes: Union[Set[Callable], Callable[[Callable], bool]] name: ClassVar[str] = "" # It must be a unique name. 
Used by the cache mechanism - @classmethod - def is_provided(cls, type_: Callable): + def is_provided(self, type_: Callable) -> bool: """ Return ``True`` if the given type is provided by this provider based on the value of the attribute ``provided_classes`` """ - if isinstance(cls.provided_classes, Set): - return type_ in cls.provided_classes - elif callable(cls.provided_classes): - return cls.provided_classes(type_) + if isinstance(self.provided_classes, Set): + return type_ in self.provided_classes + elif callable(self.provided_classes): + return self.provided_classes(type_) else: raise MalformedProvidedClassesError( f"Unexpected type {type_!r} for 'provided_classes' attribute of" - f"{cls!r}. Expected either 'set' or 'callable'" + f"{self!r}. Expected either 'set' or 'callable'" ) - def __init__(self, crawler: Crawler): + # FIXME: Can't import the Injector as class annotation due to circular dep. + def __init__(self, injector): """Initializes the provider. Invoked only at spider start up.""" - pass + self.injector = injector # Remember that is expected for all children to implement the ``__call__`` # method. The simplest signature for it is: @@ -263,3 +283,81 @@ class ResponseUrlProvider(PageObjectInputProvider): def __call__(self, to_provide: Set[Callable], response: Response): """Builds a ``ResponseUrl`` instance using a Scrapy ``Response``.""" return [ResponseUrl(url=response.url)] + + +class ItemProvider(PageObjectInputProvider): + + name = "item" + + def __init__(self, injector): + super().__init__(injector) + self.registry = self.injector.registry + + # The key that's used here is the ``scrapy.Request`` instance to ensure + # that the cached instances under it are properly garbage collected + # after processing such request. + self._cached_instances = WeakKeyDictionary() + + def provided_classes(self, cls): + """If the item is in any of the ``to_return`` in the rules, then it can + be provided by using the corresponding page object in ``use``. + """ + return isclass(cls) and self.registry.search(to_return=cls) + + def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None: + if request not in self._cached_instances: + self._cached_instances[request] = {} + self._cached_instances[request].update(mapping) + + def get_from_cache(self, request: Request, cls: Callable) -> Optional[Any]: + return self._cached_instances.get(request, {}).get(cls) + + async def __call__( + self, + to_provide: Set[Callable], + request: Request, + response: Response, + ) -> List[Any]: + results = [] + for cls in to_provide: + item = self.get_from_cache(request, cls) + if item: + results.append(item) + continue + + page_object_cls = self.registry.page_cls_for_item(request.url, cls) + if not page_object_cls: + warn( + f"Can't find appropriate page object for {cls} item for " + f"url: '{request.url}'. Check the ApplyRules you're using." + ) + continue + + # https://github.com/scrapinghub/andi/issues/23#issuecomment-1331682180 + fake_call_signature = make_dataclass( + "FakeCallSignature", [("page_object", page_object_cls)] + ) + plan = andi.plan( + fake_call_signature, + is_injectable=is_injectable, + externally_provided=self.injector.is_class_provided_by_any_provider, + ) + + try: + po_instances = await self.injector.build_instances( + request, response, plan + ) + except RecursionError: + raise ProviderDependencyDeadlockError( + f"Deadlock detected! 
A loop has been detected to trying to " + f"resolve this plan: {plan}" + ) + + page_object = po_instances[page_object_cls] + item = await page_object.to_item() + + self.update_cache(request, po_instances) + self.update_cache(request, {type(item): item}) + + results.append(item) + return results diff --git a/scrapy_poet/utils.py b/scrapy_poet/utils/__init__.py similarity index 72% rename from scrapy_poet/utils.py rename to scrapy_poet/utils/__init__.py index 5b38c534..88a2617c 100644 --- a/scrapy_poet/utils.py +++ b/scrapy_poet/utils/__init__.py @@ -1,5 +1,8 @@ import os +from typing import Type +from warnings import warn +from scrapy.crawler import Crawler from scrapy.http import Request, Response from scrapy.utils.project import inside_project, project_data_dir from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders @@ -43,3 +46,17 @@ def scrapy_response_to_http_response(response: Response) -> HttpResponse: headers=HttpResponseHeaders.from_bytes_dict(response.headers), **kwargs, ) + + +def create_registry_instance(cls: Type, crawler: Crawler): + if "SCRAPY_POET_OVERRIDES" in crawler.settings: + msg = ( + "The SCRAPY_POET_OVERRIDES setting is deprecated. " + "Use SCRAPY_POET_RULES instead." + ) + warn(msg, DeprecationWarning, stacklevel=2) + rules = crawler.settings.getlist( + "SCRAPY_POET_RULES", + crawler.settings.getlist("SCRAPY_POET_OVERRIDES", []), + ) + return cls(rules=rules) diff --git a/tests/mockserver.py b/scrapy_poet/utils/mockserver.py similarity index 97% rename from tests/mockserver.py rename to scrapy_poet/utils/mockserver.py index ec037bd0..39e795d1 100644 --- a/tests/mockserver.py +++ b/scrapy_poet/utils/mockserver.py @@ -29,7 +29,7 @@ def __enter__(self): sys.executable, "-u", "-m", - "tests.mockserver", + "scrapy_poet.utils.mockserver", self.resource, "--port", str(self.port), diff --git a/tests/utils.py b/scrapy_poet/utils/testing.py similarity index 70% rename from tests/utils.py rename to scrapy_poet/utils/testing.py index 8d039890..cf2a34a5 100644 --- a/tests/utils.py +++ b/scrapy_poet/utils/testing.py @@ -1,16 +1,19 @@ +from inspect import isasyncgenfunction from typing import Dict from unittest import mock -from pytest_twisted import inlineCallbacks +from scrapy import signals from scrapy.crawler import Crawler from scrapy.exceptions import CloseSpider +from scrapy.settings import Settings from scrapy.utils.python import to_bytes from twisted.internet import reactor +from twisted.internet.defer import inlineCallbacks from twisted.internet.task import deferLater from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET -from tests.mockserver import MockServer +from scrapy_poet.utils.mockserver import MockServer class HtmlResource(Resource): @@ -91,16 +94,19 @@ def crawl_single_item( spider_cls, resource_cls, settings, spider_kwargs=None, port=None ): """Run a spider where a single item is expected. 
Use in combination with - ``capture_capture_exceptions`` and ``CollectorPipeline`` + ``capture_exceptions`` and ``CollectorPipeline`` """ items, url, crawler = yield crawl_items( spider_cls, resource_cls, settings, spider_kwargs=spider_kwargs, port=port ) - assert len(items) == 1 - resp = items[0] - if "exception" in resp: - raise resp["exception"] - return resp, url, crawler + try: + item = items[0] + except IndexError: + return None, url, crawler + + if isinstance(item, dict) and "exception" in item: + raise item["exception"] + return item, url, crawler def make_crawler(spider_cls, settings): @@ -124,14 +130,50 @@ def process_item(self, item, spider): return item +class InjectedDependenciesCollectorMiddleware: + @classmethod + def from_crawler(cls, crawler): + obj = cls() + crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened) + return obj + + def spider_opened(self, spider): + spider.collected_response_deps = [] + + def process_response(self, request, response, spider): + spider.collected_response_deps.append(request.cb_kwargs) + return response + + +def create_scrapy_settings(request): + """Default scrapy-poet settings""" + s = dict( + # collect scraped items to crawler.spider.collected_items + ITEM_PIPELINES={ + CollectorPipeline: 100, + }, + DOWNLOADER_MIDDLEWARES={ + # collect injected dependencies to crawler.spider.collected_response_deps + InjectedDependenciesCollectorMiddleware: 542, + "scrapy_poet.InjectionMiddleware": 543, + }, + ) + return Settings(s) + + def capture_exceptions(callback): """Wrapper for Scrapy callbacks that captures exceptions within the provided callback and yields it under `exception` property. Also spider is closed on the first exception.""" - def parse(*args, **kwargs): + async def parse(*args, **kwargs): try: - yield from callback(*args, **kwargs) + if isasyncgenfunction(callback): + async for x in callback(*args, **kwargs): + yield x + else: + for x in callback(*args, **kwargs): + yield x except Exception as e: yield {"exception": e} raise CloseSpider("Exception in callback detected") diff --git a/tests/conftest.py b/tests/conftest.py index 0d9d5504..f3259e53 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,17 +1,8 @@ import pytest -from scrapy.settings import Settings + +from scrapy_poet.utils.testing import create_scrapy_settings @pytest.fixture() def settings(request): - """Default scrapy-poet settings""" - s = dict( - # collect scraped items to .collected_items attribute - ITEM_PIPELINES={ - "tests.utils.CollectorPipeline": 100, - }, - DOWNLOADER_MIDDLEWARES={ - "scrapy_poet.InjectionMiddleware": 543, - }, - ) - return Settings(s) + return create_scrapy_settings(request) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py deleted file mode 100644 index 01baf584..00000000 --- a/tests/po_lib/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -""" -This package is just for overrides testing purposes. 
-""" -import socket - -from url_matcher.util import get_domain -from web_poet import WebPage, handle_urls - -from tests.mockserver import get_ephemeral_port - -# Need to define it here since it's always changing -DOMAIN = get_domain(socket.gethostbyname(socket.gethostname())) -PORT = get_ephemeral_port() - - -class POOverriden(WebPage): - def to_item(self): - return {"msg": "PO that will be replaced"} - - -@handle_urls(f"{DOMAIN}:{PORT}", instead_of=POOverriden) -class POIntegration(WebPage): - def to_item(self): - return {"msg": "PO replacement"} diff --git a/tests/test_cache.py b/tests/test_cache.py index 73a8e4fc..14fd8e4e 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -4,8 +4,8 @@ from scrapy import Request, Spider from web_poet import WebPage, field -from .mockserver import MockServer -from .utils import EchoResource, make_crawler +from scrapy_poet.utils.mockserver import MockServer +from scrapy_poet.utils.testing import EchoResource, make_crawler @inlineCallbacks diff --git a/tests/test_callback_for.py b/tests/test_callback_for.py index 752afe75..61f3112b 100644 --- a/tests/test_callback_for.py +++ b/tests/test_callback_for.py @@ -127,16 +127,3 @@ def test_inline_callback_async(): msg = f"Function {cb} is not an instance method in: {spider}" assert str(exc.value) == msg - - -def test_invalid_subclass(): - """Classes should inherit from ItemPage.""" - - class MyClass(object): - pass - - with pytest.raises(TypeError) as exc: - callback_for(MyClass) - - msg = "MyClass should be a subclass of ItemPage." - assert str(exc.value) == msg diff --git a/tests/test_downloader.py b/tests/test_downloader.py index 89a513bb..f349aa09 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -19,11 +19,11 @@ from scrapy_poet import DummyResponse from scrapy_poet.downloader import create_scrapy_downloader from scrapy_poet.utils import http_request_to_scrapy_request -from tests.utils import ( +from scrapy_poet.utils.mockserver import MockServer +from scrapy_poet.utils.testing import ( AsyncMock, DelayedResource, EchoResource, - MockServer, StatusResource, make_crawler, ) diff --git a/tests/test_injection.py b/tests/test_injection.py index 07ed895d..84fb4605 100644 --- a/tests/test_injection.py +++ b/tests/test_injection.py @@ -1,4 +1,3 @@ -import weakref from typing import Any, Callable, Sequence, Set import attr @@ -9,7 +8,7 @@ from scrapy.http import Response from url_matcher import Patterns from url_matcher.util import get_domain -from web_poet import Injectable, ItemPage +from web_poet import Injectable, ItemPage, RulesRegistry from web_poet.mixins import ResponseShortcutsMixin from web_poet.rules import ApplyRule @@ -30,7 +29,6 @@ NonCallableProviderError, UndeclaredProvidedTypeError, ) -from scrapy_poet.overrides import OverridesRegistry def get_provider(classes, content=None): @@ -116,12 +114,6 @@ def test_constructor(self): == provider.require_response ) - # Asserting that we are not leaking providers references - weak_ref = weakref.ref(injector.providers[0]) - assert weak_ref() - del injector - assert weak_ref() is None - def test_non_callable_provider_error(self): """Checks that a exception is raised when a provider is not callable""" @@ -323,14 +315,14 @@ def test_overrides(self, providers, override_should_happen): domain = "example.com" if override_should_happen else "other-example.com" # The request domain is example.com, so overrides shouldn't be applied # when we configure them for domain other-example.com - overrides = [ - (domain, PriceInDollarsPO, 
PricePO), + rules = [ + ApplyRule(Patterns([domain]), use=PriceInDollarsPO, instead_of=PricePO), ApplyRule( Patterns([domain]), use=OtherEurDollarRate, instead_of=EurDollarRate ), ] - registry = OverridesRegistry(overrides) - injector = get_injector_for_testing(providers, overrides_registry=registry) + registry = RulesRegistry(rules=rules) + injector = get_injector_for_testing(providers, registry=registry) def callback( response: DummyResponse, price_po: PricePO, rate_po: EurDollarRate diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 216b551d..e2d8c656 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -12,15 +12,18 @@ from scrapy.utils.test import get_crawler from twisted.internet.threads import deferToThread from url_matcher.util import get_domain -from web_poet import default_registry -from web_poet.page_inputs import HttpResponse, RequestUrl, ResponseUrl -from web_poet.pages import ItemPage, WebPage +from web_poet import ApplyRule, HttpResponse, ItemPage, RequestUrl, ResponseUrl, WebPage from scrapy_poet import DummyResponse, InjectionMiddleware, callback_for from scrapy_poet.cache import SqlitedictCache from scrapy_poet.page_input_providers import PageObjectInputProvider -from tests.mockserver import get_ephemeral_port -from tests.utils import HtmlResource, capture_exceptions, crawl_items, crawl_single_item +from scrapy_poet.utils.mockserver import get_ephemeral_port +from scrapy_poet.utils.testing import ( + HtmlResource, + capture_exceptions, + crawl_items, + crawl_single_item, +) class ProductHtml(HtmlResource): @@ -102,8 +105,12 @@ def test_overrides(settings): host = socket.gethostbyname(socket.gethostname()) domain = get_domain(host) port = get_ephemeral_port() - settings["SCRAPY_POET_OVERRIDES"] = [ - (f"{domain}:{port}", OverridenBreadcrumbsExtraction, BreadcrumbsExtraction) + settings["SCRAPY_POET_RULES"] = [ + ApplyRule( + f"{domain}:{port}", + use=OverridenBreadcrumbsExtraction, + instead_of=BreadcrumbsExtraction, + ) ] item, url, _ = yield crawl_single_item( spider_for(ProductPage), ProductHtml, settings, port=port @@ -117,6 +124,18 @@ def test_overrides(settings): } +def test_deprecation_setting_SCRAPY_POET_OVERRIDES(settings) -> None: + settings["SCRAPY_POET_OVERRIDES"] = [] + crawler = get_crawler(Spider, settings) + + msg = ( + "The SCRAPY_POET_OVERRIDES setting is deprecated. " + "Use SCRAPY_POET_RULES instead." + ) + with pytest.warns(DeprecationWarning, match=msg): + InjectionMiddleware(crawler) + + @attr.s(auto_attribs=True) class OptionalAndUnionPage(WebPage): breadcrumbs: BreadcrumbsExtraction @@ -222,12 +241,12 @@ def test_providers(settings, type_): @inlineCallbacks -def test_providers_returning_wrong_classes(settings): +def test_providers_returning_wrong_classes(settings, caplog): """Injection Middleware should raise a runtime error whenever a provider returns instances of classes that they're not supposed to provide. 
""" - with pytest.raises(AssertionError): - yield crawl_single_item(spider_for(ExtraClassData), ProductHtml, settings) + yield crawl_single_item(spider_for(ExtraClassData), ProductHtml, settings) + assert "UndeclaredProvidedTypeError:" in caplog.text class MultiArgsCallbackSpider(scrapy.Spider): @@ -468,29 +487,3 @@ def get_middleware(settings): mock.call("/tmp/cache", compressed=True), mock.call().close(), ] - - -@inlineCallbacks -def test_web_poet_integration(settings): - """This tests scrapy-poet's integration with web-poet most especially when - populating override settings via: - - from web_poet import default_registry - - SCRAPY_POET_OVERRIDES = default_registry.get_rules() - """ - - # Only import them in this test scope since they need to be synced with - # the URL of the Page Object annotated with @handle_urls. - from tests.po_lib import PORT, POOverriden - - # Override rules are defined in `tests/po_lib/__init__.py`. - rules = default_registry.get_rules() - - # Converting it to a set removes potential duplicate ApplyRules - settings["SCRAPY_POET_OVERRIDES"] = set(rules) - - item, url, _ = yield crawl_single_item( - spider_for(POOverriden), ProductHtml, settings, port=PORT - ) - assert item == {"msg": "PO replacement"} diff --git a/tests/test_providers.py b/tests/test_providers.py index 87db3fde..dcb84f80 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -6,20 +6,21 @@ import scrapy from pytest_twisted import ensureDeferred, inlineCallbacks from scrapy import Request, Spider -from scrapy.crawler import Crawler from scrapy.settings import Settings from scrapy.utils.test import get_crawler from twisted.python.failure import Failure from web_poet import HttpClient, HttpResponse from scrapy_poet import HttpResponseProvider +from scrapy_poet.injection import Injector from scrapy_poet.page_input_providers import ( CacheDataProviderMixin, HttpClientProvider, + ItemProvider, PageObjectInputProvider, PageParamsProvider, ) -from tests.utils import AsyncMock, HtmlResource, crawl_single_item +from scrapy_poet.utils.testing import AsyncMock, HtmlResource, crawl_single_item class ProductHtml(HtmlResource): @@ -64,9 +65,9 @@ class PriceHtmlDataProvider(PageObjectInputProvider, CacheDataProviderMixin): name = "price_html" provided_classes = {Price, Html} - def __init__(self, crawler: Crawler): - assert isinstance(crawler, Crawler) - super().__init__(crawler) + def __init__(self, injector: Injector): + assert isinstance(injector, Injector) + super().__init__(injector) def __call__( self, to_provide, response: scrapy.http.Response, spider: scrapy.Spider @@ -207,7 +208,8 @@ def test_price_first_spider(settings): def test_response_data_provider_fingerprint(settings): crawler = get_crawler(Spider, settings) - rdp = HttpResponseProvider(crawler) + injector = Injector(crawler) + rdp = HttpResponseProvider(injector) request = scrapy.http.Request("https://example.com") # The fingerprint should be readable since it's JSON-encoded. 
@@ -219,11 +221,12 @@ def test_response_data_provider_fingerprint(settings): async def test_http_client_provider(settings): crawler = get_crawler(Spider, settings) crawler.engine = AsyncMock() + injector = Injector(crawler) with mock.patch( "scrapy_poet.page_input_providers.create_scrapy_downloader" ) as mock_factory: - provider = HttpClientProvider(crawler) + provider = HttpClientProvider(injector) results = provider(set(), crawler) assert isinstance(results[0], HttpClient) @@ -232,7 +235,8 @@ async def test_http_client_provider(settings): def test_page_params_provider(settings): crawler = get_crawler(Spider, settings) - provider = PageParamsProvider(crawler) + injector = Injector(crawler) + provider = PageParamsProvider(injector) request = scrapy.http.Request("https://example.com") results = provider(set(), request) @@ -251,3 +255,31 @@ def test_page_params_provider(settings): results = provider(set(), request) assert results[0] == expected_data + + +def test_item_provider_cache(settings): + """Note that the bulk of the tests for the ``ItemProvider`` alongside the + ``Injector`` is tested in ``tests/test_web_poet_rules.py``. + + We'll only test its caching behavior here if its properly garbage collected. + """ + + crawler = get_crawler(Spider, settings) + injector = Injector(crawler) + provider = ItemProvider(injector) + + assert len(provider._cached_instances) == 0 + + def inside(): + request = Request("https://example.com") + provider.update_cache(request, {Name: Name("test")}) + assert len(provider._cached_instances) == 1 + + cached_instance = provider.get_from_cache(request, Name) + assert isinstance(cached_instance, Name) + + # The cache should be empty after the ``inside`` scope has finished which + # means that the corresponding ``request`` and the contents under it are + # garbage collected. + inside() + assert len(provider._cached_instances) == 0 diff --git a/tests/test_retries.py b/tests/test_retries.py index 71f5ecbb..380815a1 100644 --- a/tests/test_retries.py +++ b/tests/test_retries.py @@ -7,7 +7,8 @@ from web_poet.page_inputs.http import HttpResponse from web_poet.pages import WebPage -from tests.utils import EchoResource, MockServer, make_crawler +from scrapy_poet.utils.mockserver import MockServer +from scrapy_poet.utils.testing import EchoResource, make_crawler class BaseSpider(Spider): diff --git a/tests/test_scrapy_dependencies.py b/tests/test_scrapy_dependencies.py index 660c075a..3546d2f9 100644 --- a/tests/test_scrapy_dependencies.py +++ b/tests/test_scrapy_dependencies.py @@ -10,7 +10,7 @@ HttpResponseProvider, PageObjectInputProvider, ) -from tests.utils import HtmlResource, crawl_items, crawl_single_item +from scrapy_poet.utils.testing import HtmlResource, crawl_items, crawl_single_item class ProductHtml(HtmlResource): diff --git a/tests/test_web_poet_rules.py b/tests/test_web_poet_rules.py new file mode 100644 index 00000000..46588da2 --- /dev/null +++ b/tests/test_web_poet_rules.py @@ -0,0 +1,1531 @@ +"""This tests scrapy-poet's integration with web-poet's ``ApplyRule`` specifically +when used for callback dependencies. + +Most of the logic here tests the behavior of the ``scrapy_poet/injection.py`` +and ``scrapy_poet/registry.py`` modules. 
+""" + +import socket +import warnings +from collections import defaultdict +from typing import Any, Dict, List, Tuple, Type + +import attrs +import pytest +import scrapy +from pytest_twisted import inlineCallbacks +from url_matcher import Patterns +from url_matcher.util import get_domain +from web_poet import ( + ApplyRule, + Injectable, + ItemPage, + WebPage, + default_registry, + field, + handle_urls, + item_from_fields, +) +from web_poet.pages import ItemT + +from scrapy_poet import callback_for +from scrapy_poet.downloadermiddlewares import DEFAULT_PROVIDERS +from scrapy_poet.utils.mockserver import get_ephemeral_port +from scrapy_poet.utils.testing import ( + capture_exceptions, + crawl_single_item, + create_scrapy_settings, +) +from tests.test_middleware import ProductHtml + +DOMAIN = get_domain(socket.gethostbyname(socket.gethostname())) +PORT = get_ephemeral_port() +URL = f"{DOMAIN}:{PORT}" + + +def rules_settings() -> dict: + settings = create_scrapy_settings(None) + settings["SCRAPY_POET_RULES"] = default_registry.get_rules() + return settings + + +def spider_for(injectable: Type): + class InjectableSpider(scrapy.Spider): + + url = None + custom_settings = { + "SCRAPY_POET_PROVIDERS": DEFAULT_PROVIDERS, + } + + def start_requests(self): + yield scrapy.Request(self.url, capture_exceptions(callback_for(injectable))) + + return InjectableSpider + + +class PageObjectCounterMixin: + """Inherited by some POs in a few of the tests which has a deep dependency + tree. + + This is mostly used to ensure that class instances are not duplicated when + building out the dependency tree as they could get expensive. + + For example, a PO could have its ``.to_item()`` method called multiple times + to produce the same item. This is extremely wasteful should there be any + additional requests used to produce the item. + + ``ItemProvider`` should cache up such instances and prevent them from being + built again. + """ + + instances: Dict[Type, Any] = defaultdict(list) + to_item_call_count = 0 + + def __attrs_pre_init__(self): + self.instances[type(self)].append(self) + + @classmethod + def assert_instance_count(cls, count, type_): + assert len(cls.instances[type_]) == count, type_ + + @classmethod + def clear(cls): + for po_cls in cls.instances.keys(): + po_cls.to_item_call_count = 0 + cls.instances = defaultdict(list) + + async def to_item(self) -> ItemT: + type(self).to_item_call_count += 1 + return await super().to_item() + + +@inlineCallbacks +def crawl_item_and_deps(PageObject) -> Tuple[Any, Any]: + """Helper function to easily return the item and injected dependencies from + a simulated Scrapy callback which asks for either of these dependencies: + - page object + - item class + """ + item, _, crawler = yield crawl_single_item( + spider_for(PageObject), ProductHtml, rules_settings(), port=PORT + ) + return item, crawler.spider.collected_response_deps + + +def assert_deps(deps: List[Dict[str, Any]], expected: Dict[str, Any], size: int = 1): + """Helper for easily checking the instances of the ``deps`` returned by + ``crawl_item_and_deps()``. + + The ``deps`` and ``expected`` follow a dict-formatted **kwargs parameters + that is passed to the spider callback. Currently, either "page" or "item" + are supported as keys since ``scrapy_poet.callback`` is used. + """ + assert len(deps) == size + if size == 0: + return + + # Only checks the first element for now since it's used alongside crawling + # a single item. 
+ assert not deps[0].keys() - expected.keys() + assert all([True for k, v in expected.items() if isinstance(deps[0][k], v)]) + + +@handle_urls(URL) +class UrlMatchPage(ItemPage): + async def to_item(self) -> dict: + return {"msg": "PO URL Match"} + + +@inlineCallbacks +def test_url_only_match() -> None: + """Page Objects which only have URL in its ``@handle_urls`` annotation should + work. + """ + item, deps = yield crawl_item_and_deps(UrlMatchPage) + assert item == {"msg": "PO URL Match"} + assert_deps(deps, {"page": UrlMatchPage}) + + +@handle_urls("example.com") +class UrlNoMatchPage(ItemPage): + async def to_item(self) -> dict: + return {"msg": "PO No URL Match"} + + +@inlineCallbacks +def test_url_only_no_match() -> None: + """Same case as with ``test_url_only_match()`` but the URL specified in the + ``@handle_urls`` annotation doesn't match the request/response URL that the + spider is crawling. + + However, it should still work since we're forcing to use ``UrlNoMatchPage`` + specifically as the page object input. + """ + item, deps = yield crawl_item_and_deps(UrlNoMatchPage) + assert item == {"msg": "PO No URL Match"} + assert_deps(deps, {"page": UrlNoMatchPage}) + + +class NoRulePage(ItemPage): + async def to_item(self) -> dict: + return {"msg": "NO Rule"} + + +class NoRuleWebPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "NO Rule Web"} + + +@inlineCallbacks +def test_no_rule_declaration() -> None: + """A more extreme case of ``test_url_only_no_match()`` where the page object + doesn't have any rule declaration at all. + + But it should still work since we're enforcing the dependency. + """ + item, deps = yield crawl_item_and_deps(NoRulePage) + assert item == {"msg": "NO Rule"} + assert_deps(deps, {"page": NoRulePage}) + + item, deps = yield crawl_item_and_deps(NoRuleWebPage) + assert item == {"msg": "NO Rule Web"} + assert_deps(deps, {"page": NoRuleWebPage}) + + +class OverriddenPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "PO that will be replaced"} + + +@handle_urls(URL, instead_of=OverriddenPage) +class ReplacementPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "PO replacement"} + + +@inlineCallbacks +def test_basic_overrides() -> None: + """Basic overrides use case. + + If a page object is asked for and it's available in a rule's ``instead_of`` + parameter, it would be replaced by the page object inside the rule's ``use`` + parameter. + """ + item, deps = yield crawl_item_and_deps(OverriddenPage) + assert item == {"msg": "PO replacement"} + assert_deps(deps, {"page": ReplacementPage}) + + # Calling the replacement should also still work + item, deps = yield crawl_item_and_deps(ReplacementPage) + assert item == {"msg": "PO replacement"} + assert_deps(deps, {"page": ReplacementPage}) + + +class LeftPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "left page"} + + +@handle_urls(URL, instead_of=LeftPage) +class RightPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "right page"} + + +# We define it here since it errors out if made as a decorator +# since RightPage doesn't exist yet. +handle_urls(URL, instead_of=RightPage)(LeftPage) + + +@inlineCallbacks +def test_mutual_overrides() -> None: + """Two page objects that override each other should not present any problems. + + In practice, this isn't useful at all. + + TODO: We could present a warning to the user if this is detected. 
+ THOUGHTS: + Although I doubt this would be common in most code bases since this would + only be possible if we do `handle_urls(URL, instead_of=RightPage)(LeftPage)` + which is highly unnatural. + + Another instance that it might occur is when users don't use `handle_urls()` + to write the rules but create a list of `ApplyRules` manually and passing + them to the `SCRAPY_POET_RULES` setting. I'm also not sure how common + this would be against simply using `@handle_urls()`. + + Let's hold off this potential warning mechanism until we observe that it + actually affects users. + """ + item, deps = yield crawl_item_and_deps(LeftPage) + assert item == {"msg": "right page"} + assert_deps(deps, {"page": RightPage}) + + item, deps = yield crawl_item_and_deps(RightPage) + assert item == {"msg": "left page"} + assert_deps(deps, {"page": LeftPage}) + + +@handle_urls(URL) +class NewHopePage(WebPage): + async def to_item(self) -> dict: + return {"msg": "new hope"} + + +@handle_urls(URL, instead_of=NewHopePage) +class EmpireStrikesBackPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "empire strikes back"} + + +@handle_urls(URL, instead_of=EmpireStrikesBackPage) +class ReturnOfTheJediPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "return of the jedi"} + + +@inlineCallbacks +def test_chained_overrides() -> None: + """If 3 overrides are connected to each other, there wouldn't be any + transitivity than spans the 3 POs. + """ + item, deps = yield crawl_item_and_deps(NewHopePage) + assert item == {"msg": "empire strikes back"} + assert_deps(deps, {"page": EmpireStrikesBackPage}) + + # Calling the other PO should still work + + item, deps = yield crawl_item_and_deps(EmpireStrikesBackPage) + assert item == {"msg": "return of the jedi"} + assert_deps(deps, {"page": ReturnOfTheJediPage}) + + item, deps = yield crawl_item_and_deps(ReturnOfTheJediPage) + assert item == {"msg": "return of the jedi"} + assert_deps(deps, {"page": ReturnOfTheJediPage}) + + +@handle_urls(URL) +class FirstPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "First page"} + + +@handle_urls(URL) +class SecondPage(WebPage): + async def to_item(self) -> dict: + return {"msg": "Second page"} + + +@handle_urls(URL, instead_of=FirstPage) +@handle_urls(URL, instead_of=SecondPage) +class MultipleRulePage(WebPage): + async def to_item(self) -> dict: + return {"msg": "multiple rule page"} + + +@inlineCallbacks +def test_multiple_rules_single_page_object() -> None: + """A single PO could be used by multiple other rules.""" + item, deps = yield crawl_item_and_deps(FirstPage) + assert item == {"msg": "multiple rule page"} + assert_deps(deps, {"page": MultipleRulePage}) + + item, deps = yield crawl_item_and_deps(SecondPage) + assert item == {"msg": "multiple rule page"} + assert_deps(deps, {"page": MultipleRulePage}) + + # Calling the replacement should also still work + item, deps = yield crawl_item_and_deps(MultipleRulePage) + assert item == {"msg": "multiple rule page"} + assert_deps(deps, {"page": MultipleRulePage}) + + +@attrs.define +class Product: + name: str + + +@handle_urls(URL) +class ProductPage(ItemPage[Product]): + @field + def name(self) -> str: + return "product name" + + +@inlineCallbacks +def test_basic_item_return() -> None: + """Basic item use case. + + If an item class is asked for and it's available in some rule's ``to_return`` + parameter, an item class's instance shall be produced by the page object + declared inside the rule's ``use`` parameter. 
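+
+    Here the registry contains ``ApplyRule(URL, use=ProductPage, to_return=Product)``,
+    so asking for ``Product`` should build ``ProductPage`` behind the scenes and
+    return its ``.to_item()`` result.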
+ """ + item, deps = yield crawl_item_and_deps(Product) + assert item == Product(name="product name") + assert_deps(deps, {"item": Product}) + + # calling the actual page object should also work + item, deps = yield crawl_item_and_deps(ProductPage) + assert item == Product(name="product name") + assert_deps(deps, {"page": ProductPage}) + + +@attrs.define +class ItemButNoPageObject: + name: str + + +@inlineCallbacks +def test_basic_item_but_no_page_object() -> None: + """When an item is requested as a dependency but there's no page object that's + assigned to it in any of the given ``ApplyRule``, it would result to an error + in the spider callback since + """ + expected_msg = r"parse\(\) missing 1 required keyword-only argument: 'item'" + with pytest.raises(TypeError, match=expected_msg): + yield crawl_item_and_deps(ItemButNoPageObject) + + +@attrs.define +class ItemWithPageObjectButForDifferentUrl: + name: str + + +@handle_urls(URL + "/some-wrong-path") +class DifferentUrlPage(ItemPage[ItemWithPageObjectButForDifferentUrl]): + @field + def name(self) -> str: + return "wrong url" + + +@inlineCallbacks +def test_basic_item_with_page_object_but_different_url() -> None: + """If an item has been requested and a page object can produce it, but the + URL pattern is different, the item won't be produced at all. + + For these cases, a warning should be issued since the user might have written + some incorrect URL Pattern for the ``ApplyRule``. + """ + msg = ( + "Can't find appropriate page object for item for url: " + ) + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(ItemWithPageObjectButForDifferentUrl) + assert any([True for w in caught_warnings if msg in str(w.message)]) + assert item is None + assert not deps + + +@attrs.define +class ProductFromParent: + name: str + + +@handle_urls(URL) +class ParentProductPage(ItemPage[ProductFromParent]): + @field + def name(self) -> str: + return "parent product name" + + +@handle_urls(URL) +class SubclassProductPage(ParentProductPage): + @field + def name(self) -> str: + return "subclass product name" + + +@inlineCallbacks +def test_item_return_subclass() -> None: + """A page object should properly derive the ``Return[ItemType]`` that it + inherited from its parent. + + In this test case, there's a clash for the ``url_matcher.Patterns`` since + they're exactly the same. This produces a warning message. For conflicts + like this, scrapy-poet follows the first ``ApplyRule`` it finds inside the + ``SCRAPY_POET_RULES`` setting. + + To remove this warning, the user should update the priority in + ``url_matcher.Patterns`` which is set in ``ApplyRule.for_patterns``. + """ + + # There should be a warning to the user about clashing rules. 
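+    # The warning message lists the conflicting rules so that their priorities
+    # can be adjusted.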
+ rules = [ + ApplyRule(URL, use=ParentProductPage, to_return=ProductFromParent), + ApplyRule(URL, use=SubclassProductPage, to_return=ProductFromParent), + ] + msg = f"Consider updating the priority of these rules: {rules}" + + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(ProductFromParent) + assert any([True for w in caught_warnings if msg in str(w.message)]) + + assert item == ProductFromParent(name="subclass product name") + assert_deps(deps, {"item": ProductFromParent}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(ParentProductPage) + assert item == ProductFromParent(name="parent product name") + assert_deps(deps, {"page": ParentProductPage}) + + item, deps = yield crawl_item_and_deps(SubclassProductPage) + assert item == ProductFromParent(name="subclass product name") + assert_deps(deps, {"page": SubclassProductPage}) + + +@attrs.define +class AItem: + name: str + + +@handle_urls(URL) +class IndependentA1Page(ItemPage[AItem]): + @field + def name(self) -> str: + return "independent A1" + + +@handle_urls(URL) +class IndependentA2Page(ItemPage[AItem]): + @field + def name(self) -> str: + return "independent A2" + + +@inlineCallbacks +def test_item_return_individually_defined() -> None: + """Same case with ``test_item_return_subclass()`` but the 'to_return' item + has not been derived due to subclassing but rather, two page objects were + defined independently that returns the same item. + + The latter rule that was defined should be used when the priorities are the + same. + """ + rules = [ + ApplyRule(URL, use=IndependentA1Page, to_return=AItem), + ApplyRule(URL, use=IndependentA2Page, to_return=AItem), + ] + msg = f"Consider updating the priority of these rules: {rules}" + + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(AItem) + assert any([True for w in caught_warnings if msg in str(w.message)]) + + assert item == AItem(name="independent A2") + assert_deps(deps, {"item": IndependentA2Page}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(IndependentA1Page) + assert item == AItem(name="independent A1") + assert_deps(deps, {"page": IndependentA1Page}) + + item, deps = yield crawl_item_and_deps(IndependentA2Page) + assert item == AItem(name="independent A2") + assert_deps(deps, {"page": IndependentA2Page}) + + +@attrs.define +class PriorityProductFromParent: + name: str + + +@handle_urls(URL, priority=600) +class PriorityParentProductPage(ItemPage[PriorityProductFromParent]): + @field + def name(self) -> str: + return "priority parent product name" + + +@handle_urls(URL) +class PrioritySubclassProductPage(PriorityParentProductPage): + @field + def name(self) -> str: + return "priority subclass product name" + + +@inlineCallbacks +def test_item_return_parent_priority() -> None: + """Same case as with ``test_item_return_subclass()`` but now the parent PO + uses a higher priority of 600 than the default 500. 
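+
+    Since the priorities now differ, no clashing-rules warning is expected and
+    the higher-priority parent page object should win.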
+ """ + rules = [ + ApplyRule( + URL, use=PriorityParentProductPage, to_return=PriorityProductFromParent + ), + ApplyRule( + URL, use=PrioritySubclassProductPage, to_return=PriorityProductFromParent + ), + ] + msg = f"Consider updating the priority of these rules: {rules}" + + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(PriorityProductFromParent) + assert not any([True for w in caught_warnings if msg in str(w.message)]) + + assert item == PriorityProductFromParent(name="priority parent product name") + assert_deps(deps, {"item": PriorityProductFromParent}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(PriorityParentProductPage) + assert item == PriorityProductFromParent(name="priority parent product name") + assert_deps(deps, {"page": PriorityParentProductPage}) + + item, deps = yield crawl_item_and_deps(PrioritySubclassProductPage) + assert item == PriorityProductFromParent(name="priority subclass product name") + assert_deps(deps, {"page": PriorityParentProductPage}) + + +@attrs.define +class BItem: + name: str + + +@handle_urls(URL, priority=600) +class IndependentB1Page(ItemPage[BItem]): + @field + def name(self) -> str: + return "independent B1" + + +@handle_urls(URL) +class IndependentB2Page(ItemPage[BItem]): + @field + def name(self) -> str: + return "independent B2" + + +@inlineCallbacks +def test_item_return_individually_defined_first_rule_higher_priority() -> None: + """Same case with ``test_item_return_parent_priority()`` but the 'to_return' + item has not been derived due to subclassing but rather, two page objects + were defined independently that returns the same item. + + The rule with the higher priority should be used regardless of the order it + was defined. + """ + rules = [ + ApplyRule(URL, use=IndependentB1Page, to_return=BItem), + ApplyRule(URL, use=IndependentB2Page, to_return=BItem), + ] + msg = f"Consider updating the priority of these rules: {rules}" + + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(BItem) + assert not any([True for w in caught_warnings if msg in str(w.message)]) + + assert item == BItem(name="independent B1") + assert_deps(deps, {"item": IndependentB1Page}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(IndependentB1Page) + assert item == BItem(name="independent B1") + assert_deps(deps, {"page": IndependentB1Page}) + + item, deps = yield crawl_item_and_deps(IndependentB2Page) + assert item == BItem(name="independent B2") + assert_deps(deps, {"page": IndependentB2Page}) + + +@attrs.define +class PriorityProductFromSubclass: + name: str + + +@handle_urls(URL) +class Priority2ParentProductPage(ItemPage[PriorityProductFromSubclass]): + @field + def name(self) -> str: + return "priority parent product name" + + +@handle_urls(URL, priority=600) +class Priority2SubclassProductPage(Priority2ParentProductPage): + @field + def name(self) -> str: + return "priority subclass product name" + + +@inlineCallbacks +def test_item_return_subclass_priority() -> None: + """Same case as with ``test_item_return_parent_priority()`` but now the + PO subclass uses a higher priority of 600 than the default 500. 
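+
+    Since the subclass rule now has the higher priority, it should win without
+    producing a clashing-rules warning.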
+ """ + rules = [ + ApplyRule( + URL, use=Priority2ParentProductPage, to_return=PriorityProductFromSubclass + ), + ApplyRule( + URL, use=Priority2SubclassProductPage, to_return=PriorityProductFromSubclass + ), + ] + msg = f"Consider updating the priority of these rules: {rules}" + + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(PriorityProductFromSubclass) + assert not any([True for w in caught_warnings if msg in str(w.message)]) + + assert item == PriorityProductFromSubclass(name="priority subclass product name") + assert_deps(deps, {"item": PriorityProductFromSubclass}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(Priority2ParentProductPage) + assert item == PriorityProductFromSubclass(name="priority parent product name") + assert_deps(deps, {"page": Priority2ParentProductPage}) + + item, deps = yield crawl_item_and_deps(Priority2SubclassProductPage) + assert item == PriorityProductFromSubclass(name="priority subclass product name") + assert_deps(deps, {"page": Priority2ParentProductPage}) + + +@attrs.define +class CItem: + name: str + + +@handle_urls(URL) +class IndependentC1Page(ItemPage[CItem]): + @field + def name(self) -> str: + return "independent C1" + + +@handle_urls(URL, priority=600) +class IndependentC2Page(ItemPage[CItem]): + @field + def name(self) -> str: + return "independent C2" + + +@inlineCallbacks +def test_item_return_individually_defined_second_rule_higher_priority() -> None: + """Same case with ``test_item_return_subclass_priority()`` but the 'to_return' + item has not been derived due to subclassing but rather, two page objects + were defined independently that returns the same item. + + The rule with the higher priority should be used regardless of the order it + was defined. + """ + rules = [ + ApplyRule(URL, use=IndependentC1Page, to_return=CItem), + ApplyRule(URL, use=IndependentC2Page, to_return=CItem), + ] + msg = f"Consider updating the priority of these rules: {rules}" + + with warnings.catch_warnings(record=True) as caught_warnings: + item, deps = yield crawl_item_and_deps(CItem) + assert not any([True for w in caught_warnings if msg in str(w.message)]) + + assert item == CItem(name="independent C2") + assert_deps(deps, {"item": IndependentC2Page}) + + item, deps = yield crawl_item_and_deps(IndependentC1Page) + assert item == CItem(name="independent C1") + assert_deps(deps, {"page": IndependentC1Page}) + + item, deps = yield crawl_item_and_deps(IndependentC2Page) + assert item == CItem(name="independent C2") + assert_deps(deps, {"page": IndependentC2Page}) + + +@attrs.define +class ReplacedProduct: + name: str + + +@handle_urls(URL, to_return=ReplacedProduct) +class ReplacedProductPage(ItemPage[Product]): + @field + def name(self) -> str: + return "replaced product name" + + +@pytest.mark.xfail( + reason="This currently causes an ``UndeclaredProvidedTypeError`` since the " + "ItemProvide has received a different type of item class from the page object." +) +@inlineCallbacks +def test_item_to_return_in_handle_urls() -> None: + """Even if ``@handle_urls`` could derive the value for the ``to_return`` + parameter when the class inherits from something like ``ItemPage[ItemType]``, + any value passed through its ``to_return`` parameter should take precedence. + + Note that that this produces some inconsistencies between the rule's item + class vs the class that is actually returned. 
Using the ``to_return`` + parameter in ``@handle_urls`` isn't recommended because of this. + """ + item, deps = yield crawl_item_and_deps(ReplacedProduct) + assert item == Product(name="replaced product name") + assert_deps(deps, {"page": ReplacedProductPage}) + + +@inlineCallbacks +def test_item_to_return_in_handle_urls_other() -> None: + """Remaining tests for ``test_item_to_return_in_handle_urls()`` which are + not expected to be xfail. + """ + # Requesting the underlying item class from the PO should still work. + item, deps = yield crawl_item_and_deps(Product) + assert item == Product(name="product name") + assert_deps(deps, {"item": Product}) + + # calling the actual page objects should still work + item, deps = yield crawl_item_and_deps(ReplacedProductPage) + assert item == Product(name="replaced product name") + assert_deps(deps, {"page": ReplacedProductPage}) + + +@attrs.define +class ParentReplacedProduct: + name: str + + +@attrs.define +class SubclassReplacedProduct: + name: str + + +@handle_urls(URL) +class ParentReplacedProductPage(ItemPage[ParentReplacedProduct]): + @field + def name(self) -> str: + return "parent replaced product name" + + +@handle_urls(URL, to_return=SubclassReplacedProduct) +class SubclassReplacedProductPage(ParentReplacedProductPage): + @field + def name(self) -> str: + return "subclass replaced product name" + + +@pytest.mark.xfail( + reason="This currently causes an ``UndeclaredProvidedTypeError`` since the " + "ItemProvide has received a different type of item class from the page object." +) +@inlineCallbacks +def test_item_to_return_in_handle_urls_subclass() -> None: + """Same case as with the ``test_item_to_return_in_handle_urls()`` case above + but the ``to_return`` is declared in the subclass. + """ + item, deps = yield crawl_item_and_deps(SubclassReplacedProduct) + assert item == ParentReplacedProduct(name="subclass replaced product name") + assert_deps(deps, {"page": SubclassReplacedProductPage}) + + +@inlineCallbacks +def test_item_to_return_in_handle_urls_subclass_others() -> None: + """Remaining tests for ``test_item_to_return_in_handle_urls_subclass()`` + which are not expected to be xfail. + """ + # Requesting the underlying item class from the parent PO should still work. + item, deps = yield crawl_item_and_deps(ParentReplacedProduct) + assert item == ParentReplacedProduct(name="parent replaced product name") + assert_deps(deps, {"item": ParentReplacedProduct}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(ParentReplacedProductPage) + assert item == ParentReplacedProduct(name="parent replaced product name") + assert_deps(deps, {"page": ParentReplacedProductPage}) + + item, deps = yield crawl_item_and_deps(SubclassReplacedProductPage) + assert item == ParentReplacedProduct(name="subclass replaced product name") + assert_deps(deps, {"page": SubclassReplacedProductPage}) + + +@attrs.define +class StandaloneProduct: + name: str + + +@handle_urls(URL, to_return=StandaloneProduct) +class StandaloneProductPage(ItemPage): + @field + def name(self) -> str: + return "standalone product name" + + +@pytest.mark.xfail( + reason="This currently causes an ``UndeclaredProvidedTypeError`` since the " + "ItemProvide has received a different type of item class from the page object." 
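+    # Here the page object produces a plain ``dict`` while the rule declares
+    # ``to_return=StandaloneProduct``.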
+)
+@inlineCallbacks
+def test_item_to_return_standalone() -> None:
+    """Same case as with ``test_item_to_return_in_handle_urls()`` above but the
+    page object doesn't inherit from something like ``ItemPage[ItemClass]``.
+    """
+    item, deps = yield crawl_item_and_deps(StandaloneProduct)
+    assert item == {"name": "standalone product name"}
+    assert_deps(deps, {"page": StandaloneProductPage})
+
+
+@inlineCallbacks
+def test_item_to_return_standalone_others() -> None:
+    """Remaining tests for ``test_item_to_return_standalone()`` which are not
+    expected to be xfail.
+    """
+
+    # calling the actual page object should still work
+    item, deps = yield crawl_item_and_deps(StandaloneProductPage)
+    assert item == {"name": "standalone product name"}
+    assert_deps(deps, {"page": StandaloneProductPage})
+
+
+@attrs.define
+class ProductFromInjectable:
+    name: str
+
+
+@handle_urls(URL, to_return=ProductFromInjectable)
+class ProductFromInjectablePage(Injectable):
+    @field
+    def name(self) -> str:
+        return "product from injectable"
+
+    async def to_item(self) -> ProductFromInjectable:
+        return await item_from_fields(self, item_cls=ProductFromInjectable)
+
+
+@inlineCallbacks
+def test_item_return_from_injectable() -> None:
+    """The case wherein a page object inherits directly from ``web_poet.Injectable``
+    should also work, provided that its ``to_item()`` method behaves like the one
+    in ``web_poet.ItemPage``.
+    """
+    item, deps = yield crawl_item_and_deps(ProductFromInjectable)
+    assert item == ProductFromInjectable(name="product from injectable")
+    assert_deps(deps, {"item": ProductFromInjectable})
+
+    # calling the actual page object should not work since it's not inheriting
+    # from ``web_poet.ItemPage``.
+    item, deps = yield crawl_item_and_deps(ProductFromInjectablePage)
+    assert item is None
+
+    # However, the page object should still be injected into the callback method.
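+    # No item is produced, but the instance still appears among the collected
+    # callback dependencies checked below.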
+ assert_deps(deps, {"item": ProductFromInjectablePage}) + + +@handle_urls(URL) +class PageObjectDependencyPage(ItemPage): + async def to_item(self) -> dict: + return {"name": "item dependency"} + + +@attrs.define +class MainProductA: + name: str + item_from_po_dependency: dict + + +class ReplacedProductPageObjectDepPage(ItemPage): + pass + + +@handle_urls(URL, instead_of=ReplacedProductPageObjectDepPage) +@attrs.define +class ProductWithPageObjectDepPage(ItemPage[MainProductA]): + injected_page: PageObjectDependencyPage + + @field + def name(self) -> str: + return "(with item dependency) product name" + + @field + async def item_from_po_dependency(self) -> dict: + return await self.injected_page.to_item() + + +@inlineCallbacks +def test_page_object_with_page_object_dependency() -> None: + # item from 'to_return' + item, deps = yield crawl_item_and_deps(MainProductA) + assert item == MainProductA( + name="(with item dependency) product name", + item_from_po_dependency={"name": "item dependency"}, + ) + assert_deps(deps, {"item": MainProductA}) + + # item from 'instead_of' + item, deps = yield crawl_item_and_deps(ReplacedProductPageObjectDepPage) + assert item == MainProductA( + name="(with item dependency) product name", + item_from_po_dependency={"name": "item dependency"}, + ) + assert_deps(deps, {"page": ProductWithPageObjectDepPage}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(PageObjectDependencyPage) + assert item == {"name": "item dependency"} + assert_deps(deps, {"page": PageObjectDependencyPage}) + + item, deps = yield crawl_item_and_deps(ProductWithPageObjectDepPage) + assert item == MainProductA( + name="(with item dependency) product name", + item_from_po_dependency={"name": "item dependency"}, + ) + assert_deps(deps, {"page": ProductWithPageObjectDepPage}) + + +@attrs.define +class ItemDependency: + name: str + + +@handle_urls(URL) +@attrs.define +class ItemDependencyPage(PageObjectCounterMixin, ItemPage[ItemDependency]): + @field + def name(self) -> str: + return "item dependency" + + +@attrs.define +class MainProductB: + name: str + item_dependency: ItemDependency + + +class ReplacedProductItemDepPage(ItemPage): + pass + + +@handle_urls(URL, instead_of=ReplacedProductItemDepPage) +@attrs.define +class ProductWithItemDepPage(PageObjectCounterMixin, ItemPage[MainProductB]): + injected_item: ItemDependency + + @field + def name(self) -> str: + return "(with item dependency) product name" + + @field + def item_dependency(self) -> ItemDependency: + return self.injected_item + + +@inlineCallbacks +def test_page_object_with_item_dependency() -> None: + """Page objects with dependencies like item classes should have them resolved + by the page object assigned in one of the rules' ``use`` parameter. 
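+
+    Here, ``ProductWithItemDepPage`` declares an ``ItemDependency`` attribute
+    which should be built behind the scenes by ``ItemDependencyPage``.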
+ """ + + # item from 'to_return' + item, deps = yield crawl_item_and_deps(MainProductB) + assert item == MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ) + assert_deps(deps, {"item": MainProductB}) + + # item from 'instead_of' + item, deps = yield crawl_item_and_deps(ReplacedProductItemDepPage) + assert item == MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ) + assert_deps(deps, {"page": ProductWithItemDepPage}) + + # calling the actual page objects should still work + + item, deps = yield crawl_item_and_deps(ItemDependencyPage) + assert item == ItemDependency(name="item dependency") + assert_deps(deps, {"page": ItemDependencyPage}) + + item, deps = yield crawl_item_and_deps(ProductWithItemDepPage) + assert item == MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ) + assert_deps(deps, {"page": ProductWithItemDepPage}) + + # Calling the original dependency should still work + item, deps = yield crawl_item_and_deps(ItemDependency) + assert item == ItemDependency(name="item dependency") + assert_deps(deps, {"item": ItemDependency}) + + +@attrs.define +class MainProductC: + name: str + item_dependency: ItemDependency + main_product_b_dependency: MainProductB + + +@handle_urls(URL) +@attrs.define +class ProductDeepDependencyPage(PageObjectCounterMixin, ItemPage[MainProductC]): + injected_item: ItemDependency + main_product_b_dependency_item: MainProductB + + @field + def name(self) -> str: + return "(with deep item dependency) product name" + + @field + def item_dependency(self) -> ItemDependency: + return self.injected_item + + @field + def main_product_b_dependency(self) -> MainProductB: + return self.main_product_b_dependency_item + + +@inlineCallbacks +def test_page_object_with_deep_item_dependency() -> None: + """This builds upon the earlier ``test_page_object_with_item_dependency()`` + but with another layer of item dependencies. 
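+
+    The instance and ``to_item()`` call counts below check that each page object
+    in the dependency tree is built only once per crawl.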
+ """ + + # item from 'to_return' + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(MainProductC) + assert item == MainProductC( + name="(with deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + ) + assert_deps(deps, {"item": MainProductC}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(1, ProductDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 1 + assert ProductDeepDependencyPage.to_item_call_count == 1 + + # calling the actual page objects should still work + + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(ProductDeepDependencyPage) + assert item == MainProductC( + name="(with deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + ) + assert_deps(deps, {"page": ProductDeepDependencyPage}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(1, ProductDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 1 + assert ProductDeepDependencyPage.to_item_call_count == 1 + + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(ItemDependencyPage) + assert item == ItemDependency(name="item dependency") + assert_deps(deps, {"page": ItemDependencyPage}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(0, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(0, ProductDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 0 + assert ProductDeepDependencyPage.to_item_call_count == 0 + + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(ProductWithItemDepPage) + assert item == MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ) + assert_deps(deps, {"page": ProductWithItemDepPage}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(0, ProductDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 1 + assert ProductDeepDependencyPage.to_item_call_count == 0 + + # Calling the other item dependencies should still work + + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(MainProductB) + assert item == MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ) + assert_deps(deps, {"item": MainProductB}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(0, 
ProductDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 1 + assert ProductDeepDependencyPage.to_item_call_count == 0 + + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(ItemDependency) + assert item == ItemDependency(name="item dependency") + assert_deps(deps, {"item": ItemDependency}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(0, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(0, ProductDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 0 + assert ProductDeepDependencyPage.to_item_call_count == 0 + + +@attrs.define +class MainProductD: + name: str + item_dependency: ItemDependency + main_product_b_dependency: MainProductB + first_main_product_c_dependency: MainProductC + second_main_product_c_dependency: MainProductC + + +@handle_urls(URL) +@attrs.define +class ProductDuplicateDeepDependencyPage( + PageObjectCounterMixin, ItemPage[MainProductD] +): + injected_item: ItemDependency + main_product_b_dependency_item: MainProductB + first_main_product_c_dependency_item: MainProductC + second_main_product_c_dependency_item: MainProductC + + @field + def name(self) -> str: + return "(with duplicate deep item dependency) product name" + + @field + def item_dependency(self) -> ItemDependency: + return self.injected_item + + @field + def main_product_b_dependency(self) -> MainProductB: + return self.main_product_b_dependency_item + + @field + def first_main_product_c_dependency(self) -> MainProductC: + return self.first_main_product_c_dependency_item + + @field + def second_main_product_c_dependency(self) -> MainProductC: + return self.second_main_product_c_dependency_item + + +@inlineCallbacks +def test_page_object_with_duplicate_deep_item_dependency() -> None: + """This yet builds upon the earlier ``test_page_object_with_deep_item_dependency()`` + making it deeper. 
+ + However, this one has some duplicated dependencies + """ + + # item from 'to_return' + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(MainProductD) + assert item == MainProductD( + name="(with duplicate deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + first_main_product_c_dependency=MainProductC( + name="(with deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + ), + second_main_product_c_dependency=MainProductC( + name="(with deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + ), + ) + assert_deps(deps, {"item": MainProductD}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(1, ProductDeepDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductDuplicateDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 1 + assert ProductDeepDependencyPage.to_item_call_count == 1 + assert ProductDuplicateDeepDependencyPage.to_item_call_count == 1 + + # calling the actual page objects should still work + + PageObjectCounterMixin.clear() + item, deps = yield crawl_item_and_deps(ProductDuplicateDeepDependencyPage) + assert item == MainProductD( + name="(with duplicate deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + first_main_product_c_dependency=MainProductC( + name="(with deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + ), + second_main_product_c_dependency=MainProductC( + name="(with deep item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + main_product_b_dependency=MainProductB( + name="(with item dependency) product name", + item_dependency=ItemDependency(name="item dependency"), + ), + ), + ) + assert_deps(deps, {"page": ProductDuplicateDeepDependencyPage}) + PageObjectCounterMixin.assert_instance_count(1, ItemDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductWithItemDepPage) + PageObjectCounterMixin.assert_instance_count(1, ProductDeepDependencyPage) + PageObjectCounterMixin.assert_instance_count(1, ProductDuplicateDeepDependencyPage) + assert ItemDependencyPage.to_item_call_count == 1 + assert ProductWithItemDepPage.to_item_call_count == 1 + assert ProductDeepDependencyPage.to_item_call_count == 1 + assert ProductDuplicateDeepDependencyPage.to_item_call_count == 1 + + +@attrs.define +class ChickenItem: + name: str + other: str + + +@attrs.define +class EggItem: + name: str + other: str + + +@handle_urls(URL) 
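+# The next two page objects require each other's items, creating a circular
+# dependency that cannot be resolved.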
+@attrs.define +class ChickenDeadlockPage(ItemPage[ChickenItem]): + other_injected_item: EggItem + + @field + def name(self) -> str: + return "chicken" + + @field + def other(self) -> str: + return self.other_injected_item.name + + +@handle_urls(URL) +@attrs.define +class EggDeadlockPage(ItemPage[EggItem]): + other_injected_item: ChickenItem + + @field + def name(self) -> str: + return "egg" + + @field + def other(self) -> str: + return self.other_injected_item.name + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_a(caplog) -> None: + """Items with page objects which depend on each other resulting in a deadlock + should have a corresponding error raised. + """ + item, deps = yield crawl_item_and_deps(ChickenItem) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_b(caplog) -> None: + item, deps = yield crawl_item_and_deps(EggItem) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_c(caplog) -> None: + item, deps = yield crawl_item_and_deps(ChickenDeadlockPage) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_d(caplog) -> None: + item, deps = yield crawl_item_and_deps(EggDeadlockPage) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@attrs.define +class Chicken2Item: + name: str + other: str + + +@attrs.define +class Egg2Item: + name: str + other: str + + +@handle_urls(URL) +@attrs.define +class Chicken2DeadlockPage(ItemPage[Chicken2Item]): + other_injected_item: Egg2Item + + @field + def name(self) -> str: + return "chicken 2" + + @field + def other(self) -> str: + return self.other_injected_item.name + + +@handle_urls(URL) +@attrs.define +class Egg2DeadlockPage(ItemPage[Egg2Item]): + other_injected_page: Chicken2DeadlockPage + + @field + def name(self) -> str: + return "egg 2" + + @field + async def other(self) -> str: + item = await self.other_injected_page.to_item() + return item.name + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_2_a(caplog) -> None: + """Same with ``test_page_object_with_item_dependency_deadlock()`` but one + of the page objects requires a page object instead of an item. + """ + item, deps = yield crawl_item_and_deps(Chicken2Item) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_2_b(caplog) -> None: + item, deps = yield crawl_item_and_deps(Egg2Item) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_2_c(caplog) -> None: + item, deps = yield crawl_item_and_deps(Chicken2DeadlockPage) + assert "ProviderDependencyDeadlockError" in caplog.text + + +@inlineCallbacks +def test_page_object_with_item_dependency_deadlock_2_d(caplog) -> None: + item, deps = yield crawl_item_and_deps(Egg2DeadlockPage) + assert "ProviderDependencyDeadlockError" in caplog.text + + +def test_created_apply_rules() -> None: + """Checks if the ``ApplyRules`` were created properly by ``@handle_urls`` in + ``tests/po_lib/__init__.py``. 
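+
+    The expected list below mirrors the order in which the rules were declared.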
+ """ + + RULES = default_registry.get_rules() + + assert RULES == [ + # URL declaration only + ApplyRule(URL, use=UrlMatchPage), + ApplyRule("example.com", use=UrlNoMatchPage), + # PageObject-based rules + ApplyRule(URL, use=ReplacementPage, instead_of=OverriddenPage), + ApplyRule(URL, use=RightPage, instead_of=LeftPage), + ApplyRule(URL, use=LeftPage, instead_of=RightPage), + ApplyRule(URL, use=NewHopePage), + ApplyRule(URL, use=EmpireStrikesBackPage, instead_of=NewHopePage), + ApplyRule(URL, use=ReturnOfTheJediPage, instead_of=EmpireStrikesBackPage), + ApplyRule(URL, use=FirstPage), + ApplyRule(URL, use=SecondPage), + ApplyRule(URL, use=MultipleRulePage, instead_of=SecondPage), + ApplyRule(URL, use=MultipleRulePage, instead_of=FirstPage), + # Item-based rules + ApplyRule(URL, use=ProductPage, to_return=Product), + ApplyRule( + URL + "/some-wrong-path", + use=DifferentUrlPage, + to_return=ItemWithPageObjectButForDifferentUrl, + ), + ApplyRule(URL, use=ParentProductPage, to_return=ProductFromParent), + ApplyRule(URL, use=SubclassProductPage, to_return=ProductFromParent), + ApplyRule(URL, use=IndependentA1Page, to_return=AItem), + ApplyRule(URL, use=IndependentA2Page, to_return=AItem), + ApplyRule( + Patterns([URL], priority=600), + use=PriorityParentProductPage, + to_return=PriorityProductFromParent, + ), + ApplyRule( + URL, + use=PrioritySubclassProductPage, + to_return=PriorityProductFromParent, + ), + ApplyRule( + Patterns([URL], priority=600), use=IndependentB1Page, to_return=BItem + ), + ApplyRule(URL, use=IndependentB2Page, to_return=BItem), + ApplyRule( + URL, + use=Priority2ParentProductPage, + to_return=PriorityProductFromSubclass, + ), + ApplyRule( + Patterns([URL], priority=600), + use=Priority2SubclassProductPage, + to_return=PriorityProductFromSubclass, + ), + ApplyRule(URL, use=IndependentC1Page, to_return=CItem), + ApplyRule( + Patterns([URL], priority=600), use=IndependentC2Page, to_return=CItem + ), + ApplyRule(URL, use=ReplacedProductPage, to_return=ReplacedProduct), + ApplyRule(URL, use=ParentReplacedProductPage, to_return=ParentReplacedProduct), + ApplyRule( + URL, use=SubclassReplacedProductPage, to_return=SubclassReplacedProduct + ), + ApplyRule(URL, use=StandaloneProductPage, to_return=StandaloneProduct), + ApplyRule( + URL, + # We're ignoring the typing here since it expects the argument for + # use to be a subclass of ItemPage. + use=ProductFromInjectablePage, # type: ignore[arg-type] + to_return=ProductFromInjectable, + ), + ApplyRule(URL, use=PageObjectDependencyPage), + ApplyRule( + URL, + use=ProductWithPageObjectDepPage, + to_return=MainProductA, + instead_of=ReplacedProductPageObjectDepPage, + ), + ApplyRule(URL, use=ItemDependencyPage, to_return=ItemDependency), + ApplyRule( + URL, + use=ProductWithItemDepPage, + to_return=MainProductB, + instead_of=ReplacedProductItemDepPage, + ), + ApplyRule(URL, use=ProductDeepDependencyPage, to_return=MainProductC), + ApplyRule(URL, use=ProductDuplicateDeepDependencyPage, to_return=MainProductD), + ApplyRule(URL, use=ChickenDeadlockPage, to_return=ChickenItem), + ApplyRule(URL, use=EggDeadlockPage, to_return=EggItem), + ApplyRule(URL, use=Chicken2DeadlockPage, to_return=Chicken2Item), + ApplyRule(URL, use=Egg2DeadlockPage, to_return=Egg2Item), + ] diff --git a/tox.ini b/tox.ini index 43ed1278..ce7f11b7 100644 --- a/tox.ini +++ b/tox.ini @@ -27,7 +27,7 @@ deps = scrapy==2.6.0 sqlitedict==1.5.0 url-matcher==0.2.0 - web-poet==0.6.0 + web-poet==0.7.0 [testenv:asyncio-min] basepython = python3.7