Merge 4881d6f into 83a3cd8

scrapinghub · Nov 21, 2022 · 5cbd70a · 5cbd70a
2 parents 83a3cd8 + 4881d6f
commit 5cbd70a
Show file tree

Hide file tree

Showing 28 changed files with 166 additions and 162 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,6 +8,17 @@ TBR
 * Provider for ``web_poet.ResponseUrl`` is added, which allows accessing the
   response URL in the page object. This triggers a download, unlike the
   provider for ``web_poet.RequestUrl``.
+* Now requires web-poet >= 0.6.0.
+
+    * All examples in the docs and tests now use ``web_poet.WebPage``
+      instead of ``web_poet.ItemWebPage``.
+    * The new ``instead_of`` parameter of the ``@handle_urls`` decorator
+      is now preferred over the deprecated ``overrides`` parameter.
+    * ``scrapy_poet.callback_for`` doesn't require an implemented ``to_item``
+      method anymore.
+    * The Registry now uses ``web_poet.ApplyRule`` instead of
+      ``web_poet.OverrideRule``.
+
 
 0.5.1 (2022-07-28)
 ------------------

diff --git a/docs/intro/advanced-tutorial.rst b/docs/intro/advanced-tutorial.rst
@@ -48,7 +48,7 @@ Suppose we have the following Page Object:
 
 
     @attr.define
-    class ProductPage(web_poet.ItemWebPage):
+    class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
 
         async def to_item(self):
@@ -110,7 +110,7 @@ This basically acts as a switch to update the behavior of the Page Object:
 
 
     @attr.define
-    class ProductPage(web_poet.ItemWebPage):
+    class ProductPage(web_poet.WebPage):
         http: web_poet.HttpClient
         page_params: web_poet.PageParams
 

diff --git a/docs/intro/basic-tutorial.rst b/docs/intro/basic-tutorial.rst
@@ -65,10 +65,10 @@ out of the spider class.
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage
+    from web_poet.pages import WebPage
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         """Individual book page on books.toscrape.com website, e.g.
         http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
         """
@@ -93,10 +93,10 @@ extract a property from the ``to_item`` method:
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage
+    from web_poet.pages import WebPage
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         """Individual book page on books.toscrape.com website"""
 
         @property
@@ -245,11 +245,11 @@ At the end of our job, the spider should look like this:
 .. code-block:: python
 
     import scrapy
-    from web_poet.pages import ItemWebPage
+    from web_poet.pages import WebPage
     from scrapy_poet import callback_for
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
         """Individual book page on books.toscrape.com website, e.g.
         http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
         """
@@ -353,7 +353,7 @@ existing Page Objects as subclasses of them:
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage, WebPage
+    from web_poet.pages import WebPage
 
 
     # ------ Base page objects ------
@@ -364,7 +364,7 @@ existing Page Objects as subclasses of them:
             return []
 
 
-    class BookPage(ItemWebPage):
+    class BookPage(WebPage):
 
         def to_item(self):
             return None
@@ -421,7 +421,7 @@ to implement new ones:
 
 .. code-block:: python
 
-    from web_poet.pages import ItemWebPage, WebPage
+    from web_poet.pages import WebPage
 
 
     class BPBookListPage(WebPage):
@@ -430,7 +430,7 @@ to implement new ones:
             return self.css("article.post h4 a::attr(href)").getall()
 
 
-    class BPBookPage(ItemWebPage):
+    class BPBookPage(WebPage):
 
         def to_item(self):
             return {
@@ -466,21 +466,21 @@ For example, the pattern ``books.toscrape.com/cataloge/category/``
 is accepted and it would restrict the override only to category pages.
 
 It is even possible to configure more complex patterns by using the
-:py:class:`web_poet.overrides.OverrideRule` class instead of a triplet in
+:py:class:`web_poet.rules.ApplyRule` class instead of a triplet in
 the configuration. Another way of declaring the earlier config
 for ``SCRAPY_POET_OVERRIDES`` would be the following:
 
 .. code-block:: python
 
     from url_matcher import Patterns
-    from web_poet import OverrideRule
+    from web_poet import ApplyRule
 
 
     SCRAPY_POET_OVERRIDES = [
-        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
-        OverrideRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
-        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
-        OverrideRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
+        ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookListPage, instead_of=BookListPage),
+        ApplyRule(for_patterns=Patterns(["toscrape.com"]), use=BTSBookPage, instead_of=BookPage),
+        ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookListPage, instead_of=BookListPage),
+        ApplyRule(for_patterns=Patterns(["bookpage.com"]), use=BPBookPage, instead_of=BookPage),
     ]
 
 As you can see, this could get verbose. The earlier tuple config simply offers
@@ -494,8 +494,8 @@ a shortcut to be more concise.
 Manually defining overrides like this would be inconvenient, most
 especially for larger projects. Fortunately, `web-poet`_ has a cool feature to
 annotate Page Objects like :py:func:`web_poet.handle_urls` that would define
-and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the
-:py:class:`web_poet.overrides.OverrideRule` rules could then be simply read as:
+and store the :py:class:`web_poet.rules.ApplyRule` for you. All of the
+:py:class:`web_poet.rules.ApplyRule` rules could then be simply read as:
 
 .. code:: python
 
@@ -505,14 +505,14 @@ and store the :py:class:`web_poet.overrides.OverrideRule` for you. All of the
     # rules from other packages. Otherwise, it can be omitted.
     # More info about this caveat on web-poet docs.
     consume_modules("external_package_A", "another_ext_package.lib")
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+    SCRAPY_POET_OVERRIDES = default_registry.get_rules()
 
 For more info on this, you can refer to these docs:
 
     * ``scrapy-poet``'s :ref:`overrides` Tutorial section.
     * External `web-poet`_ docs.
 
-        * Specifically, the :external:ref:`intro-overrides` Tutorial section.
+        * Specifically, the :external:ref:`rules-intro` Tutorial section.
 
 Next steps
 ==========

diff --git a/docs/overrides.rst b/docs/overrides.rst
@@ -15,10 +15,10 @@ page.
     - `Example 1 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_01.py>`_:
       rules using tuples
     - `Example 2 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_02.py>`_:
-      rules using tuples and :py:class:`web_poet.overrides.OverrideRule`
+      rules using tuples and :py:class:`web_poet.ApplyRule`
     - `Example 3 <https://github.com/scrapinghub/scrapy-poet/blob/master/example/example/spiders/books_04_overrides_03.py>`_:
       rules using :py:func:`web_poet.handle_urls` decorator and retrieving them
-      via :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides`
+      via :py:meth:`web_poet.rules.RulesRegistry.get_rules`
 
 Page Objects refinement
 =======================
@@ -44,7 +44,7 @@ using the following Page Object:
 
 .. code-block:: python
 
-    class ISBNBookPage(ItemWebPage):
+    class ISBNBookPage(WebPage):
 
         def __init__(self, response: HttpResponse, book_page: BookPage):
             super().__init__(response)
@@ -81,7 +81,7 @@ the obtained item with the ISBN from the page HTML.
     .. code-block:: python
 
         @attr.define
-        class ISBNBookPage(ItemWebPage):
+        class ISBNBookPage(WebPage):
             book_page: BookPage
 
             def to_item(self):
@@ -95,17 +95,17 @@ Overrides rules
 
 The default way of configuring the override rules is using triplets
 of the form (``url pattern``, ``override_type``, ``overridden_type``). But more
-complex rules can be introduced if the class :py:class:`web_poet.overrides.OverrideRule`
+complex rules can be introduced if the class :py:class:`web_poet.ApplyRule`
 is used. The following example configures an override that is only applied for
 book pages from ``books.toscrape.com``:
 
 .. code-block:: python
 
-    from web_poet import OverrideRule
+    from web_poet import ApplyRule
 
 
     SCRAPY_POET_OVERRIDES = [
-        OverrideRule(
+        ApplyRule(
             for_patterns=Patterns(
                 include=["books.toscrape.com/cataloge/*index.html|"],
                 exclude=["/catalogue/category/"]),
@@ -129,7 +129,7 @@ along with where it is applied. This can be done by decorating the
 Page Objects with :py:func:`web_poet.handle_urls` provided by `web-poet`_.
 
 .. tip::
-    Make sure to read the :external:ref:`intro-overrides` Tutorial section of
+    Make sure to read the :external:ref:`rules-intro` Tutorial section of
     `web-poet`_ to learn all of its other functionalities that are not covered
     in this section.
 
@@ -140,7 +140,7 @@ Let's see an example:
     from web_poet import handle_urls
 
 
-    @handle_urls("toscrape.com", BookPage)
+    @handle_urls("toscrape.com", instead_of=BookPage)
     class BTSBookPage(BookPage):
 
         def to_item(self):
@@ -155,7 +155,7 @@ for the domain ``toscrape.com``.
 
 In order to configure the ``scrapy-poet`` overrides automatically
 using these annotations, you can directly interact with `web-poet`_'s
-``default_registry`` (an instance of :py:class:`web_poet.overrides.PageObjectRegistry`).
+``default_registry`` (an instance of :py:class:`web_poet.rules.RulesRegistry`).
 
 For example:
 
@@ -169,21 +169,21 @@ For example:
     consume_modules("external_package_A", "another_ext_package.lib")
 
     # To get all of the Override Rules that were declared via annotations.
-    SCRAPY_POET_OVERRIDES = default_registry.get_overrides()
+    SCRAPY_POET_OVERRIDES = default_registry.get_rules()
 
-The :py:meth:`web_poet.overrides.PageObjectRegistry.get_overrides` method of the
-``default_registry`` above returns ``List[OverrideRule]`` that were declared
+The :py:meth:`web_poet.rules.RulesRegistry.get_rules` method of the
+``default_registry`` above returns ``List[ApplyRule]`` that were declared
 using `web-poet`_'s :py:func:`web_poet.handle_urls` annotation. This is much
-more convenient that manually defining all of the :py:class:`web_poet.overrides.OverrideRule`.
+more convenient than manually defining all of the :py:class:`web_poet.ApplyRule`.
 
 Take note that since ``SCRAPY_POET_OVERRIDES`` is structured as
-``List[OverrideRule]``, you can easily modify it later on if needed.
+``List[ApplyRule]``, you can easily modify it later on if needed.
 
 .. note::
 
     For more info and advanced features of `web-poet`_'s :py:func:`web_poet.handle_urls`
     and its registry, kindly read the `web-poet <https://web-poet.readthedocs.io>`_
-    documentation, specifically its :external:ref:`intro-overrides` tutorial
+    documentation, specifically its :external:ref:`rules-intro` tutorial
     section.
 
 

diff --git a/docs/providers.rst b/docs/providers.rst
@@ -22,7 +22,7 @@ Creating providers
 Providers are responsible for building dependencies needed by Injectable
 objects. A good example would be the ``HttpResponseProvider``,
 which builds and provides a ``web_poet.HttpResponse`` instance for Injectables
-that need it, like the ``web_poet.ItemWebPage``.
+that need it, like the ``web_poet.WebPage``.
 
 .. code-block:: python
 
@@ -271,7 +271,7 @@ Page Object uses it, the request is not ignored, for example:
 .. note::
 
     The code above is just for example purposes. If you need to use ``Response``
-    instances in your Page Objects, use built-in ``ItemWebPage`` - it has
+    instances in your Page Objects, use built-in ``WebPage`` - it has
     ``response`` attribute with ``HttpResponse``; no additional configuration
     is needed, as there is ``HttpResponseProvider`` enabled in ``scrapy-poet``
     by default.

diff --git a/example/example/spiders/books_02.py b/example/example/spiders/books_02.py
@@ -3,10 +3,10 @@
 BookPage is now independent of Scrapy.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,

diff --git a/example/example/spiders/books_02_1.py b/example/example/spiders/books_02_1.py
@@ -4,12 +4,12 @@
 boilerplate.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,

diff --git a/example/example/spiders/books_02_2.py b/example/example/spiders/books_02_2.py
@@ -11,12 +11,12 @@
 it is better than defining callback explicitly.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,

diff --git a/example/example/spiders/books_02_3.py b/example/example/spiders/books_02_3.py
@@ -8,10 +8,10 @@
 but it can be implemented, with Scrapy support.
 """
 import scrapy
-from web_poet import ItemWebPage
+from web_poet import WebPage
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,

diff --git a/example/example/spiders/books_04.py b/example/example/spiders/books_04.py
@@ -2,7 +2,7 @@
 Scrapy spider which uses Page Objects both for crawling and extraction.
 """
 import scrapy
-from web_poet import ItemWebPage, WebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
@@ -12,7 +12,7 @@ def book_urls(self):
         return self.css(".image_container a::attr(href)").getall()
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     def to_item(self):
         return {
             "url": self.url,

diff --git a/example/example/spiders/books_04_overrides_01.py b/example/example/spiders/books_04_overrides_01.py
@@ -6,7 +6,7 @@
 The default configured PO logic contains the logic for books.toscrape.com
 """
 import scrapy
-from web_poet import ItemWebPage, WebPage
+from web_poet import WebPage
 
 from scrapy_poet import callback_for
 
@@ -18,7 +18,7 @@ def book_urls(self):
         return self.css(".image_container a::attr(href)").getall()
 
 
-class BookPage(ItemWebPage):
+class BookPage(WebPage):
     """Logic to extract book info from pages like https://books.toscrape.com/catalogue/soumission_998/index.html"""
 
     def to_item(self):
@@ -35,7 +35,7 @@ def book_urls(self):
         return self.css("article.post h4 a::attr(href)").getall()
 
 
-class BPBookPage(ItemWebPage):
+class BPBookPage(WebPage):
     """Logic to extract from pages like https://bookpage.com/reviews/25879-laird-hunt-zorrie-fiction"""
 
     def to_item(self):