From aa8698c667ca94eb4a1ec5838bb2415604f83291 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 28 Sep 2022 20:10:28 +0800 Subject: [PATCH 01/46] update handle_urls() and OverrideRule to accept data types --- tests/po_lib/__init__.py | 3 + tests/po_lib/a_module.py | 1 + tests/po_lib/nested_package/__init__.py | 1 + .../po_lib/nested_package/a_nested_module.py | 1 + tests/po_lib_data_type/__init__.py | 160 ++++++++++++++++++ tests/po_lib_sub/__init__.py | 1 + tests/test_fields.py | 54 ++++++ tests/test_overrides.py | 46 ++++- .../po_lib_sub_not_imported/__init__.py | 1 + web_poet/_typing.py | 2 +- web_poet/overrides.py | 22 ++- 11 files changed, 285 insertions(+), 7 deletions(-) create mode 100644 tests/po_lib_data_type/__init__.py diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index b5545b60..d0ffea4f 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -14,6 +14,7 @@ class POBase(ItemPage): expected_overrides: Type[ItemPage] expected_patterns: Patterns + expected_data_type = None expected_meta: Dict[str, Any] @@ -33,6 +34,7 @@ class POTopLevelOverriden2(ItemPage): class POTopLevel1(POBase): expected_overrides = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) + expected_data_type = None expected_meta = {} # type: ignore @@ -40,4 +42,5 @@ class POTopLevel1(POBase): class POTopLevel2(POBase): expected_overrides = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) + expected_data_type = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index c3e7810d..0cbf4505 100644 --- a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -12,4 +12,5 @@ class POModuleOverriden(ItemPage): class POModule(POBase): expected_overrides = POModuleOverriden expected_patterns = Patterns(["example.com"]) + expected_data_type = None expected_meta = {"extra_arg": "foo"} # type: ignore diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index 64be2384..b527b0d0 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -16,4 +16,5 @@ class PONestedPkgOverriden(ItemPage): class PONestedPkg(POBase): expected_overrides = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) + expected_data_type = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index fc2e837e..48164e34 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -18,4 +18,5 @@ class PONestedModule(POBase): expected_patterns = Patterns( include=["example.com", "example.org"], exclude=["/*.jpg|"] ) + expected_data_type = None expected_meta = {} # type: ignore diff --git a/tests/po_lib_data_type/__init__.py b/tests/po_lib_data_type/__init__.py new file mode 100644 index 00000000..1fb8c602 --- /dev/null +++ b/tests/po_lib_data_type/__init__.py @@ -0,0 +1,160 @@ +import attrs +from url_matcher import Patterns + +from web_poet import Injectable, ItemPage, Returns, field, handle_urls, item_from_fields + + +@attrs.define +class Product: + name: str + price: float + + +@attrs.define +class ProductSimilar: + name: str + price: float + + +@attrs.define +class ProductMoreFields(Product): + brand: str + + +@attrs.define +class ProductLessFields: + name: str + + +@handle_urls("example.com") +class ProductPage(ItemPage[Product]): + """A base PO to populate the Product item's fields.""" + + expected_overrides = None + expected_patterns = Patterns(["example.com"]) + expected_data_type = Product + expected_meta = {} + + @field + def name(self) -> str: + return "name" + + @field + def price(self) -> float: + return 12.99 + + +@handle_urls("example.com", overrides=ProductPage) +class ImprovedProductPage(ProductPage): + """A custom PO inheriting from a base PO which alters some field values.""" + + expected_overrides = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_data_type = Product + expected_meta = {} + + @field + def name(self) -> str: + return "improved name" + + +@handle_urls("example.com", overrides=ProductPage) +class SimilarProductPage(ProductPage, Returns[ProductSimilar]): + """A custom PO inheriting from a base PO returning the same fields but in + a different item class. + """ + + expected_overrides = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_data_type = ProductSimilar + expected_meta = {} + + +@handle_urls("example.com", overrides=ProductPage) +class MoreProductPage(ProductPage, Returns[ProductMoreFields]): + """A custom PO inheriting from a base PO returning more items using a + different item class. + """ + + expected_overrides = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_data_type = ProductMoreFields + expected_meta = {} + + @field + def brand(self) -> str: + return "brand" + + +@handle_urls("example.com", overrides=ProductPage) +class LessProductPage( + ProductPage, Returns[ProductLessFields], skip_nonitem_fields=True +): + """A custom PO inheriting from a base PO returning less items using a + different item class. + """ + + expected_overrides = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_data_type = ProductLessFields + expected_meta = {} + + @field + def brand(self) -> str: + return "brand" + + +@handle_urls("example.com", overrides=ProductPage, data_type=ProductSimilar) +class CustomProductPage(ProductPage, Returns[Product]): + """A custom PO inheriting from a base PO returning the same fields but in + a different item class. + + This PO is the same with ``SimilarProductPage`` but passes a ``data_type`` + in the ``@handle_urls`` decorator. + + This tests the case that the type inside ``Returns`` should be followed and + the ``data_type`` parameter from ``@handle_urls`` is ignored. + """ + + expected_overrides = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_data_type = Product + expected_meta = {} + + +@handle_urls("example.com", overrides=ProductPage, data_type=ProductSimilar) +class CustomProductPageNoReturns(ProductPage): + """Same case as with ``CustomProductPage`` but doesn't inherit from + ``Returns[Product]``. + """ + + expected_overrides = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_data_type = Product + expected_meta = {} + + +@handle_urls("example.com", data_type=Product) +class CustomProductPageDataTypeOnly(Injectable): + """A PO that doesn't inherit from ``ItemPage`` and ``WebPage`` which means + it doesn't inherit from the ``Returns`` class. + + This tests the case that the ``data_Type`` parameter in ``@handle_urls`` + should properly use it in the rules. + """ + + expected_overrides = None + expected_patterns = Patterns(["example.com"]) + expected_data_type = Product + expected_meta = {} + + @field + def name(self) -> str: + return "name" + + @field + def price(self) -> float: + return 12.99 + + async def to_item(self) -> Product: + return await item_from_fields(self, item_cls=Product) diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 442836c3..3051df42 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -22,4 +22,5 @@ class POLibSubOverriden(ItemPage): class POLibSub(POBase): expected_overrides = POLibSubOverriden expected_patterns = Patterns(["sub_example.com"]) + expected_data_type = None expected_meta = {} # type: ignore diff --git a/tests/test_fields.py b/tests/test_fields.py index 66c8e357..9f1fd4f5 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -4,6 +4,20 @@ import attrs import pytest +from tests.po_lib_data_type import ( + CustomProductPage, + CustomProductPageDataTypeOnly, + CustomProductPageNoReturns, + ImprovedProductPage, + LessProductPage, + MoreProductPage, + Product, + ProductLessFields, + ProductMoreFields, + ProductPage, + ProductSimilar, + SimilarProductPage, +) from web_poet import ( HttpResponse, Injectable, @@ -368,3 +382,43 @@ def field_foo_cached(self): assert page.field_foo == "foo" assert page.field_foo_meta == "foo" assert page.field_foo_cached == "foo" + + +@pytest.mark.asyncio +async def test_field_with_handle_urls() -> None: + + page = ProductPage() + assert page.name == "name" + assert page.price == 12.99 + assert await page.to_item() == Product(name="name", price=12.99) + + page = ImprovedProductPage() + assert page.name == "improved name" + assert page.price == 12.99 + assert await page.to_item() == Product(name="improved name", price=12.99) + + page = SimilarProductPage() + assert page.name == "name" + assert page.price == 12.99 + assert await page.to_item() == ProductSimilar(name="name", price=12.99) + + page = MoreProductPage() + assert page.name == "name" + assert page.price == 12.99 + assert page.brand == "brand" + assert await page.to_item() == ProductMoreFields( + name="name", price=12.99, brand="brand" + ) + + page = LessProductPage() + assert page.name == "name" + assert await page.to_item() == ProductLessFields(name="name") + + for page in [ # type: ignore[assignment] + CustomProductPage(), + CustomProductPageNoReturns(), + CustomProductPageDataTypeOnly(), + ]: + assert page.name == "name" + assert page.price == 12.99 + assert await page.to_item() == Product(name="name", price=12.99) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 2a9d09f5..a8d52a90 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -5,10 +5,36 @@ from tests.po_lib.a_module import POModule, POModuleOverriden from tests.po_lib.nested_package import PONestedPkg from tests.po_lib.nested_package.a_nested_module import PONestedModule +from tests.po_lib_data_type import ( + CustomProductPage, + CustomProductPageDataTypeOnly, + CustomProductPageNoReturns, + ImprovedProductPage, + LessProductPage, + MoreProductPage, + Product, + ProductPage, + ProductSimilar, + SimilarProductPage, +) from tests.po_lib_sub import POLibSub from web_poet import OverrideRule, PageObjectRegistry, consume_modules, default_registry -POS = {POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule} +POS = { + POTopLevel1, + POTopLevel2, + POModule, + PONestedPkg, + PONestedModule, + ProductPage, + ImprovedProductPage, + SimilarProductPage, + MoreProductPage, + LessProductPage, + CustomProductPage, + CustomProductPageNoReturns, + CustomProductPageDataTypeOnly, +} def test_override_rule_uniqueness() -> None: @@ -30,9 +56,24 @@ def test_override_rule_uniqueness() -> None: instead_of=POTopLevelOverriden2, meta={"key_2": 2}, ) - + # The ``meta`` parameter is ignored in the hash. assert hash(rule1) == hash(rule2) + rule1 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + data_type=Product, + ) + rule2 = OverrideRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + data_type=ProductSimilar, + ) + # A different data type class results in different hash. + assert hash(rule1) != hash(rule2) + def test_list_page_objects_all() -> None: rules = default_registry.get_overrides() @@ -57,6 +98,7 @@ def test_list_page_objects_all() -> None: # which doesn't contain the ``expected_*`` fields in our tests. assert rule.instead_of == rule.use.expected_overrides, rule.use # type: ignore[attr-defined] assert rule.for_patterns == rule.use.expected_patterns, rule.use # type: ignore[attr-defined] + assert rule.data_type == rule.use.expected_data_type, rule.use # type: ignore[attr-defined] assert rule.meta == rule.use.expected_meta, rule.use # type: ignore[attr-defined] diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py index a3c6f9d9..97f5b6cd 100644 --- a/tests_extra/po_lib_sub_not_imported/__init__.py +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -25,4 +25,5 @@ class POLibSubOverridenNotImported: class POLibSubNotImported(POBase): expected_overrides = POLibSubOverridenNotImported expected_patterns = Patterns(["sub_example_not_imported.com"]) + expected_data_type = None expected_meta = {} # type: ignore diff --git a/web_poet/_typing.py b/web_poet/_typing.py index a5ec9fed..0c96e2ff 100644 --- a/web_poet/_typing.py +++ b/web_poet/_typing.py @@ -18,7 +18,7 @@ def is_generic_alias(obj) -> bool: def get_generic_parameter(cls): - for base in cls.__orig_bases__: + for base in getattr(cls, "__orig_bases__", []): if is_generic_alias(base): args = _get_args(base) return args[0] diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 3727d719..a806aefa 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -11,7 +11,8 @@ from url_matcher import Patterns -from web_poet.pages import ItemPage +from web_poet._typing import get_generic_parameter +from web_poet.pages import ItemPage, ItemT from web_poet.utils import as_list Strings = Union[str, Iterable[str]] @@ -37,6 +38,7 @@ class OverrideRule: about the patterns. * ``use`` - The Page Object that will be **used**. * ``instead_of`` - The Page Object that will be **replaced**. + * ``data_type`` - The data container class that the Page Object returns. * ``meta`` - Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. @@ -48,11 +50,12 @@ class OverrideRule: for_patterns: Patterns use: Type[ItemPage] - instead_of: Type[ItemPage] + instead_of: Optional[Type[ItemPage]] = None + data_type: Optional[Type[Any]] = None meta: Dict[str, Any] = field(default_factory=dict) def __hash__(self): - return hash((self.for_patterns, self.use, self.instead_of)) + return hash((self.for_patterns, self.use, self.instead_of, self.data_type)) class PageObjectRegistry(dict): @@ -112,7 +115,8 @@ def handle_urls( self, include: Strings, *, - overrides: Type[ItemPage], + overrides: Optional[Type[ItemPage]] = None, + data_type: Optional[Type] = None, exclude: Optional[Strings] = None, priority: int = 500, **kwargs, @@ -133,6 +137,10 @@ def handle_urls( :param include: The URLs that should be handled by the decorated Page Object. :param overrides: The Page Object that should be `replaced`. + :param data_type: The Item Class holding the data returned by the Page Object. + This parameter is ignored if the Page Object has declared a data type + using :class:`~.Returns` or :class:`~.ItemPage` + (e.g. ``class ProductPageObject(ItemPage[ProductItem])``). :param exclude: The URLs over which the override should **not** happen. :param priority: The resolution priority in case of `conflicting` rules. A conflict happens when the ``include``, ``override``, and ``exclude`` @@ -141,6 +149,11 @@ def handle_urls( """ def wrapper(cls): + final_data_type = data_type + derived_type = get_generic_parameter(cls) + if derived_type and derived_type != ItemT: + final_data_type = derived_type + rule = OverrideRule( for_patterns=Patterns( include=as_list(include), @@ -149,6 +162,7 @@ def wrapper(cls): ), use=cls, instead_of=overrides, + data_type=final_data_type, meta=kwargs, ) # If it was already defined, we don't want to override it From cc5ec1773bd5fea193af4d06f9f921cef78a3a85 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 28 Sep 2022 20:11:24 +0800 Subject: [PATCH 02/46] update flake8 to ignore D102 in tests/po_lib_data_type --- .flake8 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.flake8 b/.flake8 index a326177c..949c31dd 100644 --- a/.flake8 +++ b/.flake8 @@ -34,5 +34,7 @@ per-file-ignores = # imports are there to expose submodule functions so they can be imported # directly from that module # F403: Ignore * imports in these files + # D102: Missing docstring in public method web_poet/__init__.py:F401,F403 web_poet/page_inputs/__init__.py:F401,F403 + tests/po_lib_data_type/__init__.py:D102 From c135b2d36c77188deab1650932487736448ea4f7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 28 Sep 2022 20:11:47 +0800 Subject: [PATCH 03/46] update mypy config to ignore tests.po_lib_data_type --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 892b5aab..bfda6e47 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,3 +6,7 @@ multi_line_output = 3 show_error_codes = true ignore_missing_imports = true no_warn_no_return = true + +[[tool.mypy.overrides]] +module = "tests.po_lib_data_type.*" +ignore_errors = true From 7837b3cde03760060d9d1907b90f20f5d63048e2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 14:37:40 +0800 Subject: [PATCH 04/46] rename 'data_type' into 'to_return' in handle_urls() and OverrideRule --- .flake8 | 2 +- pyproject.toml | 2 +- tests/po_lib/__init__.py | 6 ++-- tests/po_lib/a_module.py | 2 +- tests/po_lib/nested_package/__init__.py | 2 +- .../po_lib/nested_package/a_nested_module.py | 2 +- tests/po_lib_sub/__init__.py | 2 +- .../__init__.py | 28 +++++++++---------- tests/test_fields.py | 2 +- tests/test_overrides.py | 10 +++---- .../po_lib_sub_not_imported/__init__.py | 2 +- web_poet/overrides.py | 14 +++++----- 12 files changed, 37 insertions(+), 37 deletions(-) rename tests/{po_lib_data_type => po_lib_to_return}/__init__.py (85%) diff --git a/.flake8 b/.flake8 index 949c31dd..726947fe 100644 --- a/.flake8 +++ b/.flake8 @@ -37,4 +37,4 @@ per-file-ignores = # D102: Missing docstring in public method web_poet/__init__.py:F401,F403 web_poet/page_inputs/__init__.py:F401,F403 - tests/po_lib_data_type/__init__.py:D102 + tests/po_lib_to_return/__init__.py:D102 diff --git a/pyproject.toml b/pyproject.toml index bfda6e47..6cf3bce2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,5 +8,5 @@ ignore_missing_imports = true no_warn_no_return = true [[tool.mypy.overrides]] -module = "tests.po_lib_data_type.*" +module = "tests.po_lib_to_return.*" ignore_errors = true diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index d0ffea4f..bb21e2d7 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -14,7 +14,7 @@ class POBase(ItemPage): expected_overrides: Type[ItemPage] expected_patterns: Patterns - expected_data_type = None + expected_to_return = None expected_meta: Dict[str, Any] @@ -34,7 +34,7 @@ class POTopLevelOverriden2(ItemPage): class POTopLevel1(POBase): expected_overrides = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) - expected_data_type = None + expected_to_return = None expected_meta = {} # type: ignore @@ -42,5 +42,5 @@ class POTopLevel1(POBase): class POTopLevel2(POBase): expected_overrides = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) - expected_data_type = None + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index 0cbf4505..a9e0cf2d 100644 --- a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -12,5 +12,5 @@ class POModuleOverriden(ItemPage): class POModule(POBase): expected_overrides = POModuleOverriden expected_patterns = Patterns(["example.com"]) - expected_data_type = None + expected_to_return = None expected_meta = {"extra_arg": "foo"} # type: ignore diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index b527b0d0..37064cdd 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -16,5 +16,5 @@ class PONestedPkgOverriden(ItemPage): class PONestedPkg(POBase): expected_overrides = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) - expected_data_type = None + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index 48164e34..35c42727 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -18,5 +18,5 @@ class PONestedModule(POBase): expected_patterns = Patterns( include=["example.com", "example.org"], exclude=["/*.jpg|"] ) - expected_data_type = None + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 3051df42..6806c70b 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -22,5 +22,5 @@ class POLibSubOverriden(ItemPage): class POLibSub(POBase): expected_overrides = POLibSubOverriden expected_patterns = Patterns(["sub_example.com"]) - expected_data_type = None + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib_data_type/__init__.py b/tests/po_lib_to_return/__init__.py similarity index 85% rename from tests/po_lib_data_type/__init__.py rename to tests/po_lib_to_return/__init__.py index 1fb8c602..bdde8104 100644 --- a/tests/po_lib_data_type/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -32,7 +32,7 @@ class ProductPage(ItemPage[Product]): expected_overrides = None expected_patterns = Patterns(["example.com"]) - expected_data_type = Product + expected_to_return = Product expected_meta = {} @field @@ -50,7 +50,7 @@ class ImprovedProductPage(ProductPage): expected_overrides = ProductPage expected_patterns = Patterns(["example.com"]) - expected_data_type = Product + expected_to_return = Product expected_meta = {} @field @@ -66,7 +66,7 @@ class SimilarProductPage(ProductPage, Returns[ProductSimilar]): expected_overrides = ProductPage expected_patterns = Patterns(["example.com"]) - expected_data_type = ProductSimilar + expected_to_return = ProductSimilar expected_meta = {} @@ -78,7 +78,7 @@ class MoreProductPage(ProductPage, Returns[ProductMoreFields]): expected_overrides = ProductPage expected_patterns = Patterns(["example.com"]) - expected_data_type = ProductMoreFields + expected_to_return = ProductMoreFields expected_meta = {} @field @@ -96,7 +96,7 @@ class LessProductPage( expected_overrides = ProductPage expected_patterns = Patterns(["example.com"]) - expected_data_type = ProductLessFields + expected_to_return = ProductLessFields expected_meta = {} @field @@ -104,25 +104,25 @@ def brand(self) -> str: return "brand" -@handle_urls("example.com", overrides=ProductPage, data_type=ProductSimilar) +@handle_urls("example.com", overrides=ProductPage, to_return=ProductSimilar) class CustomProductPage(ProductPage, Returns[Product]): """A custom PO inheriting from a base PO returning the same fields but in a different item class. - This PO is the same with ``SimilarProductPage`` but passes a ``data_type`` + This PO is the same with ``SimilarProductPage`` but passes a ``to_return`` in the ``@handle_urls`` decorator. This tests the case that the type inside ``Returns`` should be followed and - the ``data_type`` parameter from ``@handle_urls`` is ignored. + the ``to_return`` parameter from ``@handle_urls`` is ignored. """ expected_overrides = ProductPage expected_patterns = Patterns(["example.com"]) - expected_data_type = Product + expected_to_return = Product expected_meta = {} -@handle_urls("example.com", overrides=ProductPage, data_type=ProductSimilar) +@handle_urls("example.com", overrides=ProductPage, to_return=ProductSimilar) class CustomProductPageNoReturns(ProductPage): """Same case as with ``CustomProductPage`` but doesn't inherit from ``Returns[Product]``. @@ -130,22 +130,22 @@ class CustomProductPageNoReturns(ProductPage): expected_overrides = ProductPage expected_patterns = Patterns(["example.com"]) - expected_data_type = Product + expected_to_return = Product expected_meta = {} -@handle_urls("example.com", data_type=Product) +@handle_urls("example.com", to_return=Product) class CustomProductPageDataTypeOnly(Injectable): """A PO that doesn't inherit from ``ItemPage`` and ``WebPage`` which means it doesn't inherit from the ``Returns`` class. - This tests the case that the ``data_Type`` parameter in ``@handle_urls`` + This tests the case that the ``to_return`` parameter in ``@handle_urls`` should properly use it in the rules. """ expected_overrides = None expected_patterns = Patterns(["example.com"]) - expected_data_type = Product + expected_to_return = Product expected_meta = {} @field diff --git a/tests/test_fields.py b/tests/test_fields.py index 9f1fd4f5..38086129 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -4,7 +4,7 @@ import attrs import pytest -from tests.po_lib_data_type import ( +from tests.po_lib_to_return import ( CustomProductPage, CustomProductPageDataTypeOnly, CustomProductPageNoReturns, diff --git a/tests/test_overrides.py b/tests/test_overrides.py index a8d52a90..51491df6 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -5,7 +5,8 @@ from tests.po_lib.a_module import POModule, POModuleOverriden from tests.po_lib.nested_package import PONestedPkg from tests.po_lib.nested_package.a_nested_module import PONestedModule -from tests.po_lib_data_type import ( +from tests.po_lib_sub import POLibSub +from tests.po_lib_to_return import ( CustomProductPage, CustomProductPageDataTypeOnly, CustomProductPageNoReturns, @@ -17,7 +18,6 @@ ProductSimilar, SimilarProductPage, ) -from tests.po_lib_sub import POLibSub from web_poet import OverrideRule, PageObjectRegistry, consume_modules, default_registry POS = { @@ -63,13 +63,13 @@ def test_override_rule_uniqueness() -> None: for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden2, - data_type=Product, + to_return=Product, ) rule2 = OverrideRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden2, - data_type=ProductSimilar, + to_return=ProductSimilar, ) # A different data type class results in different hash. assert hash(rule1) != hash(rule2) @@ -98,7 +98,7 @@ def test_list_page_objects_all() -> None: # which doesn't contain the ``expected_*`` fields in our tests. assert rule.instead_of == rule.use.expected_overrides, rule.use # type: ignore[attr-defined] assert rule.for_patterns == rule.use.expected_patterns, rule.use # type: ignore[attr-defined] - assert rule.data_type == rule.use.expected_data_type, rule.use # type: ignore[attr-defined] + assert rule.to_return == rule.use.expected_to_return, rule.use # type: ignore[attr-defined] assert rule.meta == rule.use.expected_meta, rule.use # type: ignore[attr-defined] diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py index 97f5b6cd..1c194513 100644 --- a/tests_extra/po_lib_sub_not_imported/__init__.py +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -25,5 +25,5 @@ class POLibSubOverridenNotImported: class POLibSubNotImported(POBase): expected_overrides = POLibSubOverridenNotImported expected_patterns = Patterns(["sub_example_not_imported.com"]) - expected_data_type = None + expected_to_return = None expected_meta = {} # type: ignore diff --git a/web_poet/overrides.py b/web_poet/overrides.py index a806aefa..30ae1ee3 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -38,7 +38,7 @@ class OverrideRule: about the patterns. * ``use`` - The Page Object that will be **used**. * ``instead_of`` - The Page Object that will be **replaced**. - * ``data_type`` - The data container class that the Page Object returns. + * ``to_return`` - The data container class that the Page Object returns. * ``meta`` - Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. @@ -51,11 +51,11 @@ class OverrideRule: for_patterns: Patterns use: Type[ItemPage] instead_of: Optional[Type[ItemPage]] = None - data_type: Optional[Type[Any]] = None + to_return: Optional[Type[Any]] = None meta: Dict[str, Any] = field(default_factory=dict) def __hash__(self): - return hash((self.for_patterns, self.use, self.instead_of, self.data_type)) + return hash((self.for_patterns, self.use, self.instead_of, self.to_return)) class PageObjectRegistry(dict): @@ -116,7 +116,7 @@ def handle_urls( include: Strings, *, overrides: Optional[Type[ItemPage]] = None, - data_type: Optional[Type] = None, + to_return: Optional[Type] = None, exclude: Optional[Strings] = None, priority: int = 500, **kwargs, @@ -137,7 +137,7 @@ def handle_urls( :param include: The URLs that should be handled by the decorated Page Object. :param overrides: The Page Object that should be `replaced`. - :param data_type: The Item Class holding the data returned by the Page Object. + :param to_return: The Item Class holding the data returned by the Page Object. This parameter is ignored if the Page Object has declared a data type using :class:`~.Returns` or :class:`~.ItemPage` (e.g. ``class ProductPageObject(ItemPage[ProductItem])``). @@ -149,7 +149,7 @@ def handle_urls( """ def wrapper(cls): - final_data_type = data_type + final_data_type = to_return derived_type = get_generic_parameter(cls) if derived_type and derived_type != ItemT: final_data_type = derived_type @@ -162,7 +162,7 @@ def wrapper(cls): ), use=cls, instead_of=overrides, - data_type=final_data_type, + to_return=final_data_type, meta=kwargs, ) # If it was already defined, we don't want to override it From eb7e1b20404ffdef77a3c5b0e951e7fabb3b862a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 15:50:05 +0800 Subject: [PATCH 05/46] rename 'overrides' into 'instead_of' in @handle_urls --- docs/advanced/fields.rst | 2 +- docs/intro/overrides.rst | 10 +++---- tests/po_lib/__init__.py | 12 ++++---- tests/po_lib/a_module.py | 4 +-- tests/po_lib/nested_package/__init__.py | 4 +-- .../po_lib/nested_package/a_nested_module.py | 4 +-- tests/po_lib_sub/__init__.py | 6 ++-- tests/po_lib_to_return/__init__.py | 28 ++++++++--------- tests/test_overrides.py | 30 +++++++++++++++++-- .../po_lib_sub_not_imported/__init__.py | 6 ++-- web_poet/overrides.py | 18 +++++++---- 11 files changed, 79 insertions(+), 45 deletions(-) diff --git a/docs/advanced/fields.rst b/docs/advanced/fields.rst index 0411571f..f0b7c827 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -319,7 +319,7 @@ inherit from the "base", "standard" Page Object, there could be a ``@field`` from the base class which is not present in the ``CustomItem``. It'd be still passed to ``CustomItem.__init__``, causing an exception. -One way to solve it is to make the orignal Page Object a dependency +One way to solve it is to make the original Page Object a dependency instead of inheriting from it, as explained in the beginning. Alternatively, you can use ``skip_nonitem_fields=True`` class argument - it tells diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 0939e13b..056e617e 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -62,19 +62,19 @@ Let's take a look at how the following code is structured: return {"product-title": self.css("title::text").get()} - @handle_urls("example.com", overrides=GenericProductPage) + @handle_urls("example.com", instead_of=GenericProductPage) class ExampleProductPage(WebPage): def to_item(self): ... # more specific parsing - @handle_urls("anotherexample.com", overrides=GenericProductPage, exclude="/digital-goods/") + @handle_urls("anotherexample.com", instead_of=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(WebPage): def to_item(self): ... # more specific parsing - @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], overrides=GenericProductPage) + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage): def to_item(self): ... # more specific parsing @@ -294,7 +294,7 @@ have the first approach as an example: # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - @handle_urls("site_1.com", overrides=ecommerce_page_objects.EcomGenericPage, priority=1000) + @handle_urls("site_1.com", instead_of=ecommerce_page_objects.EcomGenericPage, priority=1000) class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): def to_item(self): ... # call super().to_item() and improve on the item's shortcomings @@ -399,7 +399,7 @@ Here's an example: from web_poet import default_registry, consume_modules, handle_urls import ecommerce_page_objects, gadget_sites_page_objects, common_items - @handle_urls("site_2.com", overrides=common_items.ProductGenericPage, priority=1000) + @handle_urls("site_2.com", instead_of=common_items.ProductGenericPage, priority=1000) class EcomSite2Copy(ecommerce_page_objects.site_1.EcomSite1): def to_item(self): return super().to_item() diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index bb21e2d7..91c8397e 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -12,7 +12,7 @@ class POBase(ItemPage): - expected_overrides: Type[ItemPage] + expected_instead_of: Type[ItemPage] expected_patterns: Patterns expected_to_return = None expected_meta: Dict[str, Any] @@ -27,20 +27,20 @@ class POTopLevelOverriden2(ItemPage): # This first annotation is ignored. A single annotation per registry is allowed -@handle_urls("example.com", overrides=POTopLevelOverriden1) +@handle_urls("example.com", instead_of=POTopLevelOverriden1) @handle_urls( - "example.com", overrides=POTopLevelOverriden1, exclude="/*.jpg|", priority=300 + "example.com", instead_of=POTopLevelOverriden1, exclude="/*.jpg|", priority=300 ) class POTopLevel1(POBase): - expected_overrides = POTopLevelOverriden1 + expected_instead_of = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) expected_to_return = None expected_meta = {} # type: ignore -@handle_urls("example.com", overrides=POTopLevelOverriden2) +@handle_urls("example.com", instead_of=POTopLevelOverriden2) class POTopLevel2(POBase): - expected_overrides = POTopLevelOverriden2 + expected_instead_of = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index a9e0cf2d..941f48f6 100644 --- a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -8,9 +8,9 @@ class POModuleOverriden(ItemPage): ... -@handle_urls("example.com", overrides=POModuleOverriden, extra_arg="foo") +@handle_urls("example.com", instead_of=POModuleOverriden, extra_arg="foo") class POModule(POBase): - expected_overrides = POModuleOverriden + expected_instead_of = POModuleOverriden expected_patterns = Patterns(["example.com"]) expected_to_return = None expected_meta = {"extra_arg": "foo"} # type: ignore diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index 37064cdd..63547ea4 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -11,10 +11,10 @@ class PONestedPkgOverriden(ItemPage): @handle_urls( include=["example.com", "example.org"], exclude=["/*.jpg|"], - overrides=PONestedPkgOverriden, + instead_of=PONestedPkgOverriden, ) class PONestedPkg(POBase): - expected_overrides = PONestedPkgOverriden + expected_instead_of = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index 35c42727..5d44b39c 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -11,10 +11,10 @@ class PONestedModuleOverriden(ItemPage): @handle_urls( include=["example.com", "example.org"], exclude=["/*.jpg|"], - overrides=PONestedModuleOverriden, + instead_of=PONestedModuleOverriden, ) class PONestedModule(POBase): - expected_overrides = PONestedModuleOverriden + expected_instead_of = PONestedModuleOverriden expected_patterns = Patterns( include=["example.com", "example.org"], exclude=["/*.jpg|"] ) diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 6806c70b..4889ef38 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -9,7 +9,7 @@ class POBase(ItemPage): - expected_overrides: Type[ItemPage] + expected_instead_of: Type[ItemPage] expected_patterns: Patterns expected_meta: Dict[str, Any] @@ -18,9 +18,9 @@ class POLibSubOverriden(ItemPage): ... -@handle_urls("sub_example.com", overrides=POLibSubOverriden) +@handle_urls("sub_example.com", instead_of=POLibSubOverriden) class POLibSub(POBase): - expected_overrides = POLibSubOverriden + expected_instead_of = POLibSubOverriden expected_patterns = Patterns(["sub_example.com"]) expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index bdde8104..c6bcd752 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -30,7 +30,7 @@ class ProductLessFields: class ProductPage(ItemPage[Product]): """A base PO to populate the Product item's fields.""" - expected_overrides = None + expected_instead_of = None expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} @@ -44,11 +44,11 @@ def price(self) -> float: return 12.99 -@handle_urls("example.com", overrides=ProductPage) +@handle_urls("example.com", instead_of=ProductPage) class ImprovedProductPage(ProductPage): """A custom PO inheriting from a base PO which alters some field values.""" - expected_overrides = ProductPage + expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} @@ -58,25 +58,25 @@ def name(self) -> str: return "improved name" -@handle_urls("example.com", overrides=ProductPage) +@handle_urls("example.com", instead_of=ProductPage) class SimilarProductPage(ProductPage, Returns[ProductSimilar]): """A custom PO inheriting from a base PO returning the same fields but in a different item class. """ - expected_overrides = ProductPage + expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductSimilar expected_meta = {} -@handle_urls("example.com", overrides=ProductPage) +@handle_urls("example.com", instead_of=ProductPage) class MoreProductPage(ProductPage, Returns[ProductMoreFields]): """A custom PO inheriting from a base PO returning more items using a different item class. """ - expected_overrides = ProductPage + expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductMoreFields expected_meta = {} @@ -86,7 +86,7 @@ def brand(self) -> str: return "brand" -@handle_urls("example.com", overrides=ProductPage) +@handle_urls("example.com", instead_of=ProductPage) class LessProductPage( ProductPage, Returns[ProductLessFields], skip_nonitem_fields=True ): @@ -94,7 +94,7 @@ class LessProductPage( different item class. """ - expected_overrides = ProductPage + expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductLessFields expected_meta = {} @@ -104,7 +104,7 @@ def brand(self) -> str: return "brand" -@handle_urls("example.com", overrides=ProductPage, to_return=ProductSimilar) +@handle_urls("example.com", instead_of=ProductPage, to_return=ProductSimilar) class CustomProductPage(ProductPage, Returns[Product]): """A custom PO inheriting from a base PO returning the same fields but in a different item class. @@ -116,19 +116,19 @@ class CustomProductPage(ProductPage, Returns[Product]): the ``to_return`` parameter from ``@handle_urls`` is ignored. """ - expected_overrides = ProductPage + expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} -@handle_urls("example.com", overrides=ProductPage, to_return=ProductSimilar) +@handle_urls("example.com", instead_of=ProductPage, to_return=ProductSimilar) class CustomProductPageNoReturns(ProductPage): """Same case as with ``CustomProductPage`` but doesn't inherit from ``Returns[Product]``. """ - expected_overrides = ProductPage + expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} @@ -143,7 +143,7 @@ class CustomProductPageDataTypeOnly(Injectable): should properly use it in the rules. """ - expected_overrides = None + expected_instead_of = None expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 51491df6..58fac137 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,3 +1,6 @@ +import inspect +import warnings + import pytest from url_matcher import Patterns @@ -18,7 +21,13 @@ ProductSimilar, SimilarProductPage, ) -from web_poet import OverrideRule, PageObjectRegistry, consume_modules, default_registry +from web_poet import ( + OverrideRule, + PageObjectRegistry, + consume_modules, + default_registry, + handle_urls, +) POS = { POTopLevel1, @@ -96,7 +105,7 @@ def test_list_page_objects_all() -> None: for rule in rules: # We're ignoring the types below since mypy expects ``Type[ItemPage]`` # which doesn't contain the ``expected_*`` fields in our tests. - assert rule.instead_of == rule.use.expected_overrides, rule.use # type: ignore[attr-defined] + assert rule.instead_of == rule.use.expected_instead_of, rule.use # type: ignore[attr-defined] assert rule.for_patterns == rule.use.expected_patterns, rule.use # type: ignore[attr-defined] assert rule.to_return == rule.use.expected_to_return, rule.use # type: ignore[attr-defined] assert rule.meta == rule.use.expected_meta, rule.use # type: ignore[attr-defined] @@ -144,3 +153,20 @@ def test_from_override_rules() -> None: assert registry.get_overrides() == rules assert default_registry.get_overrides() != rules + + +def test_handle_urls_deprecation() -> None: + + with warnings.catch_warnings(record=True) as w: + + @handle_urls("example.com", overrides=CustomProductPage) + class PageWithDeprecatedOverrides: + ... + + w = [x for x in w if x.category is DeprecationWarning] + assert len(w) == 1 + assert str(w[0].message) == ( + "The 'overrides' parameter in @handle_urls is deprecated. Use the " + "'instead_of' parameter." + ) + assert w[0].lineno == inspect.getsourcelines(PageWithDeprecatedOverrides)[1] diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py index 1c194513..e803b29f 100644 --- a/tests_extra/po_lib_sub_not_imported/__init__.py +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -12,7 +12,7 @@ class POBase: - expected_overrides: Type[ItemPage] + expected_instead_of: Type[ItemPage] expected_patterns: Patterns expected_meta: Dict[str, Any] @@ -21,9 +21,9 @@ class POLibSubOverridenNotImported: ... -@handle_urls("sub_example_not_imported.com", overrides=POLibSubOverridenNotImported) +@handle_urls("sub_example_not_imported.com", instead_of=POLibSubOverridenNotImported) class POLibSubNotImported(POBase): - expected_overrides = POLibSubOverridenNotImported + expected_instead_of = POLibSubOverridenNotImported expected_patterns = Patterns(["sub_example_not_imported.com"]) expected_to_return = None expected_meta = {} # type: ignore diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 30ae1ee3..40cabaf6 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -75,7 +75,7 @@ class PageObjectRegistry(dict): from web_poet import handle_urls, default_registry, WebPage - @handle_urls("example.com", overrides=ProductPageObject) + @handle_urls("example.com", instead_of=ProductPageObject) class ExampleComProductPage(WebPage): ... @@ -116,6 +116,7 @@ def handle_urls( include: Strings, *, overrides: Optional[Type[ItemPage]] = None, + instead_of: Optional[Type[ItemPage]] = None, to_return: Optional[Type] = None, exclude: Optional[Strings] = None, priority: int = 500, @@ -125,7 +126,7 @@ def handle_urls( Class decorator that indicates that the decorated Page Object should be used instead of the overridden one for a particular set the URLs. - The Page Object that is **overridden** is declared using the ``overrides`` + The Page Object that is **overridden** is declared using the ``instead_of`` parameter. The **override** mechanism only works on certain URLs that match the @@ -136,7 +137,7 @@ def handle_urls( Any extra parameters are stored as meta information that can be later used. :param include: The URLs that should be handled by the decorated Page Object. - :param overrides: The Page Object that should be `replaced`. + :param instead_of: The Page Object that should be `replaced`. :param to_return: The Item Class holding the data returned by the Page Object. This parameter is ignored if the Page Object has declared a data type using :class:`~.Returns` or :class:`~.ItemPage` @@ -154,6 +155,13 @@ def wrapper(cls): if derived_type and derived_type != ItemT: final_data_type = derived_type + if overrides is not None: + msg = ( + "The 'overrides' parameter in @handle_urls is deprecated. " + "Use the 'instead_of' parameter." + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + rule = OverrideRule( for_patterns=Patterns( include=as_list(include), @@ -161,7 +169,7 @@ def wrapper(cls): priority=priority, ), use=cls, - instead_of=overrides, + instead_of=instead_of, to_return=final_data_type, meta=kwargs, ) @@ -170,7 +178,7 @@ def wrapper(cls): self[cls] = rule else: warnings.warn( - f"Multiple @handle_urls annotations with the same 'overrides' " + f"Multiple @handle_urls annotations with the same 'instead_of' " f"are ignored in the same Registry. The following rule is " f"ignored:\n{rule}", stacklevel=2, From 904dd8c90603324a57ac344b6e13630e9fb8e1f2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 16:22:15 +0800 Subject: [PATCH 06/46] rename and deprecate OverrideRule into ApplyRule --- docs/intro/overrides.rst | 116 +++++++++++++++++++-------------------- tests/test_overrides.py | 22 ++++++-- web_poet/__init__.py | 2 +- web_poet/overrides.py | 35 ++++++------ 4 files changed, 94 insertions(+), 81 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 056e617e..7b7d2448 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -109,21 +109,21 @@ Retrieving all available Overrides ---------------------------------- The :meth:`~.PageObjectRegistry.get_overrides` method from the ``web_poet.default_registry`` -allows retrieval of all :class:`~.OverrideRule` in the given registry. +allows retrieval of all :class:`~.ApplyRule` in the given registry. Following from our example above, using it would be: .. code-block:: python from web_poet import default_registry - # Retrieves all OverrideRules that were registered in the registry + # Retrieves all ApplyRules that were registered in the registry rules = default_registry.get_overrides() print(len(rules)) # 3 - print(rules[0]) # OverrideRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + print(rules[0]) # ApplyRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) Remember that using ``@handle_urls`` to annotate the Page Objects would result -in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. +in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. .. warning:: @@ -135,7 +135,7 @@ in the :class:`~.OverrideRule` to be written into ``web_poet.default_registry``. Thus, for cases like importing and using Page Objects from other external packages, the ``@handle_urls`` annotations from these external sources must be read and processed properly. This ensures that the external Page Objects have all of their - :class:`~.OverrideRule` present. + :class:`~.ApplyRule` present. This can be done via the function named :func:`~.web_poet.overrides.consume_modules`. Here's an example: @@ -154,12 +154,12 @@ Using Overrides from External Packages -------------------------------------- Developers have the option to import existing Page Objects alongside the -:class:`~.OverrideRule` attached to them. This section aims to showcase different +:class:`~.ApplyRule` attached to them. This section aims to showcase different scenarios that come up when using multiple Page Object Projects. .. _`intro-rule-all`: -Using all available OverrideRules from multiple Page Object Projects +Using all available ApplyRules from multiple Page Object Projects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Let's suppose we have the following use case before us: @@ -174,7 +174,7 @@ Let's suppose we have the following use case before us: - Thus, you'd want to use the already available packages above and perhaps improve on them or create new Page Objects for new websites. -Remember that all of the :class:`~.OverrideRule` are declared by annotating +Remember that all of the :class:`~.ApplyRule` are declared by annotating Page Objects using the :func:`web_poet.handle_urls` via ``@handle_urls``. Thus, they can easily be accessed using the :meth:`~.PageObjectRegistry.get_overrides` of ``web_poet.default_registry``. @@ -197,10 +197,10 @@ This can be done something like: # The collected rules would then be as follows: print(rules) - # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) .. note:: @@ -211,11 +211,11 @@ This can be done something like: .. _`intro-rule-subset`: -Using only a subset of the available OverrideRules +Using only a subset of the available ApplyRules ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose that the use case from the previous section has changed wherein a -subset of :class:`~.OverrideRule` would be used. This could be achieved by +subset of :class:`~.ApplyRule` would be used. This could be achieved by using the :meth:`~.PageObjectRegistry.search_overrides` method which allows for convenient selection of a subset of rules from a given registry. @@ -231,22 +231,22 @@ Here's an example of how you could manually select the rules using the ecom_rules = default_registry.search_overrides(instead_of=ecommerce_page_objects.EcomGenericPage) print(ecom_rules) - # OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) gadget_rules = default_registry.search_overrides(use=gadget_sites_page_objects.site_3.GadgetSite3) print(gadget_rules) - # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) rules = ecom_rules + gadget_rules print(rules) - # OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) As you can see, using the :meth:`~.PageObjectRegistry.search_overrides` method allows you to -conveniently select for :class:`~.OverrideRule` which conform to a specific criteria. This -allows you to conveniently drill down to which :class:`~.OverrideRule` you're interested in +conveniently select for :class:`~.ApplyRule` which conform to a specific criteria. This +allows you to conveniently drill down to which :class:`~.ApplyRule` you're interested in using. .. _`overrides-custom-registry`: @@ -268,11 +268,11 @@ for this: Improving on external Page Objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There would be cases wherein you're using Page Objects with :class:`~.OverrideRule` +There would be cases wherein you're using Page Objects with :class:`~.ApplyRule` from external packages only to find out that a few of them lacks some of the fields or features that you need. -Let's suppose that we wanted to use `all` of the :class:`~.OverrideRule` similar +Let's suppose that we wanted to use `all` of the :class:`~.ApplyRule` similar to this section: :ref:`intro-rule-all`. However, the ``EcomSite1`` Page Object needs to properly handle some edge cases where some fields are not being extracted properly. One way to fix this is to subclass the said Page Object and improve its @@ -289,10 +289,10 @@ have the first approach as an example: # The collected rules would then be as follows: print(rules) - # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) @handle_urls("site_1.com", instead_of=ecommerce_page_objects.EcomGenericPage, priority=1000) class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): @@ -301,13 +301,13 @@ have the first approach as an example: rules = default_registry.get_overrides() print(rules) - # 1. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 4. OverrideRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 5. OverrideRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=1000), use=, instead_of=, meta={}) + # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 5. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=1000), use=, instead_of=, to_return=None, meta={}) -Notice that we're adding a new :class:`~.OverrideRule` for the same URL pattern +Notice that we're adding a new :class:`~.ApplyRule` for the same URL pattern for ``site_1.com``. When the time comes that a Page Object needs to be selected when parsing ``site_1.com`` @@ -324,7 +324,7 @@ Handling conflicts from using Multiple External Packages -------------------------------------------------------- You might've observed from the previous section that retrieving the list of all -:class:`~.OverrideRule` from two different external packages may result in a +:class:`~.ApplyRule` from two different external packages may result in a conflict. We can take a look at the rules for **#2** and **#3** when we were importing all @@ -332,8 +332,8 @@ available rules: .. code-block:: python - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) However, it's technically **NOT** a `conflict`, **yet**, since: @@ -346,14 +346,14 @@ However, it's technically **NOT** a `conflict`, **yet**, since: It would be only become a conflict if both rules for **site_2.com** `intend to replace the` **same** `Page Object`. -However, let's suppose that there are some :class:`~.OverrideRule` which actually +However, let's suppose that there are some :class:`~.ApplyRule` which actually result in a conflict. To give an example, let's suppose that rules **#2** and **#3** `intends to replace the` **same** `Page Object`. It would look something like: .. code-block:: python - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) Notice that the ``instead_of`` param are the same and only the ``use`` param remained different. @@ -364,7 +364,7 @@ There are two main ways we recommend in solving this. **1. Priority Resolution** -If you notice, the ``for_patterns`` attribute of :class:`~.OverrideRule` is an +If you notice, the ``for_patterns`` attribute of :class:`~.ApplyRule` is an instance of `url_matcher.Patterns `_. This instance also has a ``priority`` param where a higher value will be chosen @@ -378,15 +378,15 @@ in times of conflict. Unfortunately, updating the ``priority`` value directly isn't possible as the :class:`url_matcher.Patterns` is a **frozen** `dataclass`. The same is true for -:class:`~.OverrideRule`. This is made by design so that they are hashable and could +:class:`~.ApplyRule`. This is made by design so that they are hashable and could be deduplicated immediately without consequences of them changing in value. The only way that the ``priority`` value can be changed is by creating a new -:class:`~.OverrideRule` with a different ``priority`` value (`higher if it needs +:class:`~.ApplyRule` with a different ``priority`` value (`higher if it needs more priority`). You don't necessarily need to `delete` the **old** -:class:`~.OverrideRule` since they will be resolved via ``priority`` anyways. +:class:`~.ApplyRule` since they will be resolved via ``priority`` anyways. -Creating a new :class:`~.OverrideRule` with a higher priority could be as easy as: +Creating a new :class:`~.ApplyRule` with a higher priority could be as easy as: 1. Subclassing the Page Object in question. 2. Create a new :func:`web_poet.handle_urls` annotation with the same URL @@ -405,14 +405,14 @@ Here's an example: return super().to_item() Now, the conflicting **#2** and **#3** rules would never be selected because of -the new :class:`~.OverrideRule` having a much higher priority (see rule **#4**): +the new :class:`~.ApplyRule` having a much higher priority (see rule **#4**): .. code-block:: python - # 2. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - # 3. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) - # 4. OverrideRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=1000), use=, instead_of=, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=1000), use=, instead_of=, to_return=None, meta={}) A similar idea was also discussed in the :ref:`intro-improve-po` section. @@ -420,7 +420,7 @@ A similar idea was also discussed in the :ref:`intro-improve-po` section. **2. Specifically Selecting the Rules** When the last resort of ``priority``-resolution doesn't work, then you could always -specifically select the list of :class:`~.OverrideRule` you want to use. +specifically select the list of :class:`~.ApplyRule` you want to use. We **recommend** in creating an **inclusion**-list rather than an **exclusion**-list since the latter is quite brittle. For instance, an external package you're using @@ -429,8 +429,8 @@ were recently added. This could lead to a `silent-error` of receiving a differen set of rules than expected. This **inclusion**-list approach can be done by importing the Page Objects directly -and creating instances of :class:`~.OverrideRule` from it. You could also import -all of the available :class:`~.OverrideRule` using :meth:`~.PageObjectRegistry.get_overrides` +and creating instances of :class:`~.ApplyRule` from it. You could also import +all of the available :class:`~.ApplyRule` using :meth:`~.PageObjectRegistry.get_overrides` to sift through the list of available rules and manually selecting the rules you need. Most of the time, the needed rules are the ones which uses the Page Objects we're @@ -445,9 +445,9 @@ easily find the Page Object's rule using its `key`. Here's an example: consume_modules("package_A", "package_B", "package_C") rules = [ - default_registry[package_A.PageObject1], # OverrideRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - default_registry[package_B.PageObject2], # OverrideRule(for_patterns=Patterns(include=['site_B.com'], exclude=[], priority=500), use=, instead_of=, meta={}) - default_registry[package_C.PageObject3], # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=, instead_of=, meta={}) + default_registry[package_A.PageObject1], # ApplyRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + default_registry[package_B.PageObject2], # ApplyRule(for_patterns=Patterns(include=['site_B.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) + default_registry[package_C.PageObject3], # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) ] Another approach would be using the :meth:`~.PageObjectRegistry.search_overrides` @@ -468,7 +468,7 @@ Here's an example: rule_from_A = default_registry.search_overrides(use=package_A.PageObject1) print(rule_from_A) - # [OverrideRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=, instead_of=, meta={})] + # [ApplyRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={})] rule_from_B = default_registry.search_overrides(instead_of=GenericProductPage) print(rule_from_B) @@ -477,8 +477,8 @@ Here's an example: rule_from_C = default_registry.search_overrides(for_patterns=Patterns(include=["site_C.com"])) print(rule_from_C) # [ - # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=, instead_of=, meta={}), - # OverrideRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=1000), use=, instead_of=, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}), + # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=1000), use=, instead_of=, to_return=None, meta={}) # ] rules = rule_from_A + rule_from_B + rule_from_C diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 58fac137..38b8fc12 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -22,6 +22,7 @@ SimilarProductPage, ) from web_poet import ( + ApplyRule, OverrideRule, PageObjectRegistry, consume_modules, @@ -47,19 +48,19 @@ def test_override_rule_uniqueness() -> None: - """The same instance of an OverrideRule with the same attribute values should + """The same instance of an ApplyRule with the same attribute values should have the same hash identity. """ patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) - rule1 = OverrideRule( + rule1 = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden2, meta={"key_1": 1}, ) - rule2 = OverrideRule( + rule2 = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden2, @@ -68,13 +69,13 @@ def test_override_rule_uniqueness() -> None: # The ``meta`` parameter is ignored in the hash. assert hash(rule1) == hash(rule2) - rule1 = OverrideRule( + rule1 = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden2, to_return=Product, ) - rule2 = OverrideRule( + rule2 = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden2, @@ -142,7 +143,7 @@ def test_registry_search_overrides() -> None: def test_from_override_rules() -> None: rules = [ - OverrideRule( + ApplyRule( for_patterns=Patterns(include=["sample.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, @@ -170,3 +171,12 @@ class PageWithDeprecatedOverrides: "'instead_of' parameter." ) assert w[0].lineno == inspect.getsourcelines(PageWithDeprecatedOverrides)[1] + + +def test_override_rule_deprecation() -> None: + msg = ( + "web_poet.overrides.OverrideRule is deprecated, " + "instantiate web_poet.overrides.ApplyRule instead." + ) + with pytest.warns(DeprecationWarning, match=msg): + OverrideRule(for_patterns=None, use=None) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 0d7fb36e..64ce6c46 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,5 +1,5 @@ from .fields import field, item_from_fields, item_from_fields_sync -from .overrides import OverrideRule, PageObjectRegistry, consume_modules +from .overrides import ApplyRule, OverrideRule, PageObjectRegistry, consume_modules from .page_inputs import ( BrowserHtml, HttpClient, diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 40cabaf6..0e76c375 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -13,7 +13,7 @@ from web_poet._typing import get_generic_parameter from web_poet.pages import ItemPage, ItemT -from web_poet.utils import as_list +from web_poet.utils import _create_deprecated_class, as_list Strings = Union[str, Iterable[str]] @@ -21,12 +21,12 @@ @dataclass(frozen=True) -class OverrideRule: +class ApplyRule: """A single override rule that specifies when a Page Object should be used in lieu of another. This is instantiated when using the :func:`web_poet.handle_urls` decorator. - It's also being returned as a ``List[OverrideRule]`` when calling the + It's also being returned as a ``List[ApplyRule]`` when calling the ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_overrides` method. @@ -44,7 +44,7 @@ class OverrideRule: .. tip:: - The :class:`~.OverrideRule` is also hashable. This makes it easy to store + The :class:`~.ApplyRule` is also hashable. This makes it easy to store unique rules and identify any duplicates. """ @@ -63,8 +63,8 @@ class PageObjectRegistry(dict): for a given URL matching rule. Note that it's simply a ``dict`` subclass with added functionalities on - storing, retrieving, and searching for the :class:`~.OverrideRule` instances. - The **value** represents the :class:`~.OverrideRule` instance from which the + storing, retrieving, and searching for the :class:`~.ApplyRule` instances. + The **value** represents the :class:`~.ApplyRule` instance from which the Page Object in the **key** is allowed to be used. Since it's essentially a ``dict``, you can use any ``dict`` operations with it. @@ -101,10 +101,10 @@ class ExampleComProductPage(WebPage): @classmethod def from_override_rules( - cls: Type[PageObjectRegistryTV], rules: List[OverrideRule] + cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] ) -> PageObjectRegistryTV: """An alternative constructor for creating a :class:`~.PageObjectRegistry` - instance by accepting a list of :class:`~.OverrideRule`. + instance by accepting a list of :class:`~.ApplyRule`. This is useful in cases wherein you need to store some selected rules from multiple external packages. @@ -162,7 +162,7 @@ def wrapper(cls): ) warnings.warn(msg, DeprecationWarning, stacklevel=2) - rule = OverrideRule( + rule = ApplyRule( for_patterns=Patterns( include=as_list(include), exclude=as_list(exclude), @@ -188,8 +188,8 @@ def wrapper(cls): return wrapper - def get_overrides(self) -> List[OverrideRule]: - """Returns all of the :class:`~.OverrideRule` that were declared using + def get_overrides(self) -> List[ApplyRule]: + """Returns all of the :class:`~.ApplyRule` that were declared using the ``@handle_urls`` annotation. .. warning:: @@ -200,8 +200,8 @@ def get_overrides(self) -> List[OverrideRule]: """ return list(self.values()) - def search_overrides(self, **kwargs) -> List[OverrideRule]: - """Returns any :class:`OverrideRule` that has any of its attributes + def search_overrides(self, **kwargs) -> List[ApplyRule]: + """Returns any :class:`ApplyRule` that has any of its attributes match the rules inside the registry. Sample usage: @@ -223,7 +223,7 @@ def search_overrides(self, **kwargs) -> List[OverrideRule]: getter = attrgetter(*kwargs.keys()) - def matcher(rule: OverrideRule): + def matcher(rule: ApplyRule): attribs = getter(rule) if not isinstance(attribs, tuple): attribs = (attribs,) @@ -276,7 +276,7 @@ def consume_modules(*modules: str) -> None: consume_modules("other_external_pkg.po", "another_pkg.lib") rules = default_registry.get_overrides() - For this case, the :class:`~.OverrideRule` are coming from: + For this case, the :class:`~.ApplyRule` are coming from: - ``my_page_obj_project`` `(since it's the same module as the file above)` - ``other_external_pkg.po`` @@ -286,7 +286,7 @@ def consume_modules(*modules: str) -> None: If the ``default_registry`` had other ``@handle_urls`` annotations outside of the packages/modules listed above, then the corresponding - :class:`~.OverrideRule` won't be returned. Unless, they were recursively + :class:`~.ApplyRule` won't be returned. Unless, they were recursively imported in some way similar to :func:`~.web_poet.overrides.consume_modules`. """ @@ -296,3 +296,6 @@ def consume_modules(*modules: str) -> None: # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html # Using a deque() results in a tiny bit performance improvement that list(). deque(gen, maxlen=0) + + +OverrideRule = _create_deprecated_class("OverrideRule", ApplyRule, warn_once=False) From 03f5656163a195e968f8c190f5f52010a22eafc7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 16:37:43 +0800 Subject: [PATCH 07/46] fix tests --- tests/test_overrides.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 38b8fc12..0effc696 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -1,4 +1,3 @@ -import inspect import warnings import pytest @@ -170,7 +169,6 @@ class PageWithDeprecatedOverrides: "The 'overrides' parameter in @handle_urls is deprecated. Use the " "'instead_of' parameter." ) - assert w[0].lineno == inspect.getsourcelines(PageWithDeprecatedOverrides)[1] def test_override_rule_deprecation() -> None: From 3ed8415d4aa780e4147c1a4fa90fcd9ac3c3e609 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 16:49:24 +0800 Subject: [PATCH 08/46] update CHANGELOG with regards to ApplyRule and @handle_urls changes --- CHANGELOG.rst | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ab3e8d51..b6e539a4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,30 @@ Changelog ========= +TBD +--- + +* New ``ApplyRule`` class created by the ``@handle_urls`` decorator. + This is nearly identical with ``OverrideRule`` except that it's accepting + a ``to_return`` parameter which signifies the data container class that + the Page Object returns. + +* Modify the call signature of ``handle_urls``: + + * New ``to_return`` parameter which signifies the data container class that + the Page Object returns. This behaves exactly the same with ``overrides`` + but it's more consistent with the attribute names of ``ApplyRules`` that + it creates. + * New ``instead_of`` parameter which does the same thing with ``overrides``. + * The old ``overrides`` parameter is not required anymore as it's set for + deprecation. + +Deprecations: + +* The ``overrides`` parameter from ``@handle_urls`` is now deprecated. + Use the ``instead_of`` parameter instead. +* ``OverrideRule`` is now deprecated. Use ``ApplyRule`` instead. + 0.5.1 (2022-09-23) ------------------ From 3a5499c2d5e548d1b4d6225a378e634c79819e4a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 17:21:14 +0800 Subject: [PATCH 09/46] create from_apply_rules method in PageObjectRegistry; deprecate from_override_rules --- CHANGELOG.rst | 2 ++ docs/intro/overrides.rst | 4 ++-- tests/test_overrides.py | 26 +++++++++++++++++++++++--- web_poet/overrides.py | 14 +++++++++++++- 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b6e539a4..5ecd2cc6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -25,6 +25,8 @@ Deprecations: * The ``overrides`` parameter from ``@handle_urls`` is now deprecated. Use the ``instead_of`` parameter instead. * ``OverrideRule`` is now deprecated. Use ``ApplyRule`` instead. +* The ``from_override_rules`` method of ``PageObjectRegistry`` is now deprecated. + Use ``from_apply_rules`` instead. 0.5.1 (2022-09-23) ------------------ diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 7b7d2448..d2af5cd9 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -253,14 +253,14 @@ using. After gathering all the pre-selected rules, we can then store it in a new instance of :class:`~.PageObjectRegistry` in order to separate it from the ``default_registry`` -which contains all of the rules. We can use the :meth:`~.PageObjectRegistry.from_override_rules` +which contains all of the rules. We can use the :meth:`~.PageObjectRegistry.from_apply_rules` for this: .. code-block:: python from web_poet import PageObjectRegistry - my_new_registry = PageObjectRegistry.from_override_rules(rules) + my_new_registry = PageObjectRegistry.from_apply_rules(rules) .. _`intro-improve-po`: diff --git a/tests/test_overrides.py b/tests/test_overrides.py index 0effc696..a8d010ec 100644 --- a/tests/test_overrides.py +++ b/tests/test_overrides.py @@ -46,7 +46,7 @@ } -def test_override_rule_uniqueness() -> None: +def test_apply_rule_uniqueness() -> None: """The same instance of an ApplyRule with the same attribute values should have the same hash identity. """ @@ -140,7 +140,7 @@ def test_registry_search_overrides() -> None: assert len(rules) == 0 -def test_from_override_rules() -> None: +def test_from_apply_rules() -> None: rules = [ ApplyRule( for_patterns=Patterns(include=["sample.com"]), @@ -149,7 +149,27 @@ def test_from_override_rules() -> None: ) ] - registry = PageObjectRegistry.from_override_rules(rules) + registry = PageObjectRegistry.from_apply_rules(rules) + + assert registry.get_overrides() == rules + assert default_registry.get_overrides() != rules + + +def test_from_override_rules_deprecation() -> None: + rules = [ + ApplyRule( + for_patterns=Patterns(include=["sample.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + ) + ] + + msg = ( + "The 'from_override_rules' method is deprecated. " + "Use 'from_apply_rules' instead." + ) + with pytest.warns(DeprecationWarning, match=msg): + registry = PageObjectRegistry.from_override_rules(rules) assert registry.get_overrides() == rules assert default_registry.get_overrides() != rules diff --git a/web_poet/overrides.py b/web_poet/overrides.py index 0e76c375..bf699bbb 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -100,7 +100,7 @@ class ExampleComProductPage(WebPage): """ @classmethod - def from_override_rules( + def from_apply_rules( cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] ) -> PageObjectRegistryTV: """An alternative constructor for creating a :class:`~.PageObjectRegistry` @@ -111,6 +111,18 @@ def from_override_rules( """ return cls({rule.use: rule for rule in rules}) + @classmethod + def from_override_rules( + cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] + ) -> PageObjectRegistryTV: + """Deprecated. Use :meth:`~.PageObjectRegistry.from_apply_rules` instead.""" + msg = ( + "The 'from_override_rules' method is deprecated. " + "Use 'from_apply_rules' instead." + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + return cls.from_apply_rules(rules) + def handle_urls( self, include: Strings, From 01112e7e316ea2756a8692b752d757aeae280f23 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 20:42:53 +0800 Subject: [PATCH 10/46] rename 'web_poet.overrides' into 'web_poet.rules' --- CHANGELOG.rst | 1 + docs/api-reference.rst | 2 +- tests/{test_overrides.py => test_rules.py} | 4 +- web_poet/__init__.py | 2 +- web_poet/overrides.py | 313 +-------------------- web_poet/rules.py | 313 +++++++++++++++++++++ 6 files changed, 320 insertions(+), 315 deletions(-) rename tests/{test_overrides.py => test_rules.py} (98%) create mode 100644 web_poet/rules.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5ecd2cc6..5b6c3be0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -27,6 +27,7 @@ Deprecations: * ``OverrideRule`` is now deprecated. Use ``ApplyRule`` instead. * The ``from_override_rules`` method of ``PageObjectRegistry`` is now deprecated. Use ``from_apply_rules`` instead. +* The ``web_poet.overrides`` is deprecated. Use ``web_poet.rules`` instead. 0.5.1 (2022-09-23) ------------------ diff --git a/docs/api-reference.rst b/docs/api-reference.rst index 2f931e86..02c141d0 100644 --- a/docs/api-reference.rst +++ b/docs/api-reference.rst @@ -91,7 +91,7 @@ use cases and some examples. .. autofunction:: web_poet.handle_urls -.. automodule:: web_poet.overrides +.. automodule:: web_poet.rules :members: :exclude-members: handle_urls diff --git a/tests/test_overrides.py b/tests/test_rules.py similarity index 98% rename from tests/test_overrides.py rename to tests/test_rules.py index a8d010ec..8597e064 100644 --- a/tests/test_overrides.py +++ b/tests/test_rules.py @@ -193,8 +193,8 @@ class PageWithDeprecatedOverrides: def test_override_rule_deprecation() -> None: msg = ( - "web_poet.overrides.OverrideRule is deprecated, " - "instantiate web_poet.overrides.ApplyRule instead." + "web_poet.rules.OverrideRule is deprecated, " + "instantiate web_poet.rules.ApplyRule instead." ) with pytest.warns(DeprecationWarning, match=msg): OverrideRule(for_patterns=None, use=None) diff --git a/web_poet/__init__.py b/web_poet/__init__.py index 64ce6c46..dbb3afc2 100644 --- a/web_poet/__init__.py +++ b/web_poet/__init__.py @@ -1,5 +1,4 @@ from .fields import field, item_from_fields, item_from_fields_sync -from .overrides import ApplyRule, OverrideRule, PageObjectRegistry, consume_modules from .page_inputs import ( BrowserHtml, HttpClient, @@ -15,6 +14,7 @@ ) from .pages import Injectable, ItemPage, ItemWebPage, Returns, WebPage from .requests import request_downloader_var +from .rules import ApplyRule, OverrideRule, PageObjectRegistry, consume_modules from .utils import cached_method default_registry = PageObjectRegistry() diff --git a/web_poet/overrides.py b/web_poet/overrides.py index bf699bbb..a37db47e 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -1,313 +1,4 @@ -from __future__ import annotations # https://www.python.org/dev/peps/pep-0563/ - -import importlib -import importlib.util -import pkgutil import warnings -from collections import deque -from dataclasses import dataclass, field -from operator import attrgetter -from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar, Union - -from url_matcher import Patterns - -from web_poet._typing import get_generic_parameter -from web_poet.pages import ItemPage, ItemT -from web_poet.utils import _create_deprecated_class, as_list - -Strings = Union[str, Iterable[str]] - -PageObjectRegistryTV = TypeVar("PageObjectRegistryTV", bound="PageObjectRegistry") - - -@dataclass(frozen=True) -class ApplyRule: - """A single override rule that specifies when a Page Object should be used - in lieu of another. - - This is instantiated when using the :func:`web_poet.handle_urls` decorator. - It's also being returned as a ``List[ApplyRule]`` when calling the - ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_overrides` - method. - - You can access any of its attributes: - - * ``for_patterns`` - contains the list of URL patterns associated with - this rule. You can read the API documentation of the `url-matcher - `_ package for more information - about the patterns. - * ``use`` - The Page Object that will be **used**. - * ``instead_of`` - The Page Object that will be **replaced**. - * ``to_return`` - The data container class that the Page Object returns. - * ``meta`` - Any other information you may want to store. This doesn't - do anything for now but may be useful for future API updates. - - .. tip:: - - The :class:`~.ApplyRule` is also hashable. This makes it easy to store - unique rules and identify any duplicates. - """ - - for_patterns: Patterns - use: Type[ItemPage] - instead_of: Optional[Type[ItemPage]] = None - to_return: Optional[Type[Any]] = None - meta: Dict[str, Any] = field(default_factory=dict) - - def __hash__(self): - return hash((self.for_patterns, self.use, self.instead_of, self.to_return)) - - -class PageObjectRegistry(dict): - """This contains the mapping rules that associates the Page Objects available - for a given URL matching rule. - - Note that it's simply a ``dict`` subclass with added functionalities on - storing, retrieving, and searching for the :class:`~.ApplyRule` instances. - The **value** represents the :class:`~.ApplyRule` instance from which the - Page Object in the **key** is allowed to be used. Since it's essentially a - ``dict``, you can use any ``dict`` operations with it. - - ``web-poet`` already provides a default Registry named ``default_registry`` - for convenience. It can be directly accessed via: - - .. code-block:: python - - from web_poet import handle_urls, default_registry, WebPage - - @handle_urls("example.com", instead_of=ProductPageObject) - class ExampleComProductPage(WebPage): - ... - - override_rules = default_registry.get_overrides() - - Notice that the ``@handle_urls`` that we're using is a part of the - ``default_registry``. This provides a shorter and quicker way to interact - with the built-in default :class:`~.PageObjectRegistry` instead of writing - the longer ``@default_registry.handle_urls``. - - .. note:: - - It is encouraged to simply use and import the already existing registry - via ``from web_poet import default_registry`` instead of creating your - own :class:`~.PageObjectRegistry` instance. Using multiple registries - would be unwieldy in most cases. - - However, it might be applicable in certain scenarios like storing custom - rules to separate it from the ``default_registry``. This :ref:`example - ` from the tutorial section may provide some - context. - """ - - @classmethod - def from_apply_rules( - cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] - ) -> PageObjectRegistryTV: - """An alternative constructor for creating a :class:`~.PageObjectRegistry` - instance by accepting a list of :class:`~.ApplyRule`. - - This is useful in cases wherein you need to store some selected rules - from multiple external packages. - """ - return cls({rule.use: rule for rule in rules}) - - @classmethod - def from_override_rules( - cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] - ) -> PageObjectRegistryTV: - """Deprecated. Use :meth:`~.PageObjectRegistry.from_apply_rules` instead.""" - msg = ( - "The 'from_override_rules' method is deprecated. " - "Use 'from_apply_rules' instead." - ) - warnings.warn(msg, DeprecationWarning, stacklevel=2) - return cls.from_apply_rules(rules) - - def handle_urls( - self, - include: Strings, - *, - overrides: Optional[Type[ItemPage]] = None, - instead_of: Optional[Type[ItemPage]] = None, - to_return: Optional[Type] = None, - exclude: Optional[Strings] = None, - priority: int = 500, - **kwargs, - ): - """ - Class decorator that indicates that the decorated Page Object should be - used instead of the overridden one for a particular set the URLs. - - The Page Object that is **overridden** is declared using the ``instead_of`` - parameter. - - The **override** mechanism only works on certain URLs that match the - ``include`` and ``exclude`` parameters. See the documentation of the - `url-matcher `_ package for more - information about them. - - Any extra parameters are stored as meta information that can be later used. - - :param include: The URLs that should be handled by the decorated Page Object. - :param instead_of: The Page Object that should be `replaced`. - :param to_return: The Item Class holding the data returned by the Page Object. - This parameter is ignored if the Page Object has declared a data type - using :class:`~.Returns` or :class:`~.ItemPage` - (e.g. ``class ProductPageObject(ItemPage[ProductItem])``). - :param exclude: The URLs over which the override should **not** happen. - :param priority: The resolution priority in case of `conflicting` rules. - A conflict happens when the ``include``, ``override``, and ``exclude`` - parameters are the same. If so, the `highest priority` will be - chosen. - """ - - def wrapper(cls): - final_data_type = to_return - derived_type = get_generic_parameter(cls) - if derived_type and derived_type != ItemT: - final_data_type = derived_type - - if overrides is not None: - msg = ( - "The 'overrides' parameter in @handle_urls is deprecated. " - "Use the 'instead_of' parameter." - ) - warnings.warn(msg, DeprecationWarning, stacklevel=2) - - rule = ApplyRule( - for_patterns=Patterns( - include=as_list(include), - exclude=as_list(exclude), - priority=priority, - ), - use=cls, - instead_of=instead_of, - to_return=final_data_type, - meta=kwargs, - ) - # If it was already defined, we don't want to override it - if cls not in self: - self[cls] = rule - else: - warnings.warn( - f"Multiple @handle_urls annotations with the same 'instead_of' " - f"are ignored in the same Registry. The following rule is " - f"ignored:\n{rule}", - stacklevel=2, - ) - - return cls - - return wrapper - - def get_overrides(self) -> List[ApplyRule]: - """Returns all of the :class:`~.ApplyRule` that were declared using - the ``@handle_urls`` annotation. - - .. warning:: - - Remember to consider calling :func:`~.web_poet.overrides.consume_modules` - beforehand to recursively import all submodules which contains the - ``@handle_urls`` annotations from external Page Objects. - """ - return list(self.values()) - - def search_overrides(self, **kwargs) -> List[ApplyRule]: - """Returns any :class:`ApplyRule` that has any of its attributes - match the rules inside the registry. - - Sample usage: - - .. code-block:: python - - rules = registry.search_overrides(use=ProductPO, instead_of=GenericPO) - print(len(rules)) # 1 - - """ - - # Short-circuit operation if "use" is the only search param used, since - # we know that it's being used as the dict key. - if {"use"} == kwargs.keys(): - rule = self.get(kwargs["use"]) - if rule: - return [rule] - return [] - - getter = attrgetter(*kwargs.keys()) - - def matcher(rule: ApplyRule): - attribs = getter(rule) - if not isinstance(attribs, tuple): - attribs = (attribs,) - if attribs == tuple(kwargs.values()): - return True - return False - - results = [] - for rule in self.get_overrides(): - if matcher(rule): - results.append(rule) - return results - - -def _walk_module(module: str) -> Iterable: - """Return all modules from a module recursively. - - Note that this will import all the modules and submodules. It returns the - provided module as well. - """ - - def onerror(err): - raise err # pragma: no cover - - spec = importlib.util.find_spec(module) - if not spec: - raise ImportError(f"Module {module} not found") - mod = importlib.import_module(spec.name) - yield mod - if spec.submodule_search_locations: - for info in pkgutil.walk_packages( - spec.submodule_search_locations, f"{spec.name}.", onerror - ): - mod = importlib.import_module(info.name) - yield mod - - -def consume_modules(*modules: str) -> None: - """This recursively imports all packages/modules so that the ``@handle_urls`` - annotation are properly discovered and imported. - - Let's take a look at an example: - - .. code-block:: python - - # FILE: my_page_obj_project/load_rules.py - - from web_poet import default_registry, consume_modules - - consume_modules("other_external_pkg.po", "another_pkg.lib") - rules = default_registry.get_overrides() - - For this case, the :class:`~.ApplyRule` are coming from: - - - ``my_page_obj_project`` `(since it's the same module as the file above)` - - ``other_external_pkg.po`` - - ``another_pkg.lib`` - - any other modules that was imported in the same process inside the - packages/modules above. - - If the ``default_registry`` had other ``@handle_urls`` annotations outside - of the packages/modules listed above, then the corresponding - :class:`~.ApplyRule` won't be returned. Unless, they were recursively - imported in some way similar to :func:`~.web_poet.overrides.consume_modules`. - """ - - for module in modules: - gen = _walk_module(module) - - # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html - # Using a deque() results in a tiny bit performance improvement that list(). - deque(gen, maxlen=0) - -OverrideRule = _create_deprecated_class("OverrideRule", ApplyRule, warn_once=False) +msg = "The 'web_poet.overrides' module has been moved into 'web_poet.rules'." +warnings.warn(msg, DeprecationWarning, stacklevel=2) diff --git a/web_poet/rules.py b/web_poet/rules.py new file mode 100644 index 00000000..519fcf82 --- /dev/null +++ b/web_poet/rules.py @@ -0,0 +1,313 @@ +from __future__ import annotations # https://www.python.org/dev/peps/pep-0563/ + +import importlib +import importlib.util +import pkgutil +import warnings +from collections import deque +from dataclasses import dataclass, field +from operator import attrgetter +from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar, Union + +from url_matcher import Patterns + +from web_poet._typing import get_generic_parameter +from web_poet.pages import ItemPage, ItemT +from web_poet.utils import _create_deprecated_class, as_list + +Strings = Union[str, Iterable[str]] + +PageObjectRegistryTV = TypeVar("PageObjectRegistryTV", bound="PageObjectRegistry") + + +@dataclass(frozen=True) +class ApplyRule: + """A single override rule that specifies when a Page Object should be used + in lieu of another. + + This is instantiated when using the :func:`web_poet.handle_urls` decorator. + It's also being returned as a ``List[ApplyRule]`` when calling the + ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_overrides` + method. + + You can access any of its attributes: + + * ``for_patterns`` - contains the list of URL patterns associated with + this rule. You can read the API documentation of the `url-matcher + `_ package for more information + about the patterns. + * ``use`` - The Page Object that will be **used**. + * ``instead_of`` - The Page Object that will be **replaced**. + * ``to_return`` - The data container class that the Page Object returns. + * ``meta`` - Any other information you may want to store. This doesn't + do anything for now but may be useful for future API updates. + + .. tip:: + + The :class:`~.ApplyRule` is also hashable. This makes it easy to store + unique rules and identify any duplicates. + """ + + for_patterns: Patterns + use: Type[ItemPage] + instead_of: Optional[Type[ItemPage]] = None + to_return: Optional[Type[Any]] = None + meta: Dict[str, Any] = field(default_factory=dict) + + def __hash__(self): + return hash((self.for_patterns, self.use, self.instead_of, self.to_return)) + + +class PageObjectRegistry(dict): + """This contains the mapping rules that associates the Page Objects available + for a given URL matching rule. + + Note that it's simply a ``dict`` subclass with added functionalities on + storing, retrieving, and searching for the :class:`~.ApplyRule` instances. + The **value** represents the :class:`~.ApplyRule` instance from which the + Page Object in the **key** is allowed to be used. Since it's essentially a + ``dict``, you can use any ``dict`` operations with it. + + ``web-poet`` already provides a default Registry named ``default_registry`` + for convenience. It can be directly accessed via: + + .. code-block:: python + + from web_poet import handle_urls, default_registry, WebPage + + @handle_urls("example.com", instead_of=ProductPageObject) + class ExampleComProductPage(WebPage): + ... + + override_rules = default_registry.get_overrides() + + Notice that the ``@handle_urls`` that we're using is a part of the + ``default_registry``. This provides a shorter and quicker way to interact + with the built-in default :class:`~.PageObjectRegistry` instead of writing + the longer ``@default_registry.handle_urls``. + + .. note:: + + It is encouraged to simply use and import the already existing registry + via ``from web_poet import default_registry`` instead of creating your + own :class:`~.PageObjectRegistry` instance. Using multiple registries + would be unwieldy in most cases. + + However, it might be applicable in certain scenarios like storing custom + rules to separate it from the ``default_registry``. This :ref:`example + ` from the tutorial section may provide some + context. + """ + + @classmethod + def from_apply_rules( + cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] + ) -> PageObjectRegistryTV: + """An alternative constructor for creating a :class:`~.PageObjectRegistry` + instance by accepting a list of :class:`~.ApplyRule`. + + This is useful in cases wherein you need to store some selected rules + from multiple external packages. + """ + return cls({rule.use: rule for rule in rules}) + + @classmethod + def from_override_rules( + cls: Type[PageObjectRegistryTV], rules: List[ApplyRule] + ) -> PageObjectRegistryTV: + """Deprecated. Use :meth:`~.PageObjectRegistry.from_apply_rules` instead.""" + msg = ( + "The 'from_override_rules' method is deprecated. " + "Use 'from_apply_rules' instead." + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + return cls.from_apply_rules(rules) + + def handle_urls( + self, + include: Strings, + *, + overrides: Optional[Type[ItemPage]] = None, + instead_of: Optional[Type[ItemPage]] = None, + to_return: Optional[Type] = None, + exclude: Optional[Strings] = None, + priority: int = 500, + **kwargs, + ): + """ + Class decorator that indicates that the decorated Page Object should be + used instead of the overridden one for a particular set the URLs. + + The Page Object that is **overridden** is declared using the ``instead_of`` + parameter. + + The **override** mechanism only works on certain URLs that match the + ``include`` and ``exclude`` parameters. See the documentation of the + `url-matcher `_ package for more + information about them. + + Any extra parameters are stored as meta information that can be later used. + + :param include: The URLs that should be handled by the decorated Page Object. + :param instead_of: The Page Object that should be `replaced`. + :param to_return: The Item Class holding the data returned by the Page Object. + This parameter is ignored if the Page Object has declared a data type + using :class:`~.Returns` or :class:`~.ItemPage` + (e.g. ``class ProductPageObject(ItemPage[ProductItem])``). + :param exclude: The URLs over which the override should **not** happen. + :param priority: The resolution priority in case of `conflicting` rules. + A conflict happens when the ``include``, ``override``, and ``exclude`` + parameters are the same. If so, the `highest priority` will be + chosen. + """ + + def wrapper(cls): + final_data_type = to_return + derived_type = get_generic_parameter(cls) + if derived_type and derived_type != ItemT: + final_data_type = derived_type + + if overrides is not None: + msg = ( + "The 'overrides' parameter in @handle_urls is deprecated. " + "Use the 'instead_of' parameter." + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + + rule = ApplyRule( + for_patterns=Patterns( + include=as_list(include), + exclude=as_list(exclude), + priority=priority, + ), + use=cls, + instead_of=instead_of, + to_return=final_data_type, + meta=kwargs, + ) + # If it was already defined, we don't want to override it + if cls not in self: + self[cls] = rule + else: + warnings.warn( + f"Multiple @handle_urls annotations with the same 'instead_of' " + f"are ignored in the same Registry. The following rule is " + f"ignored:\n{rule}", + stacklevel=2, + ) + + return cls + + return wrapper + + def get_overrides(self) -> List[ApplyRule]: + """Returns all of the :class:`~.ApplyRule` that were declared using + the ``@handle_urls`` annotation. + + .. warning:: + + Remember to consider calling :func:`~.web_poet.rules.consume_modules` + beforehand to recursively import all submodules which contains the + ``@handle_urls`` annotations from external Page Objects. + """ + return list(self.values()) + + def search_overrides(self, **kwargs) -> List[ApplyRule]: + """Returns any :class:`ApplyRule` that has any of its attributes + match the rules inside the registry. + + Sample usage: + + .. code-block:: python + + rules = registry.search_overrides(use=ProductPO, instead_of=GenericPO) + print(len(rules)) # 1 + + """ + + # Short-circuit operation if "use" is the only search param used, since + # we know that it's being used as the dict key. + if {"use"} == kwargs.keys(): + rule = self.get(kwargs["use"]) + if rule: + return [rule] + return [] + + getter = attrgetter(*kwargs.keys()) + + def matcher(rule: ApplyRule): + attribs = getter(rule) + if not isinstance(attribs, tuple): + attribs = (attribs,) + if attribs == tuple(kwargs.values()): + return True + return False + + results = [] + for rule in self.get_overrides(): + if matcher(rule): + results.append(rule) + return results + + +def _walk_module(module: str) -> Iterable: + """Return all modules from a module recursively. + + Note that this will import all the modules and submodules. It returns the + provided module as well. + """ + + def onerror(err): + raise err # pragma: no cover + + spec = importlib.util.find_spec(module) + if not spec: + raise ImportError(f"Module {module} not found") + mod = importlib.import_module(spec.name) + yield mod + if spec.submodule_search_locations: + for info in pkgutil.walk_packages( + spec.submodule_search_locations, f"{spec.name}.", onerror + ): + mod = importlib.import_module(info.name) + yield mod + + +def consume_modules(*modules: str) -> None: + """This recursively imports all packages/modules so that the ``@handle_urls`` + annotation are properly discovered and imported. + + Let's take a look at an example: + + .. code-block:: python + + # FILE: my_page_obj_project/load_rules.py + + from web_poet import default_registry, consume_modules + + consume_modules("other_external_pkg.po", "another_pkg.lib") + rules = default_registry.get_overrides() + + For this case, the :class:`~.ApplyRule` are coming from: + + - ``my_page_obj_project`` `(since it's the same module as the file above)` + - ``other_external_pkg.po`` + - ``another_pkg.lib`` + - any other modules that was imported in the same process inside the + packages/modules above. + + If the ``default_registry`` had other ``@handle_urls`` annotations outside + of the packages/modules listed above, then the corresponding + :class:`~.ApplyRule` won't be returned. Unless, they were recursively + imported in some way similar to :func:`~.web_poet.rules.consume_modules`. + """ + + for module in modules: + gen = _walk_module(module) + + # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html + # Using a deque() results in a tiny bit performance improvement that list(). + deque(gen, maxlen=0) + + +OverrideRule = _create_deprecated_class("OverrideRule", ApplyRule, warn_once=False) From b1c7e14ebf07a0e3fb853f0679b85d3144e86cd3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 21:00:21 +0800 Subject: [PATCH 11/46] =?UTF-8?q?rename=20PageObjectRegistry's=20methods:?= =?UTF-8?q?=20get=5Foverrides=20=E2=86=92=20get=5Frules,=20search=5Foverri?= =?UTF-8?q?des=20=E2=86=92=20search=5Frules?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.rst | 4 ++++ docs/intro/overrides.rst | 40 +++++++++++++++++++-------------------- tests/test_rules.py | 41 +++++++++++++++++++++++++++++----------- web_poet/rules.py | 29 +++++++++++++++++++++------- 4 files changed, 76 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5b6c3be0..7477ab9b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -28,6 +28,10 @@ Deprecations: * The ``from_override_rules`` method of ``PageObjectRegistry`` is now deprecated. Use ``from_apply_rules`` instead. * The ``web_poet.overrides`` is deprecated. Use ``web_poet.rules`` instead. +* The ``PageObjectRegistry.get_overrides`` is deprecated. + Use ``PageObjectRegistry.get_rules`` instead. +* The ``PageObjectRegistry.search_overrides`` is deprecated. + Use ``PageObjectRegistry.search_rules`` instead. 0.5.1 (2022-09-23) ------------------ diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index d2af5cd9..262422da 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -108,7 +108,7 @@ The code above declares that: Retrieving all available Overrides ---------------------------------- -The :meth:`~.PageObjectRegistry.get_overrides` method from the ``web_poet.default_registry`` +The :meth:`~.PageObjectRegistry.get_rules` method from the ``web_poet.default_registry`` allows retrieval of all :class:`~.ApplyRule` in the given registry. Following from our example above, using it would be: @@ -117,7 +117,7 @@ Following from our example above, using it would be: from web_poet import default_registry # Retrieves all ApplyRules that were registered in the registry - rules = default_registry.get_overrides() + rules = default_registry.get_rules() print(len(rules)) # 3 print(rules[0]) # ApplyRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) @@ -128,7 +128,7 @@ in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. .. warning:: - :meth:`~.PageObjectRegistry.get_overrides` relies on the fact that all essential + :meth:`~.PageObjectRegistry.get_rules` relies on the fact that all essential packages/modules which contains the :func:`web_poet.handle_urls` annotations are properly loaded `(i.e imported)`. @@ -145,7 +145,7 @@ in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. from web_poet import default_registry, consume_modules consume_modules("external_package_A.po", "another_ext_package.lib") - rules = default_registry.get_overrides() + rules = default_registry.get_rules() The next section explores this caveat further. @@ -176,7 +176,7 @@ Let's suppose we have the following use case before us: Remember that all of the :class:`~.ApplyRule` are declared by annotating Page Objects using the :func:`web_poet.handle_urls` via ``@handle_urls``. Thus, -they can easily be accessed using the :meth:`~.PageObjectRegistry.get_overrides` +they can easily be accessed using the :meth:`~.PageObjectRegistry.get_rules` of ``web_poet.default_registry``. This can be done something like: @@ -187,13 +187,13 @@ This can be done something like: # ❌ Remember that this wouldn't retrieve any rules at all since the # annotations are NOT properly imported. - rules = default_registry.get_overrides() + rules = default_registry.get_rules() print(rules) # [] # ✅ Instead, you need to run the following so that all of the Page # Objects in the external packages are recursively imported. consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") - rules = default_registry.get_overrides() + rules = default_registry.get_rules() # The collected rules would then be as follows: print(rules) @@ -216,11 +216,11 @@ Using only a subset of the available ApplyRules Suppose that the use case from the previous section has changed wherein a subset of :class:`~.ApplyRule` would be used. This could be achieved by -using the :meth:`~.PageObjectRegistry.search_overrides` method which allows for +using the :meth:`~.PageObjectRegistry.search_rules` method which allows for convenient selection of a subset of rules from a given registry. Here's an example of how you could manually select the rules using the -:meth:`~.PageObjectRegistry.search_overrides` method instead: +:meth:`~.PageObjectRegistry.search_rules` method instead: .. code-block:: python @@ -229,12 +229,12 @@ Here's an example of how you could manually select the rules using the consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") - ecom_rules = default_registry.search_overrides(instead_of=ecommerce_page_objects.EcomGenericPage) + ecom_rules = default_registry.search_rules(instead_of=ecommerce_page_objects.EcomGenericPage) print(ecom_rules) # ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) # ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) - gadget_rules = default_registry.search_overrides(use=gadget_sites_page_objects.site_3.GadgetSite3) + gadget_rules = default_registry.search_rules(use=gadget_sites_page_objects.site_3.GadgetSite3) print(gadget_rules) # ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) @@ -244,7 +244,7 @@ Here's an example of how you could manually select the rules using the # ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) # ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) -As you can see, using the :meth:`~.PageObjectRegistry.search_overrides` method allows you to +As you can see, using the :meth:`~.PageObjectRegistry.search_rules` method allows you to conveniently select for :class:`~.ApplyRule` which conform to a specific criteria. This allows you to conveniently drill down to which :class:`~.ApplyRule` you're interested in using. @@ -285,7 +285,7 @@ have the first approach as an example: import ecommerce_page_objects, gadget_sites_page_objects consume_modules("ecommerce_page_objects", "gadget_sites_page_objects") - rules = default_registry.get_overrides() + rules = default_registry.get_rules() # The collected rules would then be as follows: print(rules) @@ -299,7 +299,7 @@ have the first approach as an example: def to_item(self): ... # call super().to_item() and improve on the item's shortcomings - rules = default_registry.get_overrides() + rules = default_registry.get_rules() print(rules) # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) @@ -430,7 +430,7 @@ set of rules than expected. This **inclusion**-list approach can be done by importing the Page Objects directly and creating instances of :class:`~.ApplyRule` from it. You could also import -all of the available :class:`~.ApplyRule` using :meth:`~.PageObjectRegistry.get_overrides` +all of the available :class:`~.ApplyRule` using :meth:`~.PageObjectRegistry.get_rules` to sift through the list of available rules and manually selecting the rules you need. Most of the time, the needed rules are the ones which uses the Page Objects we're @@ -450,9 +450,9 @@ easily find the Page Object's rule using its `key`. Here's an example: default_registry[package_C.PageObject3], # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}) ] -Another approach would be using the :meth:`~.PageObjectRegistry.search_overrides` +Another approach would be using the :meth:`~.PageObjectRegistry.search_rules` functionality as described from this tutorial section: :ref:`intro-rule-subset`. -The :meth:`~.PageObjectRegistry.search_overrides` is quite useful in cases wherein +The :meth:`~.PageObjectRegistry.search_rules` is quite useful in cases wherein the **POP** contains a lot of rules as it presents a utility for programmatically searching for them. @@ -466,15 +466,15 @@ Here's an example: consume_modules("package_A", "package_B", "package_C") - rule_from_A = default_registry.search_overrides(use=package_A.PageObject1) + rule_from_A = default_registry.search_rules(use=package_A.PageObject1) print(rule_from_A) # [ApplyRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={})] - rule_from_B = default_registry.search_overrides(instead_of=GenericProductPage) + rule_from_B = default_registry.search_rules(instead_of=GenericProductPage) print(rule_from_B) # [] - rule_from_C = default_registry.search_overrides(for_patterns=Patterns(include=["site_C.com"])) + rule_from_C = default_registry.search_rules(for_patterns=Patterns(include=["site_C.com"])) print(rule_from_C) # [ # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=, instead_of=, to_return=None, meta={}), diff --git a/tests/test_rules.py b/tests/test_rules.py index 8597e064..f916efe2 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -85,13 +85,13 @@ def test_apply_rule_uniqueness() -> None: def test_list_page_objects_all() -> None: - rules = default_registry.get_overrides() + rules = default_registry.get_rules() page_objects = {po.use for po in rules} # Note that the 'tests_extra.po_lib_sub_not_imported.POLibSubNotImported' # Page Object is not included here since it was never imported anywhere in # our test package. It would only be included if we run any of the following - # below. (Note that they should run before `get_overrides` is called.) + # below. (Note that they should run before `get_rules` is called.) # - from tests_extra import po_lib_sub_not_imported # - import tests_extra.po_lib_sub_not_imported # - web_poet.consume_modules("tests_extra") @@ -111,6 +111,15 @@ def test_list_page_objects_all() -> None: assert rule.meta == rule.use.expected_meta, rule.use # type: ignore[attr-defined] +def test_registry_get_overrides_deprecation() -> None: + msg = "The 'get_overrides' method is deprecated. Use 'get_rules' instead." + with pytest.warns(DeprecationWarning, match=msg): + rules = default_registry.get_overrides() + + # It should still work as usual + assert len(rules) == 14 + + def test_consume_module_not_existing() -> None: with pytest.raises(ImportError): consume_modules("this_does_not_exist") @@ -121,25 +130,35 @@ def test_list_page_objects_all_consume() -> None: load the @handle_urls annotations from other modules/packages. """ consume_modules("tests_extra") - rules = default_registry.get_overrides() + rules = default_registry.get_rules() page_objects = {po.use for po in rules} assert any(["po_lib_sub_not_imported" in po.__module__ for po in page_objects]) -def test_registry_search_overrides() -> None: - rules = default_registry.search_overrides(use=POTopLevel2) +def test_registry_search_rules() -> None: + rules = default_registry.search_rules(use=POTopLevel2) assert len(rules) == 1 assert rules[0].use == POTopLevel2 - rules = default_registry.search_overrides(instead_of=POTopLevelOverriden2) + rules = default_registry.search_rules(instead_of=POTopLevelOverriden2) assert len(rules) == 1 assert rules[0].instead_of == POTopLevelOverriden2 # Such rules doesn't exist - rules = default_registry.search_overrides(use=POModuleOverriden) + rules = default_registry.search_rules(use=POModuleOverriden) assert len(rules) == 0 +def test_registry_search_overrides_deprecation() -> None: + msg = "The 'search_overrides' method is deprecated. Use 'search_rules' instead." + with pytest.warns(DeprecationWarning, match=msg): + rules = default_registry.search_overrides(use=POTopLevel2) + + # It should still work as usual + assert len(rules) == 1 + assert rules[0].use == POTopLevel2 + + def test_from_apply_rules() -> None: rules = [ ApplyRule( @@ -151,8 +170,8 @@ def test_from_apply_rules() -> None: registry = PageObjectRegistry.from_apply_rules(rules) - assert registry.get_overrides() == rules - assert default_registry.get_overrides() != rules + assert registry.get_rules() == rules + assert default_registry.get_rules() != rules def test_from_override_rules_deprecation() -> None: @@ -171,8 +190,8 @@ def test_from_override_rules_deprecation() -> None: with pytest.warns(DeprecationWarning, match=msg): registry = PageObjectRegistry.from_override_rules(rules) - assert registry.get_overrides() == rules - assert default_registry.get_overrides() != rules + assert registry.get_rules() == rules + assert default_registry.get_rules() != rules def test_handle_urls_deprecation() -> None: diff --git a/web_poet/rules.py b/web_poet/rules.py index 519fcf82..55787a35 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -27,7 +27,7 @@ class ApplyRule: This is instantiated when using the :func:`web_poet.handle_urls` decorator. It's also being returned as a ``List[ApplyRule]`` when calling the - ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_overrides` + ``web_poet.default_registry``'s :meth:`~.PageObjectRegistry.get_rules` method. You can access any of its attributes: @@ -79,7 +79,7 @@ class PageObjectRegistry(dict): class ExampleComProductPage(WebPage): ... - override_rules = default_registry.get_overrides() + override_rules = default_registry.get_rules() Notice that the ``@handle_urls`` that we're using is a part of the ``default_registry``. This provides a shorter and quicker way to interact @@ -200,7 +200,7 @@ def wrapper(cls): return wrapper - def get_overrides(self) -> List[ApplyRule]: + def get_rules(self) -> List[ApplyRule]: """Returns all of the :class:`~.ApplyRule` that were declared using the ``@handle_urls`` annotation. @@ -212,7 +212,13 @@ def get_overrides(self) -> List[ApplyRule]: """ return list(self.values()) - def search_overrides(self, **kwargs) -> List[ApplyRule]: + def get_overrides(self) -> List[ApplyRule]: + """Deprecated, use :meth:`~.PageObjectRegistry.get_rules` instead.""" + msg = "The 'get_overrides' method is deprecated. Use 'get_rules' instead." + warnings.warn(msg, DeprecationWarning, stacklevel=2) + return self.get_rules() + + def search_rules(self, **kwargs) -> List[ApplyRule]: """Returns any :class:`ApplyRule` that has any of its attributes match the rules inside the registry. @@ -220,7 +226,7 @@ def search_overrides(self, **kwargs) -> List[ApplyRule]: .. code-block:: python - rules = registry.search_overrides(use=ProductPO, instead_of=GenericPO) + rules = registry.search_rules(use=ProductPO, instead_of=GenericPO) print(len(rules)) # 1 """ @@ -244,11 +250,20 @@ def matcher(rule: ApplyRule): return False results = [] - for rule in self.get_overrides(): + for rule in self.get_rules(): if matcher(rule): results.append(rule) return results + def search_overrides(self, **kwargs) -> List[ApplyRule]: + """Deprecated, use :meth:`~.PageObjectRegistry.search_rules` instead.""" + msg = ( + "The 'search_overrides' method is deprecated. " + "Use 'search_rules' instead." + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + return self.search_rules(**kwargs) + def _walk_module(module: str) -> Iterable: """Return all modules from a module recursively. @@ -286,7 +301,7 @@ def consume_modules(*modules: str) -> None: from web_poet import default_registry, consume_modules consume_modules("other_external_pkg.po", "another_pkg.lib") - rules = default_registry.get_overrides() + rules = default_registry.get_rules() For this case, the :class:`~.ApplyRule` are coming from: From 1afddc9be54218d6394b0c377cb65d3975964bcb Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 22:13:14 +0800 Subject: [PATCH 12/46] import * from 'rules' in 'overrides' --- web_poet/overrides.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/web_poet/overrides.py b/web_poet/overrides.py index a37db47e..f80db428 100644 --- a/web_poet/overrides.py +++ b/web_poet/overrides.py @@ -1,4 +1,6 @@ import warnings +from web_poet.rules import * # noqa: F401, F403 + msg = "The 'web_poet.overrides' module has been moved into 'web_poet.rules'." warnings.warn(msg, DeprecationWarning, stacklevel=2) From 144e39ef36d5db891686bae8137f723ba4e1437f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 3 Oct 2022 23:25:20 +0800 Subject: [PATCH 13/46] prioritize 'to_return' parameter compared to derived item_cls --- tests/po_lib_to_return/__init__.py | 8 ++++---- web_poet/_typing.py | 9 +++++++++ web_poet/pages.py | 7 ++----- web_poet/rules.py | 10 +++------- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index c6bcd752..38c5f3d9 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -112,13 +112,13 @@ class CustomProductPage(ProductPage, Returns[Product]): This PO is the same with ``SimilarProductPage`` but passes a ``to_return`` in the ``@handle_urls`` decorator. - This tests the case that the type inside ``Returns`` should be followed and - the ``to_return`` parameter from ``@handle_urls`` is ignored. + This tests the case that the type passed via the ``to_return`` parameter + from ``@handle_urls`` takes priority. """ expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) - expected_to_return = Product + expected_to_return = ProductSimilar expected_meta = {} @@ -130,7 +130,7 @@ class CustomProductPageNoReturns(ProductPage): expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) - expected_to_return = Product + expected_to_return = ProductSimilar expected_meta = {} diff --git a/web_poet/_typing.py b/web_poet/_typing.py index 0c96e2ff..9455ea28 100644 --- a/web_poet/_typing.py +++ b/web_poet/_typing.py @@ -22,3 +22,12 @@ def get_generic_parameter(cls): if is_generic_alias(base): args = _get_args(base) return args[0] + + +def get_item_cls(cls, default=None, preferred=None): + if preferred: + return preferred + param = get_generic_parameter(cls) + if param is None or isinstance(param, typing.TypeVar): # class is not parametrized + return default + return param diff --git a/web_poet/pages.py b/web_poet/pages.py index 5d268759..ec2f2a57 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -3,7 +3,7 @@ import attr -from web_poet._typing import get_generic_parameter +from web_poet._typing import get_item_cls from web_poet.fields import FieldsMixin, item_from_fields from web_poet.mixins import ResponseShortcutsMixin from web_poet.page_inputs import HttpResponse @@ -47,10 +47,7 @@ class Returns(typing.Generic[ItemT]): @property def item_cls(self) -> typing.Type[ItemT]: """Item class""" - param = get_generic_parameter(self.__class__) - if isinstance(param, typing.TypeVar): # class is not parametrized - return dict # type: ignore[return-value] - return param + return get_item_cls(self.__class__, default=dict) class ItemPage(Injectable, Returns[ItemT]): diff --git a/web_poet/rules.py b/web_poet/rules.py index 55787a35..c394da55 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -11,8 +11,8 @@ from url_matcher import Patterns -from web_poet._typing import get_generic_parameter -from web_poet.pages import ItemPage, ItemT +from web_poet._typing import get_item_cls +from web_poet.pages import ItemPage from web_poet.utils import _create_deprecated_class, as_list Strings = Union[str, Iterable[str]] @@ -162,10 +162,6 @@ def handle_urls( """ def wrapper(cls): - final_data_type = to_return - derived_type = get_generic_parameter(cls) - if derived_type and derived_type != ItemT: - final_data_type = derived_type if overrides is not None: msg = ( @@ -182,7 +178,7 @@ def wrapper(cls): ), use=cls, instead_of=instead_of, - to_return=final_data_type, + to_return=get_item_cls(cls, preferred=to_return), meta=kwargs, ) # If it was already defined, we don't want to override it From fee63a56b1ce963cba82beff2b8ae4e69c8d155a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 6 Oct 2022 13:51:28 +0800 Subject: [PATCH 14/46] fix the deprecated 'overrides' parameter not being used if present --- web_poet/rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index c394da55..171483b9 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -177,7 +177,7 @@ def wrapper(cls): priority=priority, ), use=cls, - instead_of=instead_of, + instead_of=instead_of or overrides, to_return=get_item_cls(cls, preferred=to_return), meta=kwargs, ) From 33a48ac967176218dd2552c97e916a9d54279212 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 6 Oct 2022 17:12:53 +0800 Subject: [PATCH 15/46] enable auto-conversion to url_matcher.Patterns on ApplyRules.for_patterns --- CHANGELOG.rst | 13 ++++++++----- tests/test_rules.py | 32 ++++++++++++++++++++++++++++++++ web_poet/rules.py | 10 +++++----- web_poet/utils.py | 9 ++++++++- 4 files changed, 53 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7477ab9b..7aea6157 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,16 +5,19 @@ Changelog TBD --- -* New ``ApplyRule`` class created by the ``@handle_urls`` decorator. - This is nearly identical with ``OverrideRule`` except that it's accepting - a ``to_return`` parameter which signifies the data container class that - the Page Object returns. +* New ``ApplyRule`` class created by the ``@handle_urls`` decorator: + + * This is nearly identical with ``OverrideRule`` except that it's accepting + a ``to_return`` parameter which signifies the data container class that + the Page Object returns. + * Passing a string to ``for_patterns`` would auto-convert it into + ``url_matcher.Patterns``. * Modify the call signature of ``handle_urls``: * New ``to_return`` parameter which signifies the data container class that the Page Object returns. This behaves exactly the same with ``overrides`` - but it's more consistent with the attribute names of ``ApplyRules`` that + but it's more consistent with the attribute names of ``ApplyRules`` that it creates. * New ``instead_of`` parameter which does the same thing with ``overrides``. * The old ``overrides`` parameter is not required anymore as it's set for diff --git a/tests/test_rules.py b/tests/test_rules.py index f916efe2..04b379d1 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -1,5 +1,6 @@ import warnings +import attrs import pytest from url_matcher import Patterns @@ -84,6 +85,37 @@ def test_apply_rule_uniqueness() -> None: assert hash(rule1) != hash(rule2) +def test_apply_rule_immutability() -> None: + patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) + + rule = ApplyRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + ) + + with pytest.raises(attrs.exceptions.FrozenInstanceError): + rule.use = POModule # type: ignore[misc] + + +def test_apply_rule_converter_on_pattern() -> None: + # passing strings should auto-converter into Patterns + rule = ApplyRule("example.com", use=POTopLevel1, instead_of=POTopLevelOverriden2) + assert rule.for_patterns == Patterns( + include=("example.com",), exclude=(), priority=500 + ) + + # Passing Patterns should still work + rule = ApplyRule( + for_patterns=Patterns(["example.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + ) + assert rule.for_patterns == Patterns( + include=("example.com",), exclude=(), priority=500 + ) + + def test_list_page_objects_all() -> None: rules = default_registry.get_rules() page_objects = {po.use for po in rules} diff --git a/web_poet/rules.py b/web_poet/rules.py index 171483b9..5d4212d7 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -5,22 +5,22 @@ import pkgutil import warnings from collections import deque -from dataclasses import dataclass, field from operator import attrgetter from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar, Union +import attrs from url_matcher import Patterns from web_poet._typing import get_item_cls from web_poet.pages import ItemPage -from web_poet.utils import _create_deprecated_class, as_list +from web_poet.utils import _create_deprecated_class, as_list, str_to_pattern Strings = Union[str, Iterable[str]] PageObjectRegistryTV = TypeVar("PageObjectRegistryTV", bound="PageObjectRegistry") -@dataclass(frozen=True) +@attrs.define(frozen=True) class ApplyRule: """A single override rule that specifies when a Page Object should be used in lieu of another. @@ -48,11 +48,11 @@ class ApplyRule: unique rules and identify any duplicates. """ - for_patterns: Patterns + for_patterns: Patterns = attrs.field(converter=str_to_pattern) use: Type[ItemPage] instead_of: Optional[Type[ItemPage]] = None to_return: Optional[Type[Any]] = None - meta: Dict[str, Any] = field(default_factory=dict) + meta: Dict[str, Any] = attrs.field(factory=dict) def __hash__(self): return hash((self.for_patterns, self.use, self.instead_of, self.to_return)) diff --git a/web_poet/utils.py b/web_poet/utils.py index 00fe8166..14b5ce05 100644 --- a/web_poet/utils.py +++ b/web_poet/utils.py @@ -3,10 +3,11 @@ from collections.abc import Iterable from functools import lru_cache, wraps from types import MethodType -from typing import Any, List, Optional +from typing import Any, List, Optional, Union from warnings import warn from async_lru import alru_cache +from url_matcher import Patterns def _clspath(cls, forced=None): @@ -230,3 +231,9 @@ async def ensure_awaitable(obj): if inspect.isawaitable(obj): return await obj return obj + + +def str_to_pattern(url_pattern: Union[str, Patterns]) -> Patterns: + if isinstance(url_pattern, Patterns): + return url_pattern + return Patterns([url_pattern]) From 9b6f9c4d836b0335850c076e1eed66374d18ecc6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 7 Oct 2022 16:39:12 +0800 Subject: [PATCH 16/46] update all arguments of ApplyRule to be keyword-only except 'for_patterns' --- CHANGELOG.rst | 9 +++++---- tests/test_rules.py | 20 ++++++++++++++++++++ web_poet/rules.py | 8 ++++---- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7aea6157..b26a043a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,13 +5,14 @@ Changelog TBD --- -* New ``ApplyRule`` class created by the ``@handle_urls`` decorator: +* New ``ApplyRule`` class created by the ``@handle_urls`` decorator. This is + nearly identical with ``OverrideRule`` except that: - * This is nearly identical with ``OverrideRule`` except that it's accepting - a ``to_return`` parameter which signifies the data container class that - the Page Object returns. + * It's now accepting a ``to_return`` parameter which signifies the data + container class that the Page Object returns. * Passing a string to ``for_patterns`` would auto-convert it into ``url_matcher.Patterns``. + * All arguments are now keyword-only except for ``for_patterns``. * Modify the call signature of ``handle_urls``: diff --git a/tests/test_rules.py b/tests/test_rules.py index 04b379d1..793d9a16 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -116,6 +116,26 @@ def test_apply_rule_converter_on_pattern() -> None: ) +def test_apply_rule_kwargs_only() -> None: + + params = { + "use": POTopLevel1, + "instead_of": POTopLevelOverriden2, + "to_return": Product, + "meta": {"key_2": 2}, + } + remove = set() + + for param_name in params: + remove.add(param_name) + with pytest.raises(TypeError): + ApplyRule( + "example.com", + *[params[r] for r in remove], + **{k: v for k, v in params.items() if k not in remove} + ) + + def test_list_page_objects_all() -> None: rules = default_registry.get_rules() page_objects = {po.use for po in rules} diff --git a/web_poet/rules.py b/web_poet/rules.py index 5d4212d7..c78a92cb 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -49,10 +49,10 @@ class ApplyRule: """ for_patterns: Patterns = attrs.field(converter=str_to_pattern) - use: Type[ItemPage] - instead_of: Optional[Type[ItemPage]] = None - to_return: Optional[Type[Any]] = None - meta: Dict[str, Any] = attrs.field(factory=dict) + use: Type[ItemPage] = attrs.field(kw_only=True) + instead_of: Optional[Type[ItemPage]] = attrs.field(default=None, kw_only=True) + to_return: Optional[Type[Any]] = attrs.field(default=None, kw_only=True) + meta: Dict[str, Any] = attrs.field(factory=dict, kw_only=True) def __hash__(self): return hash((self.for_patterns, self.use, self.instead_of, self.to_return)) From 82645650a9cc2d977e60ca11a75725508ffae4c8 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 7 Oct 2022 16:41:46 +0800 Subject: [PATCH 17/46] fix mypy issue in ApplyRule tests --- tests/test_rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 793d9a16..30d83d25 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -132,7 +132,7 @@ def test_apply_rule_kwargs_only() -> None: ApplyRule( "example.com", *[params[r] for r in remove], - **{k: v for k, v in params.items() if k not in remove} + **{k: v for k, v in params.items() if k not in remove} # type: ignore[arg-type] ) From 3287881a2bd8364950a825acbe1d294a9a91f08f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 14 Oct 2022 17:58:01 +0800 Subject: [PATCH 18/46] improve tests --- tests/po_lib/__init__.py | 3 ++- tests/po_lib_to_return/__init__.py | 14 ++++++++++++++ tests/test_rules.py | 22 ++++++++++++---------- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 91c8397e..04f15a32 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -26,7 +26,8 @@ class POTopLevelOverriden2(ItemPage): ... -# This first annotation is ignored. A single annotation per registry is allowed +# This first decorator is ignored. A single ``ApplyRule`` with the same Page +# Object to be used per registry is allowed. @handle_urls("example.com", instead_of=POTopLevelOverriden1) @handle_urls( "example.com", instead_of=POTopLevelOverriden1, exclude="/*.jpg|", priority=300 diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index 38c5f3d9..d0c40eea 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -26,6 +26,20 @@ class ProductLessFields: name: str +@handle_urls("example.com") +class SomePage(ItemPage): + """A PO which is only marked by the URL pattern.""" + + expected_instead_of = None + expected_patterns = Patterns(["example.com"]) + expected_to_return = None + expected_meta = {} + + @field + def name(self) -> str: + return "some name" + + @handle_urls("example.com") class ProductPage(ItemPage[Product]): """A base PO to populate the Product item's fields.""" diff --git a/tests/test_rules.py b/tests/test_rules.py index 30d83d25..2e9745f8 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -20,6 +20,7 @@ ProductPage, ProductSimilar, SimilarProductPage, + SomePage, ) from web_poet import ( ApplyRule, @@ -31,19 +32,20 @@ ) POS = { + CustomProductPage, + CustomProductPageNoReturns, + CustomProductPageDataTypeOnly, + ImprovedProductPage, + LessProductPage, + MoreProductPage, POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule, ProductPage, - ImprovedProductPage, SimilarProductPage, - MoreProductPage, - LessProductPage, - CustomProductPage, - CustomProductPageNoReturns, - CustomProductPageDataTypeOnly, + SomePage, } @@ -81,7 +83,7 @@ def test_apply_rule_uniqueness() -> None: instead_of=POTopLevelOverriden2, to_return=ProductSimilar, ) - # A different data type class results in different hash. + # A different item class results in different hash. assert hash(rule1) != hash(rule2) @@ -152,7 +154,7 @@ def test_list_page_objects_all() -> None: assert all(["po_lib_sub_not_imported" not in po.__module__ for po in page_objects]) # Ensure that ALL Override Rules are returned as long as the given - # registry's @handle_urls annotation was used. + # registry's @handle_urls decorator was used. assert page_objects == POS.union({POLibSub}) for rule in rules: # We're ignoring the types below since mypy expects ``Type[ItemPage]`` @@ -169,7 +171,7 @@ def test_registry_get_overrides_deprecation() -> None: rules = default_registry.get_overrides() # It should still work as usual - assert len(rules) == 14 + assert len(rules) == 15 def test_consume_module_not_existing() -> None: @@ -179,7 +181,7 @@ def test_consume_module_not_existing() -> None: def test_list_page_objects_all_consume() -> None: """A test similar to the one above but calls ``consume_modules()`` to properly - load the @handle_urls annotations from other modules/packages. + load the ``@handle_urls`` decorators from other modules/packages. """ consume_modules("tests_extra") rules = default_registry.get_rules() From c5faf38153c0c11d37cdec90c2d84a0e89be1dfa Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 14 Oct 2022 18:00:37 +0800 Subject: [PATCH 19/46] update docstrings/tutorials regarding the new 'to_return' parameter --- docs/advanced/fields.rst | 10 ++- docs/intro/overrides.rst | 183 ++++++++++++++++++++++++++++++++++----- web_poet/pages.py | 2 +- web_poet/rules.py | 60 ++++++++----- 4 files changed, 207 insertions(+), 48 deletions(-) diff --git a/docs/advanced/fields.rst b/docs/advanced/fields.rst index f0b7c827..77ecf05e 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -151,6 +151,8 @@ function when accessing the fields: Now any field can be converted from sync to async, or the other way around, and the code would keep working. +.. _`item-classes`: + Item classes ------------ @@ -240,8 +242,8 @@ indicating that a required argument is missing. Without an item class, none of these errors are detected. -Changing Item type -~~~~~~~~~~~~~~~~~~ +Changing Item Class +~~~~~~~~~~~~~~~~~~~ Let's say there is a Page Object implemented, which outputs some standard item. Maybe there is a library of such Page Objects available. But for a @@ -308,7 +310,7 @@ to the item: # ... Note how :class:`~.Returns` is used as one of the base classes of -``CustomFooPage``; it allows to change the item type returned by a page object. +``CustomFooPage``; it allows to change the item class returned by a page object. Removing fields (as well as renaming) is a bit more tricky. @@ -343,7 +345,7 @@ is passed, and ``name`` is the only field ``CustomItem`` supports. To recap: -* Use ``Returns[NewItemType]`` to change the item type in a subclass. +* Use ``Returns[NewItemType]`` to change the item class in a subclass. * Don't use ``skip_nonitem_fields=True`` when your Page Object corresponds to an item exactly, or when you're only adding fields. This is a safe approach, which allows to detect typos in field names, even for optional diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 262422da..9abbbc39 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -3,13 +3,17 @@ Overrides ========= -Overrides contains mapping rules to associate which URLs a particular -Page Object would be used. The URL matching rules is handled by another library -called `url-matcher `_. +Overrides are simply rules represented by a list of :class:`~.ApplyRule` which +associates which URL patterns a particular Page Object would be used. The URL +matching rules is handled by another library called +`url-matcher `_. Using such rules establishes the core concept of Overrides wherein a developer -could declare that a specific Page Object must be used *(instead of another)* -for a given set of URL patterns. +could declare that for a given set of URL patterns a specific Page Object must +be used: + + * instead of another Page Object, or + * which returns an Item Class instead of some other Item Class. This enables **web-poet** to be used effectively by other frameworks like `scrapy-poet `_. @@ -27,7 +31,7 @@ going to crawl beforehand. However, we could at least create a generic Page Object to support parsing of some fields in well-known locations of product information like ````. This enables our broadcrawler to at least parse some useful information. Let's -call such Page Object to be ``GenericProductPage``. +call such a Page Object to be ``GenericProductPage``. Assuming that one of our project requirements is to fully support parsing of the `top 3 eCommerce websites`, then we'd need to create a Page Object for each one @@ -50,6 +54,28 @@ Let's see this in action by declaring the Overrides in the Page Objects below. Creating Overrides ------------------ +To simplify the code examples in the next few subsections, let's consider that +these Item Classes has been predefined. + +.. code-block:: python + + import attrs + + + @attrs.define + class Product: + product_title: str + regular_price: float + + + @attrs.define + class SimilarProduct: + product_title: str + regular_price: float + +Page Object +~~~~~~~~~~~ + Let's take a look at how the following code is structured: .. code-block:: python @@ -58,29 +84,32 @@ Let's take a look at how the following code is structured: class GenericProductPage(WebPage): - def to_item(self): - return {"product-title": self.css("title::text").get()} + def to_item(self) -> Product: + return Product(product_title=self.css("title::text").get()) @handle_urls("example.com", instead_of=GenericProductPage) class ExampleProductPage(WebPage): - def to_item(self): + def to_item(self) -> Product: ... # more specific parsing @handle_urls("anotherexample.com", instead_of=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(WebPage): - def to_item(self): + def to_item(self) -> Product: ... # more specific parsing @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage): - def to_item(self): + def to_item(self) -> SimilarProduct: ... # more specific parsing The code above declares that: + - Page Objects return ``Product`` and ``SimilarProduct`` item class. Returning + item classes is a preferred approach as explained in the :ref:`web-poet-fields` + section. - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` would be used instead of ``GenericProductPage``. - The same is true for ``DualExampleProductPage`` where it is used @@ -100,17 +129,128 @@ The code above declares that: .. tip:: - The URL patterns declared in the ``@handle_urls`` annotation can still be + The URL patterns declared in the ``@handle_urls`` decorator can still be further customized. You can read some of the specific parameters in the :ref:`API section <api-overrides>` of :func:`web_poet.handle_urls`. +Item Class +~~~~~~~~~~ + +An alternative approach for the Page Object Overrides example above is to specify +the returned Item Class. For example, we could change the previous example into +the following: + + +.. code-block:: python + + from web_poet import handle_urls, WebPage, Returns + + + class GenericProductPage(WebPage, Returns[Product]): + def to_item(self) -> Product: + return Product(product_title=self.css("title::text").get()) + + + @handle_urls("example.com", to_return=Product) + class ExampleProductPage(WebPage): + def to_item(self) -> Product: + ... # more specific parsing + + + @handle_urls("anotherexample.com", to_return=Product, exclude="/digital-goods/") + class AnotherExampleProductPage(WebPage): + def to_item(self) -> Product: + ... # more specific parsing + + + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], to_return=Product) + class DualExampleProductPage(WebPage): + def to_item(self) -> SimilarProduct: + ... # more specific parsing + +Let's break this example down: + + - The URL patterns are exactly the same as with the previous code example. + - The ``instead_of`` parameter has been replaced with ``to_return`` which + accepts an Item Class. This means that: + + - If a ``Product`` Item Class is requested for URLs matching with the + "example.com" pattern, then the ``Product`` Item Class would come from + the ``to_item()`` method of ``ExampleProductPage``. + - Similarly, if the URL matches with "anotherexample.com" without the + "/digital-goods/" path, then the ``Product`` Item Class comes from the + ``AnotherExampleProductPage`` Page Object. + - However, if a ``Product`` Item Class is requested matching with the URL + pattern of "dualexample.com/shop/?product=*", a ``SimilarProduct`` + Item Class is returned by the ``DualExampleProductPage``'s ``to_item()`` + method instead. + +This has the same concept as with the Page Object Overrides but the context changes +from the Page Object into the Item Classes returned by the ``to_item()`` method +instead. + +The upside here is that you're able to directly access the item extracted by the +Page Object for the given site. There's no need to directly call the ``to_item()`` +method to receive it. + +However, one downside to this approach is that you lose access to the actual Page +Object. Aside from the item extracted by the Page Object, there might be some +other convenience methods or other data from it that you want to access. + + +.. _`combination`: + +Combination +~~~~~~~~~~~ + +Of course, you can use the combination of both which enables you to specify in +either contexts of Page Objects and Item Classes. + +.. code-block:: python + + from web_poet import handle_urls, WebPage, Returns + + + class GenericProductPage(WebPage, Returns[Product]): + def to_item(self) -> Product: + return Product(product_title=self.css("title::text").get()) + + + @handle_urls("example.com", instead_of=GenericProductPage, to_return=Product) + class ExampleProductPage(WebPage): + def to_item(self) -> Product: + ... # more specific parsing + + + @handle_urls("anotherexample.com", instead_of=GenericProductPage, exclude="/digital-goods/") + class AnotherExampleProductPage(WebPage, Returns[Product]): + def to_item(self) -> Product: + ... # more specific parsing + + + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], instead_of=GenericProductPage) + class DualExampleProductPage(WebPage, Returns[SimilarProduct]): + def to_item(self) -> SimilarProduct: + ... # more specific parsing + +Notice that the ``@handle_urls`` decorator for ``AnotherExampleProductPage`` and +``DualExampleProductPage`` doesn't specify the ``to_return`` parameter at all. +This is because it's able to directly derive the returned Item Class from the +respective ``Returns[Product]`` and ``Returns[SimilarProduct]`` declarations. + +See the next :ref:`retrieving-overrides` section observe what are the actual +:class:`~.ApplyRule` that were created by the ``@handle_urls`` decorators. + + +.. _`retrieving-overrides`: Retrieving all available Overrides ---------------------------------- The :meth:`~.PageObjectRegistry.get_rules` method from the ``web_poet.default_registry`` -allows retrieval of all :class:`~.ApplyRule` in the given registry. -Following from our example above, using it would be: +allows retrieval of all :class:`~.ApplyRule` in the given registry. +Following from our example above in the :ref:`combination` section, using it +would be: .. code-block:: python @@ -119,8 +259,11 @@ Following from our example above, using it would be: # Retrieves all ApplyRules that were registered in the registry rules = default_registry.get_rules() - print(len(rules)) # 3 - print(rules[0]) # ApplyRule(for_patterns=Patterns(include=['example.com'], exclude=[], priority=500), use=<class 'my_project.page_objects.ExampleProductPage'>, instead_of=<class 'my_project.page_objects.GenericProductPage'>, to_return=None, meta={}) + for r in rules: + print(r) + # ApplyRule(for_patterns=Patterns(include=('example.com',), exclude=(), priority=500), use=<class 'ExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'Product'>, meta={}) + # ApplyRule(for_patterns=Patterns(include=('anotherexample.com',), exclude=('/digital-goods/',), priority=500), use=<class 'AnotherExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'Product'>, meta={}) + # ApplyRule(for_patterns=Patterns(include=('dualexample.com/shop/?product=*', 'dualexample.net/store/?pid=*'), exclude=(), priority=500), use=<class 'DualExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'SimilarProduct'>, meta={}) Remember that using ``@handle_urls`` to annotate the Page Objects would result in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. @@ -130,10 +273,10 @@ in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. :meth:`~.PageObjectRegistry.get_rules` relies on the fact that all essential packages/modules which contains the :func:`web_poet.handle_urls` - annotations are properly loaded `(i.e imported)`. + decorators are properly loaded `(i.e imported)`. Thus, for cases like importing and using Page Objects from other external packages, - the ``@handle_urls`` annotations from these external sources must be read and + the ``@handle_urls`` decorators from these external sources must be read and processed properly. This ensures that the external Page Objects have all of their :class:`~.ApplyRule` present. @@ -186,7 +329,7 @@ This can be done something like: from web_poet import default_registry, consume_modules # ❌ Remember that this wouldn't retrieve any rules at all since the - # annotations are NOT properly imported. + # ``@handle_urls`` decorators are NOT properly loaded. rules = default_registry.get_rules() print(rules) # [] @@ -389,7 +532,7 @@ more priority`). You don't necessarily need to `delete` the **old** Creating a new :class:`~.ApplyRule` with a higher priority could be as easy as: 1. Subclassing the Page Object in question. - 2. Create a new :func:`web_poet.handle_urls` annotation with the same URL + 2. Declare a new :func:`web_poet.handle_urls` decorator with the same URL pattern and Page Object to override but with a much higher priority. Here's an example: diff --git a/web_poet/pages.py b/web_poet/pages.py index ec2f2a57..77b39af3 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -41,7 +41,7 @@ def is_injectable(cls: typing.Any) -> bool: class Returns(typing.Generic[ItemT]): - """Inherit from this generic mixin to change the item type used by + """Inherit from this generic mixin to change the item class used by :class:`~.ItemPage`""" @property diff --git a/web_poet/rules.py b/web_poet/rules.py index c78a92cb..56e148ff 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -22,8 +22,8 @@ @attrs.define(frozen=True) class ApplyRule: - """A single override rule that specifies when a Page Object should be used - in lieu of another. + """A rule that primarily applies Page Object and Item overrides for a given + URL pattern. This is instantiated when using the :func:`web_poet.handle_urls` decorator. It's also being returned as a ``List[ApplyRule]`` when calling the @@ -36,11 +36,21 @@ class ApplyRule: this rule. You can read the API documentation of the `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more information about the patterns. - * ``use`` - The Page Object that will be **used**. - * ``instead_of`` - The Page Object that will be **replaced**. - * ``to_return`` - The data container class that the Page Object returns. - * ``meta`` - Any other information you may want to store. This doesn't - do anything for now but may be useful for future API updates. + * ``use`` - The Page Object that will be **used** in cases where the URL + pattern represented by the ``for_patterns`` attribute is matched. + * ``instead_of`` - *(optional)* The Page Object that will be **replaced** + with the Page Object specified via the ``use`` parameter. + * ``to_return`` - *(optional)* The data container class that will be + **replaced** by the item returned by the Page Object specified via the + ``use`` parameter. + * ``meta`` - *(optional)* Any other information you may want to store. + This doesn't do anything for now but may be useful for future API updates. + + The main functionality of this class lies in the ``instead_of`` and ``to_return`` + parameter. Should both of these be omitted, then :class:`~.ApplyRule` simply + tags which URL patterns the given Page Object is expected to be used. + + More information regarding its usage in :ref:`intro-overrides`. .. tip:: @@ -59,8 +69,8 @@ def __hash__(self): class PageObjectRegistry(dict): - """This contains the mapping rules that associates the Page Objects available - for a given URL matching rule. + """This contains the :class:`~.ApplyRule` that associates the Page Objects + alongside its Items for a given URL matching rule. Note that it's simply a ``dict`` subclass with added functionalities on storing, retrieving, and searching for the :class:`~.ApplyRule` instances. @@ -81,7 +91,7 @@ class ExampleComProductPage(WebPage): override_rules = default_registry.get_rules() - Notice that the ``@handle_urls`` that we're using is a part of the + Notice that the ``@handle_urls`` decorator that we're using is a part of the ``default_registry``. This provides a shorter and quicker way to interact with the built-in default :class:`~.PageObjectRegistry` instead of writing the longer ``@default_registry.handle_urls``. @@ -107,7 +117,8 @@ def from_apply_rules( instance by accepting a list of :class:`~.ApplyRule`. This is useful in cases wherein you need to store some selected rules - from multiple external packages. + from multiple external packages. See this :ref:`example + <overrides-custom-registry>`. """ return cls({rule.use: rule for rule in rules}) @@ -141,6 +152,9 @@ def handle_urls( The Page Object that is **overridden** is declared using the ``instead_of`` parameter. + The Item Class could also be **overridden** using the ``to_return`` + parameter. + The **override** mechanism only works on certain URLs that match the ``include`` and ``exclude`` parameters. See the documentation of the `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more @@ -151,9 +165,9 @@ def handle_urls( :param include: The URLs that should be handled by the decorated Page Object. :param instead_of: The Page Object that should be `replaced`. :param to_return: The Item Class holding the data returned by the Page Object. - This parameter is ignored if the Page Object has declared a data type - using :class:`~.Returns` or :class:`~.ItemPage` - (e.g. ``class ProductPageObject(ItemPage[ProductItem])``). + This could be omitted as it could be derived from the ``Returns[ItemClass]`` + or ``ItemPage[ItemClass]`` declaration of the Page Object. See + :ref:`item-classes` section. Code example in :ref:`combination` subsection. :param exclude: The URLs over which the override should **not** happen. :param priority: The resolution priority in case of `conflicting` rules. A conflict happens when the ``include``, ``override``, and ``exclude`` @@ -186,7 +200,7 @@ def wrapper(cls): self[cls] = rule else: warnings.warn( - f"Multiple @handle_urls annotations with the same 'instead_of' " + f"Multiple @handle_urls decorators for the same Page Object " f"are ignored in the same Registry. The following rule is " f"ignored:\n{rule}", stacklevel=2, @@ -197,14 +211,14 @@ def wrapper(cls): return wrapper def get_rules(self) -> List[ApplyRule]: - """Returns all of the :class:`~.ApplyRule` that were declared using - the ``@handle_urls`` annotation. + """Returns all the :class:`~.ApplyRule` that were declared using + the ``@handle_urls`` decorator. .. warning:: Remember to consider calling :func:`~.web_poet.rules.consume_modules` beforehand to recursively import all submodules which contains the - ``@handle_urls`` annotations from external Page Objects. + ``@handle_urls`` decorators from external Page Objects. """ return list(self.values()) @@ -286,7 +300,7 @@ def onerror(err): def consume_modules(*modules: str) -> None: """This recursively imports all packages/modules so that the ``@handle_urls`` - annotation are properly discovered and imported. + decorators are properly discovered and imported. Let's take a look at an example: @@ -307,10 +321,10 @@ def consume_modules(*modules: str) -> None: - any other modules that was imported in the same process inside the packages/modules above. - If the ``default_registry`` had other ``@handle_urls`` annotations outside - of the packages/modules listed above, then the corresponding - :class:`~.ApplyRule` won't be returned. Unless, they were recursively - imported in some way similar to :func:`~.web_poet.rules.consume_modules`. + If the ``default_registry`` had other ``@handle_urls`` decorators outside of + the packages/modules listed above, then the corresponding :class:`~.ApplyRule` + won't be returned. Unless, they were recursively imported in some way similar + to :func:`~.web_poet.rules.consume_modules`. """ for module in modules: From 9729e8f1b1691af417ebfe4c0719231231fa3ae7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 18:00:58 +0800 Subject: [PATCH 20/46] clean-up CHANGELOG formatting --- CHANGELOG.rst | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b26a043a..1c91b739 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,32 +6,34 @@ TBD --- * New ``ApplyRule`` class created by the ``@handle_urls`` decorator. This is - nearly identical with ``OverrideRule`` except that: + nearly identical with ``OverrideRule`` except: - * It's now accepting a ``to_return`` parameter which signifies the data - container class that the Page Object returns. - * Passing a string to ``for_patterns`` would auto-convert it into - ``url_matcher.Patterns``. - * All arguments are now keyword-only except for ``for_patterns``. + * It's now accepting a ``to_return`` parameter which signifies the data + container class that the Page Object returns. + * Passing a string to ``for_patterns`` would auto-convert it into + ``url_matcher.Patterns``. + * All arguments are now keyword-only except for ``for_patterns``. * Modify the call signature of ``handle_urls``: * New ``to_return`` parameter which signifies the data container class that - the Page Object returns. This behaves exactly the same with ``overrides`` - but it's more consistent with the attribute names of ``ApplyRules`` that - it creates. + the Page Object returns. This behaves exactly the same with the existing + ``overrides`` parameter but it's more consistent with the attribute names + of ``ApplyRule``. * New ``instead_of`` parameter which does the same thing with ``overrides``. * The old ``overrides`` parameter is not required anymore as it's set for deprecation. +* Documentation, test, and warning message improvements. + Deprecations: * The ``overrides`` parameter from ``@handle_urls`` is now deprecated. Use the ``instead_of`` parameter instead. -* ``OverrideRule`` is now deprecated. Use ``ApplyRule`` instead. +* The ``OverrideRule`` class is now deprecated. Use ``ApplyRule`` instead. * The ``from_override_rules`` method of ``PageObjectRegistry`` is now deprecated. Use ``from_apply_rules`` instead. -* The ``web_poet.overrides`` is deprecated. Use ``web_poet.rules`` instead. +* The ``web_poet.overrides`` module is deprecated. Use ``web_poet.rules`` instead. * The ``PageObjectRegistry.get_overrides`` is deprecated. Use ``PageObjectRegistry.get_rules`` instead. * The ``PageObjectRegistry.search_overrides`` is deprecated. From 8efa813ca6cb8a396beafd552d0be01f4ca6a278 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 19:51:57 +0800 Subject: [PATCH 21/46] update CHANGELOG to soften the value of the 'to_return' param --- CHANGELOG.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1c91b739..9915e1d1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,15 +14,14 @@ TBD ``url_matcher.Patterns``. * All arguments are now keyword-only except for ``for_patterns``. -* Modify the call signature of ``handle_urls``: +* Modify the call signature and behavior of ``handle_urls``: - * New ``to_return`` parameter which signifies the data container class that - the Page Object returns. This behaves exactly the same with the existing - ``overrides`` parameter but it's more consistent with the attribute names - of ``ApplyRule``. * New ``instead_of`` parameter which does the same thing with ``overrides``. * The old ``overrides`` parameter is not required anymore as it's set for deprecation. + * It sets a ``to_return`` parameter when creating ``ApplyRule`` based on the + declared item class in subclasses of ``web_poet.ItemPage``. It's also + possible to pass a ``to_return`` parameter on more advanced use cases. * Documentation, test, and warning message improvements. From 52a6f474bd97d1626eaf153241496a7a78c40df6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 20:17:44 +0800 Subject: [PATCH 22/46] update override docs to change the tone about the 'to_return' parameter --- docs/intro/overrides.rst | 47 ++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 9abbbc39..cb9ce328 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -9,11 +9,12 @@ matching rules is handled by another library called `url-matcher <https://url-matcher.readthedocs.io>`_. Using such rules establishes the core concept of Overrides wherein a developer -could declare that for a given set of URL patterns a specific Page Object must -be used: +could declare that for a given set of URL patterns, a specific Page Object must +be used instead of another Page Object. - * instead of another Page Object, or - * which returns an Item Class instead of some other Item Class. +The :class:`~.ApplyRule` also supports pointing to the item returned by a specific +Page Object if it both matches the URL pattern and the item class specified in the +rule. This enables **web-poet** to be used effectively by other frameworks like `scrapy-poet <https://scrapy-poet.readthedocs.io>`_. @@ -143,36 +144,41 @@ the following: .. code-block:: python - from web_poet import handle_urls, WebPage, Returns + from web_poet import handle_urls, WebPage - class GenericProductPage(WebPage, Returns[Product]): + class GenericProductPage(WebPage[Product]): def to_item(self) -> Product: return Product(product_title=self.css("title::text").get()) - @handle_urls("example.com", to_return=Product) - class ExampleProductPage(WebPage): + @handle_urls("example.com") + class ExampleProductPage(WebPage[Product]): def to_item(self) -> Product: ... # more specific parsing - @handle_urls("anotherexample.com", to_return=Product, exclude="/digital-goods/") - class AnotherExampleProductPage(WebPage): + @handle_urls("anotherexample.com", exclude="/digital-goods/") + class AnotherExampleProductPage(WebPage[Product]): def to_item(self) -> Product: ... # more specific parsing - @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], to_return=Product) - class DualExampleProductPage(WebPage): + @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"]) + class DualExampleProductPage(WebPage[Product]): def to_item(self) -> SimilarProduct: ... # more specific parsing Let's break this example down: - The URL patterns are exactly the same as with the previous code example. - - The ``instead_of`` parameter has been replaced with ``to_return`` which - accepts an Item Class. This means that: + - The ``@handle_urls`` is able to derive the returned Item Class (i.e. + ``Product``) from the respective Page Object. For advanced use cases, you + can pass the ``to_return`` parameter directly to the ``@handle_urls`` + decorator where it replaces any derived values. + - The ``instead_of`` parameter has been removed in lieu of the derived Item + Class from the Page Object which is passed as a ``to_return`` parameter + when creating the :class:`~.ApplyRule`. This means that: - If a ``Product`` Item Class is requested for URLs matching with the "example.com" pattern, then the ``Product`` Item Class would come from @@ -208,10 +214,10 @@ either contexts of Page Objects and Item Classes. .. code-block:: python - from web_poet import handle_urls, WebPage, Returns + from web_poet import handle_urls, WebPage - class GenericProductPage(WebPage, Returns[Product]): + class GenericProductPage(WebPage[Product]): def to_item(self) -> Product: return Product(product_title=self.css("title::text").get()) @@ -223,21 +229,16 @@ either contexts of Page Objects and Item Classes. @handle_urls("anotherexample.com", instead_of=GenericProductPage, exclude="/digital-goods/") - class AnotherExampleProductPage(WebPage, Returns[Product]): + class AnotherExampleProductPage(WebPage[Product]): def to_item(self) -> Product: ... # more specific parsing @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], instead_of=GenericProductPage) - class DualExampleProductPage(WebPage, Returns[SimilarProduct]): + class DualExampleProductPage(WebPage[SimilarProduct]): def to_item(self) -> SimilarProduct: ... # more specific parsing -Notice that the ``@handle_urls`` decorator for ``AnotherExampleProductPage`` and -``DualExampleProductPage`` doesn't specify the ``to_return`` parameter at all. -This is because it's able to directly derive the returned Item Class from the -respective ``Returns[Product]`` and ``Returns[SimilarProduct]`` declarations. - See the next :ref:`retrieving-overrides` section observe what are the actual :class:`~.ApplyRule` that were created by the ``@handle_urls`` decorators. From 2cb518b4fcf574ab6fd24e653ac9d1341ae135f1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevin@zyte.com> Date: Fri, 14 Oct 2022 20:33:09 +0800 Subject: [PATCH 23/46] Apply naming and grammar suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves <adrian@chaves.io> Co-authored-by: Mikhail Korobov <kmike84@gmail.com> --- CHANGELOG.rst | 4 ++-- docs/advanced/fields.rst | 2 +- docs/intro/overrides.rst | 10 +++++----- tests/test_rules.py | 2 +- web_poet/rules.py | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9915e1d1..c6a79560 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -33,9 +33,9 @@ Deprecations: * The ``from_override_rules`` method of ``PageObjectRegistry`` is now deprecated. Use ``from_apply_rules`` instead. * The ``web_poet.overrides`` module is deprecated. Use ``web_poet.rules`` instead. -* The ``PageObjectRegistry.get_overrides`` is deprecated. +* The ``PageObjectRegistry.get_overrides`` method is deprecated. Use ``PageObjectRegistry.get_rules`` instead. -* The ``PageObjectRegistry.search_overrides`` is deprecated. +* The ``PageObjectRegistry.search_overrides`` method is deprecated. Use ``PageObjectRegistry.search_rules`` instead. 0.5.1 (2022-09-23) diff --git a/docs/advanced/fields.rst b/docs/advanced/fields.rst index 77ecf05e..d35be98a 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -151,7 +151,7 @@ function when accessing the fields: Now any field can be converted from sync to async, or the other way around, and the code would keep working. -.. _`item-classes`: +.. _item-classes: Item classes ------------ diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index cb9ce328..3aa5faad 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -3,7 +3,7 @@ Overrides ========= -Overrides are simply rules represented by a list of :class:`~.ApplyRule` which +Overrides are rules represented by a list of :class:`~.ApplyRule` which associates which URL patterns a particular Page Object would be used. The URL matching rules is handled by another library called `url-matcher <https://url-matcher.readthedocs.io>`_. @@ -56,7 +56,7 @@ Creating Overrides ------------------ To simplify the code examples in the next few subsections, let's consider that -these Item Classes has been predefined. +these Item Classes has been predefined: .. code-block:: python @@ -108,7 +108,7 @@ Let's take a look at how the following code is structured: The code above declares that: - - Page Objects return ``Product`` and ``SimilarProduct`` item class. Returning + - Page Objects return ``Product`` and ``SimilarProduct`` item classes. Returning item classes is a preferred approach as explained in the :ref:`web-poet-fields` section. - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` @@ -239,7 +239,7 @@ either contexts of Page Objects and Item Classes. def to_item(self) -> SimilarProduct: ... # more specific parsing -See the next :ref:`retrieving-overrides` section observe what are the actual +See the next :ref:`retrieving-overrides` section to observe what are the actual :class:`~.ApplyRule` that were created by the ``@handle_urls`` decorators. @@ -356,7 +356,7 @@ This can be done something like: .. _`intro-rule-subset`: Using only a subset of the available ApplyRules -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Suppose that the use case from the previous section has changed wherein a subset of :class:`~.ApplyRule` would be used. This could be achieved by diff --git a/tests/test_rules.py b/tests/test_rules.py index 2e9745f8..5770bf25 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -171,7 +171,7 @@ def test_registry_get_overrides_deprecation() -> None: rules = default_registry.get_overrides() # It should still work as usual - assert len(rules) == 15 + assert len(rules) == len(default_registry.get_rules()) def test_consume_module_not_existing() -> None: diff --git a/web_poet/rules.py b/web_poet/rules.py index 56e148ff..94cee5df 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -72,7 +72,7 @@ class PageObjectRegistry(dict): """This contains the :class:`~.ApplyRule` that associates the Page Objects alongside its Items for a given URL matching rule. - Note that it's simply a ``dict`` subclass with added functionalities on + PageObjectRegistry is a ``dict`` subclass with added functionalities on storing, retrieving, and searching for the :class:`~.ApplyRule` instances. The **value** represents the :class:`~.ApplyRule` instance from which the Page Object in the **key** is allowed to be used. Since it's essentially a @@ -214,7 +214,7 @@ def get_rules(self) -> List[ApplyRule]: """Returns all the :class:`~.ApplyRule` that were declared using the ``@handle_urls`` decorator. - .. warning:: + .. note:: Remember to consider calling :func:`~.web_poet.rules.consume_modules` beforehand to recursively import all submodules which contains the @@ -229,7 +229,7 @@ def get_overrides(self) -> List[ApplyRule]: return self.get_rules() def search_rules(self, **kwargs) -> List[ApplyRule]: - """Returns any :class:`ApplyRule` that has any of its attributes + """Return any :class:`ApplyRule` that has any of its attributes match the rules inside the registry. Sample usage: From 88c511d51cfdf64f1b4ae69563f7a97f8cf61cff Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 20:49:31 +0800 Subject: [PATCH 24/46] test improvements --- pyproject.toml | 2 ++ tests/po_lib_to_return/__init__.py | 6 +++--- tests/test_fields.py | 4 ++-- tests/test_rules.py | 29 ++++++++++++++++++----------- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6cf3bce2..efea5ff0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,4 +9,6 @@ no_warn_no_return = true [[tool.mypy.overrides]] module = "tests.po_lib_to_return.*" +# Ignore mypy errors since the Page Objects contain arbitrary reference values +# used for assertions which have varying types. This upsets mypy. ignore_errors = true diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index d0c40eea..eb05241a 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -22,7 +22,7 @@ class ProductMoreFields(Product): @attrs.define -class ProductLessFields: +class ProductFewerFields: name: str @@ -102,7 +102,7 @@ def brand(self) -> str: @handle_urls("example.com", instead_of=ProductPage) class LessProductPage( - ProductPage, Returns[ProductLessFields], skip_nonitem_fields=True + ProductPage, Returns[ProductFewerFields], skip_nonitem_fields=True ): """A custom PO inheriting from a base PO returning less items using a different item class. @@ -110,7 +110,7 @@ class LessProductPage( expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) - expected_to_return = ProductLessFields + expected_to_return = ProductFewerFields expected_meta = {} @field diff --git a/tests/test_fields.py b/tests/test_fields.py index 38086129..ccd3b149 100644 --- a/tests/test_fields.py +++ b/tests/test_fields.py @@ -12,7 +12,7 @@ LessProductPage, MoreProductPage, Product, - ProductLessFields, + ProductFewerFields, ProductMoreFields, ProductPage, ProductSimilar, @@ -412,7 +412,7 @@ async def test_field_with_handle_urls() -> None: page = LessProductPage() assert page.name == "name" - assert await page.to_item() == ProductLessFields(name="name") + assert await page.to_item() == ProductFewerFields(name="name") for page in [ # type: ignore[assignment] CustomProductPage(), diff --git a/tests/test_rules.py b/tests/test_rules.py index 5770bf25..ea816a26 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -1,5 +1,3 @@ -import warnings - import attrs import pytest from url_matcher import Patterns @@ -190,14 +188,27 @@ def test_list_page_objects_all_consume() -> None: def test_registry_search_rules() -> None: + # param: use rules = default_registry.search_rules(use=POTopLevel2) assert len(rules) == 1 assert rules[0].use == POTopLevel2 + # param: instead_of rules = default_registry.search_rules(instead_of=POTopLevelOverriden2) assert len(rules) == 1 assert rules[0].instead_of == POTopLevelOverriden2 + # param: to_return + rules = default_registry.search_rules(to_return=Product) + assert len(rules) == 3 + assert all([r for r in rules if r.use == Product]) + + # params: to_return and use + rules = default_registry.search_rules(to_return=Product, use=ImprovedProductPage) + assert len(rules) == 1 + assert rules[0].to_return == Product + assert rules[0].use == ImprovedProductPage + # Such rules doesn't exist rules = default_registry.search_rules(use=POModuleOverriden) assert len(rules) == 0 @@ -249,20 +260,16 @@ def test_from_override_rules_deprecation() -> None: def test_handle_urls_deprecation() -> None: - - with warnings.catch_warnings(record=True) as w: + msg = ( + "The 'overrides' parameter in @handle_urls is deprecated. Use the " + "'instead_of' parameter." + ) + with pytest.warns(DeprecationWarning, match=msg): @handle_urls("example.com", overrides=CustomProductPage) class PageWithDeprecatedOverrides: ... - w = [x for x in w if x.category is DeprecationWarning] - assert len(w) == 1 - assert str(w[0].message) == ( - "The 'overrides' parameter in @handle_urls is deprecated. Use the " - "'instead_of' parameter." - ) - def test_override_rule_deprecation() -> None: msg = ( From 928188e2c1267654ff12492e129cc6ecf0a7e251 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 21:08:44 +0800 Subject: [PATCH 25/46] remove 'preferred' param of get_item_cls() --- web_poet/_typing.py | 4 +--- web_poet/rules.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/web_poet/_typing.py b/web_poet/_typing.py index 9455ea28..9f2578c3 100644 --- a/web_poet/_typing.py +++ b/web_poet/_typing.py @@ -24,9 +24,7 @@ def get_generic_parameter(cls): return args[0] -def get_item_cls(cls, default=None, preferred=None): - if preferred: - return preferred +def get_item_cls(cls, default=None): param = get_generic_parameter(cls) if param is None or isinstance(param, typing.TypeVar): # class is not parametrized return default diff --git a/web_poet/rules.py b/web_poet/rules.py index 94cee5df..e0422950 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -192,7 +192,7 @@ def wrapper(cls): ), use=cls, instead_of=instead_of or overrides, - to_return=get_item_cls(cls, preferred=to_return), + to_return=to_return or get_item_cls(cls), meta=kwargs, ) # If it was already defined, we don't want to override it From 3679b59720e53fa74b7f58568310930b75418ab7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 21:22:38 +0800 Subject: [PATCH 26/46] update default behavior of @handle_urls to return dict instead of None --- tests/po_lib/__init__.py | 6 +++--- tests/po_lib/a_module.py | 2 +- tests/po_lib/nested_package/__init__.py | 2 +- tests/po_lib/nested_package/a_nested_module.py | 2 +- tests/po_lib_sub/__init__.py | 2 +- tests/po_lib_to_return/__init__.py | 2 +- web_poet/_typing.py | 2 +- web_poet/pages.py | 2 +- web_poet/rules.py | 5 ++--- 9 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 04f15a32..4a4a7f17 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -14,7 +14,7 @@ class POBase(ItemPage): expected_instead_of: Type[ItemPage] expected_patterns: Patterns - expected_to_return = None + expected_to_return: Any = None expected_meta: Dict[str, Any] @@ -35,7 +35,7 @@ class POTopLevelOverriden2(ItemPage): class POTopLevel1(POBase): expected_instead_of = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) - expected_to_return = None + expected_to_return = dict expected_meta = {} # type: ignore @@ -43,5 +43,5 @@ class POTopLevel1(POBase): class POTopLevel2(POBase): expected_instead_of = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) - expected_to_return = None + expected_to_return = dict expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index 941f48f6..474bc3f0 100644 --- a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -12,5 +12,5 @@ class POModuleOverriden(ItemPage): class POModule(POBase): expected_instead_of = POModuleOverriden expected_patterns = Patterns(["example.com"]) - expected_to_return = None + expected_to_return = dict expected_meta = {"extra_arg": "foo"} # type: ignore diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index 63547ea4..1c722f46 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -16,5 +16,5 @@ class PONestedPkgOverriden(ItemPage): class PONestedPkg(POBase): expected_instead_of = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) - expected_to_return = None + expected_to_return = dict expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index 5d44b39c..c7473761 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -18,5 +18,5 @@ class PONestedModule(POBase): expected_patterns = Patterns( include=["example.com", "example.org"], exclude=["/*.jpg|"] ) - expected_to_return = None + expected_to_return = dict expected_meta = {} # type: ignore diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 4889ef38..9b6278a0 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -22,5 +22,5 @@ class POLibSubOverriden(ItemPage): class POLibSub(POBase): expected_instead_of = POLibSubOverriden expected_patterns = Patterns(["sub_example.com"]) - expected_to_return = None + expected_to_return = dict expected_meta = {} # type: ignore diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index eb05241a..e9457337 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -32,7 +32,7 @@ class SomePage(ItemPage): expected_instead_of = None expected_patterns = Patterns(["example.com"]) - expected_to_return = None + expected_to_return = dict expected_meta = {} @field diff --git a/web_poet/_typing.py b/web_poet/_typing.py index 9f2578c3..b69e4d8f 100644 --- a/web_poet/_typing.py +++ b/web_poet/_typing.py @@ -24,7 +24,7 @@ def get_generic_parameter(cls): return args[0] -def get_item_cls(cls, default=None): +def get_item_cls(cls, default=dict): param = get_generic_parameter(cls) if param is None or isinstance(param, typing.TypeVar): # class is not parametrized return default diff --git a/web_poet/pages.py b/web_poet/pages.py index 77b39af3..d0944f03 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -47,7 +47,7 @@ class Returns(typing.Generic[ItemT]): @property def item_cls(self) -> typing.Type[ItemT]: """Item class""" - return get_item_cls(self.__class__, default=dict) + return get_item_cls(self.__class__) class ItemPage(Injectable, Returns[ItemT]): diff --git a/web_poet/rules.py b/web_poet/rules.py index e0422950..1c0e799e 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -40,9 +40,8 @@ class ApplyRule: pattern represented by the ``for_patterns`` attribute is matched. * ``instead_of`` - *(optional)* The Page Object that will be **replaced** with the Page Object specified via the ``use`` parameter. - * ``to_return`` - *(optional)* The data container class that will be - **replaced** by the item returned by the Page Object specified via the - ``use`` parameter. + * ``to_return`` - *(optional)* The Item class that marks the Page Object + to be **used** which is capable of returning that Item Class. * ``meta`` - *(optional)* Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. From fc0ba50610cfc78c9c463e7eab52b20a365c3985 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 21:57:32 +0800 Subject: [PATCH 27/46] improve the docstring of handle_urls() --- docs/intro/overrides.rst | 2 ++ web_poet/rules.py | 32 +++++++++++++++++++------------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 3aa5faad..a95519a9 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -134,6 +134,8 @@ The code above declares that: further customized. You can read some of the specific parameters in the :ref:`API section <api-overrides>` of :func:`web_poet.handle_urls`. +.. _item-class-example: + Item Class ~~~~~~~~~~ diff --git a/web_poet/rules.py b/web_poet/rules.py index 1c0e799e..24634db6 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -145,19 +145,25 @@ def handle_urls( **kwargs, ): """ - Class decorator that indicates that the decorated Page Object should be - used instead of the overridden one for a particular set the URLs. - - The Page Object that is **overridden** is declared using the ``instead_of`` - parameter. - - The Item Class could also be **overridden** using the ``to_return`` - parameter. - - The **override** mechanism only works on certain URLs that match the - ``include`` and ``exclude`` parameters. See the documentation of the - `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more - information about them. + Class decorator that indicates that the decorated Page Object should work + for the given URL patterns. + + The URL patterns are matched using the ``include`` and ``exclude`` + parameters while ``priority`` breaks any ties. See the documentation + of the `url-matcher <https://url-matcher.readthedocs.io/>`_ package for + more information about them. + + This decorator is able to derive the item class returned by the Page + Object (see :ref:`item-class-example` section for some examples). This is + important since it marks what type of item the Page Object is capable of + returning for the given URL patterns. For certain advanced cases, you can + pass a ``to_return`` parameter which replaces any derived values (though + this isn't generally recommended). + + Passing another Page Object into the ``instead_of`` parameter indicates + that the decorated Page Object will be used instead of that for the given + set of URL patterns. This is the concept of **overrides** (see the + :ref:`intro-overrides` section for more info`). Any extra parameters are stored as meta information that can be later used. From 5967e34c4342399477a17635dcd090c7e58361e4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Fri, 14 Oct 2022 21:58:23 +0800 Subject: [PATCH 28/46] update docs by removing tick mark chars in anchors --- docs/advanced/additional-requests.rst | 14 +++++++------- docs/api-reference.rst | 4 ++-- docs/index.rst | 2 +- docs/intro/from-ground-up.rst | 2 +- docs/intro/overrides.rst | 16 ++++++++-------- docs/intro/tutorial.rst | 2 +- docs/license.rst | 2 +- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/advanced/additional-requests.rst b/docs/advanced/additional-requests.rst index c9effa47..76b8c701 100644 --- a/docs/advanced/additional-requests.rst +++ b/docs/advanced/additional-requests.rst @@ -1,4 +1,4 @@ -.. _`advanced-requests`: +.. _advanced-requests: =================== Additional Requests @@ -27,7 +27,7 @@ The key words "MUST”, "MUST NOT”, "REQUIRED”, "SHALL”, "SHALL NOT”, "S "SHOULD NOT”, "RECOMMENDED”, "MAY”, and "OPTIONAL” in this document are to be interpreted as described in RFC `2119 <https://www.ietf.org/rfc/rfc2119.txt>`_. -.. _`httprequest-example`: +.. _httprequest-example: HttpRequest =========== @@ -271,7 +271,7 @@ The key take aways for this example are: available. -.. _`httpclient`: +.. _httpclient: HttpClient ========== @@ -337,7 +337,7 @@ additional requests using the :meth:`~.HttpClient.request`, :meth:`~.HttpClient. and :meth:`~.HttpClient.post` methods of :class:`~.HttpClient`. These already define the :class:`~.HttpRequest` and executes it as well. -.. _`httpclient-get-example`: +.. _httpclient-get-example: A simple ``GET`` request ------------------------ @@ -376,7 +376,7 @@ There are a few things to take note in this example: * There is no need create an instance of :class:`~.HttpRequest` when :meth:`~.HttpClient.get` is used. -.. _`request-post-example`: +.. _request-post-example: A ``POST`` request with `header` and `body` ------------------------------------------- @@ -459,7 +459,7 @@ quick shortcuts for :meth:`~.HttpClient.request`: Thus, apart from the common ``GET`` and ``POST`` HTTP methods, you can use :meth:`~.HttpClient.request` for them (`e.g.` ``HEAD``, ``PUT``, ``DELETE``, etc). -.. _`http-batch-request-example`: +.. _http-batch-request-example: Batch requests -------------- @@ -567,7 +567,7 @@ The key takeaways for this example are: first response from a group of requests as early as possible. However, the order could be shuffled. -.. _`exception-handling`: +.. _exception-handling: Handling Exceptions in Page Objects =================================== diff --git a/docs/api-reference.rst b/docs/api-reference.rst index 02c141d0..a82544c4 100644 --- a/docs/api-reference.rst +++ b/docs/api-reference.rst @@ -1,4 +1,4 @@ -.. _`api-reference`: +.. _api-reference: ============= API Reference @@ -81,7 +81,7 @@ Exceptions :show-inheritance: :members: -.. _`api-overrides`: +.. _api-overrides: Overrides ========= diff --git a/docs/index.rst b/docs/index.rst index 77c852e0..e88157b4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -53,7 +53,7 @@ and the motivation behind ``web-poet``, start with :ref:`from-ground-up`. changelog license -.. _`web-poet`: https://github.com/scrapinghub/web-poet +.. _web-poet: https://github.com/scrapinghub/web-poet .. _Scrapy: https://scrapy.org/ .. _scrapy-poet: https://github.com/scrapinghub/scrapy-poet diff --git a/docs/intro/from-ground-up.rst b/docs/intro/from-ground-up.rst index 9af252d5..406eebcc 100644 --- a/docs/intro/from-ground-up.rst +++ b/docs/intro/from-ground-up.rst @@ -1,4 +1,4 @@ -.. _`from-ground-up`: +.. _from-ground-up: =========================== web-poet from the ground up diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index a95519a9..68424c88 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -1,4 +1,4 @@ -.. _`intro-overrides`: +.. _intro-overrides: Overrides ========= @@ -206,7 +206,7 @@ Object. Aside from the item extracted by the Page Object, there might be some other convenience methods or other data from it that you want to access. -.. _`combination`: +.. _combination: Combination ~~~~~~~~~~~ @@ -245,7 +245,7 @@ See the next :ref:`retrieving-overrides` section to observe what are the actual :class:`~.ApplyRule` that were created by the ``@handle_urls`` decorators. -.. _`retrieving-overrides`: +.. _retrieving-overrides: Retrieving all available Overrides ---------------------------------- @@ -303,7 +303,7 @@ Developers have the option to import existing Page Objects alongside the :class:`~.ApplyRule` attached to them. This section aims to showcase different scenarios that come up when using multiple Page Object Projects. -.. _`intro-rule-all`: +.. _intro-rule-all: Using all available ApplyRules from multiple Page Object Projects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -355,7 +355,7 @@ This can be done something like: runtime duration. Calling :func:`~.web_poet.overrides.consume_modules` again makes no difference unless a new set of modules are provided. -.. _`intro-rule-subset`: +.. _intro-rule-subset: Using only a subset of the available ApplyRules ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -395,7 +395,7 @@ conveniently select for :class:`~.ApplyRule` which conform to a specific criteri allows you to conveniently drill down to which :class:`~.ApplyRule` you're interested in using. -.. _`overrides-custom-registry`: +.. _overrides-custom-registry: After gathering all the pre-selected rules, we can then store it in a new instance of :class:`~.PageObjectRegistry` in order to separate it from the ``default_registry`` @@ -409,7 +409,7 @@ for this: my_new_registry = PageObjectRegistry.from_apply_rules(rules) -.. _`intro-improve-po`: +.. _intro-improve-po: Improving on external Page Objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -506,7 +506,7 @@ remained different. There are two main ways we recommend in solving this. -.. _`priority-resolution`: +.. _priority-resolution: **1. Priority Resolution** diff --git a/docs/intro/tutorial.rst b/docs/intro/tutorial.rst index ab30bbbb..2bf72ec4 100644 --- a/docs/intro/tutorial.rst +++ b/docs/intro/tutorial.rst @@ -1,4 +1,4 @@ -.. _`intro-tutorial`: +.. _intro-tutorial: ===================== web-poet on a surface diff --git a/docs/license.rst b/docs/license.rst index e6a41ca8..e647e180 100644 --- a/docs/license.rst +++ b/docs/license.rst @@ -1,4 +1,4 @@ -.. _`license`: +.. _license: ======= License From 0626e578b73843c11ec306a92946ee5d3bf78040 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Mon, 17 Oct 2022 14:13:43 +0800 Subject: [PATCH 29/46] rename some *.com URLs into *.example in docs and tests --- docs/intro/overrides.rst | 126 +++++++++--------- tests/po_lib_sub/__init__.py | 4 +- .../po_lib_sub_not_imported/__init__.py | 4 +- 3 files changed, 67 insertions(+), 67 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 68424c88..92660efa 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -89,19 +89,19 @@ Let's take a look at how the following code is structured: return Product(product_title=self.css("title::text").get()) - @handle_urls("example.com", instead_of=GenericProductPage) + @handle_urls("some.example", instead_of=GenericProductPage) class ExampleProductPage(WebPage): def to_item(self) -> Product: ... # more specific parsing - @handle_urls("anotherexample.com", instead_of=GenericProductPage, exclude="/digital-goods/") + @handle_urls("another.example", instead_of=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(WebPage): def to_item(self) -> Product: ... # more specific parsing - @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], instead_of=GenericProductPage) + @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage): def to_item(self) -> SimilarProduct: ... # more specific parsing @@ -111,21 +111,21 @@ The code above declares that: - Page Objects return ``Product`` and ``SimilarProduct`` item classes. Returning item classes is a preferred approach as explained in the :ref:`web-poet-fields` section. - - For sites that match the ``example.com`` pattern, ``ExampleProductPage`` + - For sites that match the ``some.example`` pattern, ``ExampleProductPage`` would be used instead of ``GenericProductPage``. - The same is true for ``DualExampleProductPage`` where it is used instead of ``GenericProductPage`` for two URL patterns which works as something like: - - :sub:`(match) https://www.dualexample.com/shop/electronics/?product=123` - - :sub:`(match) https://www.dualexample.com/shop/books/paperback/?product=849` - - :sub:`(NO match) https://www.dualexample.com/on-sale/books/?product=923` - - :sub:`(match) https://www.dualexample.net/store/kitchen/?pid=776` - - :sub:`(match) https://www.dualexample.net/store/?pid=892` - - :sub:`(NO match) https://www.dualexample.net/new-offers/fitness/?pid=892` + - :sub:`(match) https://www.dual.example/shop/electronics/?product=123` + - :sub:`(match) https://www.dual.example/shop/books/paperback/?product=849` + - :sub:`(NO match) https://www.dual.example/on-sale/books/?product=923` + - :sub:`(match) https://www.uk.dual.example/store/kitchen/?pid=776` + - :sub:`(match) https://www.uk.dual.example/store/?pid=892` + - :sub:`(NO match) https://www.uk.dual.example/new-offers/fitness/?pid=892` - On the other hand, ``AnotherExampleProductPage`` is only used instead of - ``GenericProductPage`` when we're handling pages from ``anotherexample.com`` + ``GenericProductPage`` when we're handling pages from ``another.example`` that doesn't contain ``/digital-goods/`` in its URL path. .. tip:: @@ -154,19 +154,19 @@ the following: return Product(product_title=self.css("title::text").get()) - @handle_urls("example.com") + @handle_urls("some.example") class ExampleProductPage(WebPage[Product]): def to_item(self) -> Product: ... # more specific parsing - @handle_urls("anotherexample.com", exclude="/digital-goods/") + @handle_urls("another.example", exclude="/digital-goods/") class AnotherExampleProductPage(WebPage[Product]): def to_item(self) -> Product: ... # more specific parsing - @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"]) + @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage[Product]): def to_item(self) -> SimilarProduct: ... # more specific parsing @@ -183,13 +183,13 @@ Let's break this example down: when creating the :class:`~.ApplyRule`. This means that: - If a ``Product`` Item Class is requested for URLs matching with the - "example.com" pattern, then the ``Product`` Item Class would come from + "some.example" pattern, then the ``Product`` Item Class would come from the ``to_item()`` method of ``ExampleProductPage``. - - Similarly, if the URL matches with "anotherexample.com" without the + - Similarly, if the URL matches with "another.example" without the "/digital-goods/" path, then the ``Product`` Item Class comes from the ``AnotherExampleProductPage`` Page Object. - However, if a ``Product`` Item Class is requested matching with the URL - pattern of "dualexample.com/shop/?product=*", a ``SimilarProduct`` + pattern of "dual.example/shop/?product=*", a ``SimilarProduct`` Item Class is returned by the ``DualExampleProductPage``'s ``to_item()`` method instead. @@ -224,19 +224,19 @@ either contexts of Page Objects and Item Classes. return Product(product_title=self.css("title::text").get()) - @handle_urls("example.com", instead_of=GenericProductPage, to_return=Product) + @handle_urls("some.example", instead_of=GenericProductPage, to_return=Product) class ExampleProductPage(WebPage): def to_item(self) -> Product: ... # more specific parsing - @handle_urls("anotherexample.com", instead_of=GenericProductPage, exclude="/digital-goods/") + @handle_urls("another.example", instead_of=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(WebPage[Product]): def to_item(self) -> Product: ... # more specific parsing - @handle_urls(["dualexample.com/shop/?product=*", "dualexample.net/store/?pid=*"], instead_of=GenericProductPage) + @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage[SimilarProduct]): def to_item(self) -> SimilarProduct: ... # more specific parsing @@ -264,9 +264,9 @@ would be: for r in rules: print(r) - # ApplyRule(for_patterns=Patterns(include=('example.com',), exclude=(), priority=500), use=<class 'ExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'Product'>, meta={}) - # ApplyRule(for_patterns=Patterns(include=('anotherexample.com',), exclude=('/digital-goods/',), priority=500), use=<class 'AnotherExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'Product'>, meta={}) - # ApplyRule(for_patterns=Patterns(include=('dualexample.com/shop/?product=*', 'dualexample.net/store/?pid=*'), exclude=(), priority=500), use=<class 'DualExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'SimilarProduct'>, meta={}) + # ApplyRule(for_patterns=Patterns(include=('some.example',), exclude=(), priority=500), use=<class 'ExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'Product'>, meta={}) + # ApplyRule(for_patterns=Patterns(include=('another.example',), exclude=('/digital-goods/',), priority=500), use=<class 'AnotherExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'Product'>, meta={}) + # ApplyRule(for_patterns=Patterns(include=('dual.example/shop/?product=*', 'uk.dual.example/store/?pid=*'), exclude=(), priority=500), use=<class 'DualExampleProductPage'>, instead_of=<class 'GenericProductPage'>, to_return=<class 'SimilarProduct'>, meta={}) Remember that using ``@handle_urls`` to annotate the Page Objects would result in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. @@ -343,10 +343,10 @@ This can be done something like: # The collected rules would then be as follows: print(rules) - # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) - # 4. ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 1. ApplyRule(for_patterns=Patterns(include=['site_1.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_3.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) .. note:: @@ -377,18 +377,18 @@ Here's an example of how you could manually select the rules using the ecom_rules = default_registry.search_rules(instead_of=ecommerce_page_objects.EcomGenericPage) print(ecom_rules) - # ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_1.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) gadget_rules = default_registry.search_rules(use=gadget_sites_page_objects.site_3.GadgetSite3) print(gadget_rules) - # ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_3.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) rules = ecom_rules + gadget_rules print(rules) - # ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_1.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_3.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) As you can see, using the :meth:`~.PageObjectRegistry.search_rules` method allows you to conveniently select for :class:`~.ApplyRule` which conform to a specific criteria. This @@ -435,28 +435,28 @@ have the first approach as an example: # The collected rules would then be as follows: print(rules) - # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) - # 4. ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 1. ApplyRule(for_patterns=Patterns(include=['site_1.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_3.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) - @handle_urls("site_1.com", instead_of=ecommerce_page_objects.EcomGenericPage, priority=1000) + @handle_urls("site_1.example", instead_of=ecommerce_page_objects.EcomGenericPage, priority=1000) class ImprovedEcomSite1(ecommerce_page_objects.site_1.EcomSite1): def to_item(self): ... # call super().to_item() and improve on the item's shortcomings rules = default_registry.get_rules() print(rules) - # 1. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) - # 4. ApplyRule(for_patterns=Patterns(include=['site_3.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) - # 5. ApplyRule(for_patterns=Patterns(include=['site_1.com'], exclude=[], priority=1000), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 1. ApplyRule(for_patterns=Patterns(include=['site_1.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_1.EcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_3.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_3.GadgetSite3'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 5. ApplyRule(for_patterns=Patterns(include=['site_1.example'], exclude=[], priority=1000), use=<class 'my_project.ImprovedEcomSite1'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) Notice that we're adding a new :class:`~.ApplyRule` for the same URL pattern -for ``site_1.com``. +for ``site_1.example``. -When the time comes that a Page Object needs to be selected when parsing ``site_1.com`` +When the time comes that a Page Object needs to be selected when parsing ``site_1.example`` and it needs to replace ``ecommerce_page_objects.EcomGenericPage``, rules **#1** and **#5** will be the choices. However, since we've assigned a much **higher priority** for the new rule in **#5** than the default ``500`` value, rule **#5** will be @@ -478,18 +478,18 @@ available rules: .. code-block:: python - # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) - # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'ecommerce_page_objects.EcomGenericPage'>, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'gadget_sites_page_objects.GadgetGenericPage'>, to_return=None, meta={}) However, it's technically **NOT** a `conflict`, **yet**, since: - - ``ecommerce_page_objects.site_2.EcomSite2`` would only be used in **site_2.com** + - ``ecommerce_page_objects.site_2.EcomSite2`` would only be used in **site_2.example** if ``ecommerce_page_objects.EcomGenericPage`` is to be replaced. - The same case with ``gadget_sites_page_objects.site_2.GadgetSite2`` wherein - it's only going to be utilized for **site_2.com** if the following is to be + it's only going to be utilized for **site_2.example** if the following is to be replaced: ``gadget_sites_page_objects.GadgetGenericPage``. -It would be only become a conflict if both rules for **site_2.com** `intend to +It would be only become a conflict if both rules for **site_2.example** `intend to replace the` **same** `Page Object`. However, let's suppose that there are some :class:`~.ApplyRule` which actually @@ -498,8 +498,8 @@ result in a conflict. To give an example, let's suppose that rules **#2** and ** .. code-block:: python - # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) - # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) Notice that the ``instead_of`` param are the same and only the ``use`` param remained different. @@ -545,7 +545,7 @@ Here's an example: from web_poet import default_registry, consume_modules, handle_urls import ecommerce_page_objects, gadget_sites_page_objects, common_items - @handle_urls("site_2.com", instead_of=common_items.ProductGenericPage, priority=1000) + @handle_urls("site_2.example", instead_of=common_items.ProductGenericPage, priority=1000) class EcomSite2Copy(ecommerce_page_objects.site_1.EcomSite1): def to_item(self): return super().to_item() @@ -555,10 +555,10 @@ the new :class:`~.ApplyRule` having a much higher priority (see rule **#4**): .. code-block:: python - # 2. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) - # 3. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) + # 2. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'ecommerce_page_objects.site_2.EcomSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) + # 3. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=500), use=<class 'gadget_sites_page_objects.site_2.GadgetSite2'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) - # 4. ApplyRule(for_patterns=Patterns(include=['site_2.com'], exclude=[], priority=1000), use=<class 'my_project.EcomSite2Copy'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) + # 4. ApplyRule(for_patterns=Patterns(include=['site_2.example'], exclude=[], priority=1000), use=<class 'my_project.EcomSite2Copy'>, instead_of=<class 'common_items.ProductGenericPage'>, to_return=None, meta={}) A similar idea was also discussed in the :ref:`intro-improve-po` section. @@ -591,9 +591,9 @@ easily find the Page Object's rule using its `key`. Here's an example: consume_modules("package_A", "package_B", "package_C") rules = [ - default_registry[package_A.PageObject1], # ApplyRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) - default_registry[package_B.PageObject2], # ApplyRule(for_patterns=Patterns(include=['site_B.com'], exclude=[], priority=500), use=<class 'package_B.PageObject2'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) - default_registry[package_C.PageObject3], # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) + default_registry[package_A.PageObject1], # ApplyRule(for_patterns=Patterns(include=['site_A.example'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) + default_registry[package_B.PageObject2], # ApplyRule(for_patterns=Patterns(include=['site_B.example'], exclude=[], priority=500), use=<class 'package_B.PageObject2'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) + default_registry[package_C.PageObject3], # ApplyRule(for_patterns=Patterns(include=['site_C.example'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) ] Another approach would be using the :meth:`~.PageObjectRegistry.search_rules` @@ -614,17 +614,17 @@ Here's an example: rule_from_A = default_registry.search_rules(use=package_A.PageObject1) print(rule_from_A) - # [ApplyRule(for_patterns=Patterns(include=['site_A.com'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, to_return=None, meta={})] + # [ApplyRule(for_patterns=Patterns(include=['site_A.example'], exclude=[], priority=500), use=<class 'package_A.PageObject1'>, instead_of=<class 'GenericPage'>, to_return=None, meta={})] rule_from_B = default_registry.search_rules(instead_of=GenericProductPage) print(rule_from_B) # [] - rule_from_C = default_registry.search_rules(for_patterns=Patterns(include=["site_C.com"])) + rule_from_C = default_registry.search_rules(for_patterns=Patterns(include=["site_C.example"])) print(rule_from_C) # [ - # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}), - # ApplyRule(for_patterns=Patterns(include=['site_C.com'], exclude=[], priority=1000), use=<class 'package_C.PageObject3_improved'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) + # ApplyRule(for_patterns=Patterns(include=['site_C.example'], exclude=[], priority=500), use=<class 'package_C.PageObject3'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}), + # ApplyRule(for_patterns=Patterns(include=['site_C.example'], exclude=[], priority=1000), use=<class 'package_C.PageObject3_improved'>, instead_of=<class 'GenericPage'>, to_return=None, meta={}) # ] rules = rule_from_A + rule_from_B + rule_from_C diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 9b6278a0..7b367239 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -18,9 +18,9 @@ class POLibSubOverriden(ItemPage): ... -@handle_urls("sub_example.com", instead_of=POLibSubOverriden) +@handle_urls("sub.example", instead_of=POLibSubOverriden) class POLibSub(POBase): expected_instead_of = POLibSubOverriden - expected_patterns = Patterns(["sub_example.com"]) + expected_patterns = Patterns(["sub.example"]) expected_to_return = dict expected_meta = {} # type: ignore diff --git a/tests_extra/po_lib_sub_not_imported/__init__.py b/tests_extra/po_lib_sub_not_imported/__init__.py index e803b29f..0327f4ef 100644 --- a/tests_extra/po_lib_sub_not_imported/__init__.py +++ b/tests_extra/po_lib_sub_not_imported/__init__.py @@ -21,9 +21,9 @@ class POLibSubOverridenNotImported: ... -@handle_urls("sub_example_not_imported.com", instead_of=POLibSubOverridenNotImported) +@handle_urls("sub_not_imported.example", instead_of=POLibSubOverridenNotImported) class POLibSubNotImported(POBase): expected_instead_of = POLibSubOverridenNotImported - expected_patterns = Patterns(["sub_example_not_imported.com"]) + expected_patterns = Patterns(["sub_not_imported.example"]) expected_to_return = None expected_meta = {} # type: ignore From de15a86b8497b1dd40b29668a184409cc80f7177 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Tue, 18 Oct 2022 14:56:23 +0800 Subject: [PATCH 30/46] revert default 'to_return=dict' and use 'None' instead --- tests/po_lib/__init__.py | 4 ++-- tests/po_lib/a_module.py | 2 +- tests/po_lib/nested_package/__init__.py | 2 +- tests/po_lib/nested_package/a_nested_module.py | 2 +- tests/po_lib_sub/__init__.py | 2 +- tests/po_lib_to_return/__init__.py | 2 +- web_poet/_typing.py | 2 +- web_poet/pages.py | 2 +- web_poet/rules.py | 9 ++++++++- 9 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/po_lib/__init__.py b/tests/po_lib/__init__.py index 4a4a7f17..91ae0036 100644 --- a/tests/po_lib/__init__.py +++ b/tests/po_lib/__init__.py @@ -35,7 +35,7 @@ class POTopLevelOverriden2(ItemPage): class POTopLevel1(POBase): expected_instead_of = POTopLevelOverriden1 expected_patterns = Patterns(["example.com"], ["/*.jpg|"], priority=300) - expected_to_return = dict + expected_to_return = None expected_meta = {} # type: ignore @@ -43,5 +43,5 @@ class POTopLevel1(POBase): class POTopLevel2(POBase): expected_instead_of = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) - expected_to_return = dict + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/a_module.py b/tests/po_lib/a_module.py index 474bc3f0..941f48f6 100644 --- a/tests/po_lib/a_module.py +++ b/tests/po_lib/a_module.py @@ -12,5 +12,5 @@ class POModuleOverriden(ItemPage): class POModule(POBase): expected_instead_of = POModuleOverriden expected_patterns = Patterns(["example.com"]) - expected_to_return = dict + expected_to_return = None expected_meta = {"extra_arg": "foo"} # type: ignore diff --git a/tests/po_lib/nested_package/__init__.py b/tests/po_lib/nested_package/__init__.py index 1c722f46..63547ea4 100644 --- a/tests/po_lib/nested_package/__init__.py +++ b/tests/po_lib/nested_package/__init__.py @@ -16,5 +16,5 @@ class PONestedPkgOverriden(ItemPage): class PONestedPkg(POBase): expected_instead_of = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) - expected_to_return = dict + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib/nested_package/a_nested_module.py b/tests/po_lib/nested_package/a_nested_module.py index c7473761..5d44b39c 100644 --- a/tests/po_lib/nested_package/a_nested_module.py +++ b/tests/po_lib/nested_package/a_nested_module.py @@ -18,5 +18,5 @@ class PONestedModule(POBase): expected_patterns = Patterns( include=["example.com", "example.org"], exclude=["/*.jpg|"] ) - expected_to_return = dict + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib_sub/__init__.py b/tests/po_lib_sub/__init__.py index 7b367239..50ffd99b 100644 --- a/tests/po_lib_sub/__init__.py +++ b/tests/po_lib_sub/__init__.py @@ -22,5 +22,5 @@ class POLibSubOverriden(ItemPage): class POLibSub(POBase): expected_instead_of = POLibSubOverriden expected_patterns = Patterns(["sub.example"]) - expected_to_return = dict + expected_to_return = None expected_meta = {} # type: ignore diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index e9457337..eb05241a 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -32,7 +32,7 @@ class SomePage(ItemPage): expected_instead_of = None expected_patterns = Patterns(["example.com"]) - expected_to_return = dict + expected_to_return = None expected_meta = {} @field diff --git a/web_poet/_typing.py b/web_poet/_typing.py index b69e4d8f..9f2578c3 100644 --- a/web_poet/_typing.py +++ b/web_poet/_typing.py @@ -24,7 +24,7 @@ def get_generic_parameter(cls): return args[0] -def get_item_cls(cls, default=dict): +def get_item_cls(cls, default=None): param = get_generic_parameter(cls) if param is None or isinstance(param, typing.TypeVar): # class is not parametrized return default diff --git a/web_poet/pages.py b/web_poet/pages.py index d0944f03..77b39af3 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -47,7 +47,7 @@ class Returns(typing.Generic[ItemT]): @property def item_cls(self) -> typing.Type[ItemT]: """Item class""" - return get_item_cls(self.__class__) + return get_item_cls(self.__class__, default=dict) class ItemPage(Injectable, Returns[ItemT]): diff --git a/web_poet/rules.py b/web_poet/rules.py index 24634db6..fdd5dbd0 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -46,9 +46,16 @@ class ApplyRule: This doesn't do anything for now but may be useful for future API updates. The main functionality of this class lies in the ``instead_of`` and ``to_return`` - parameter. Should both of these be omitted, then :class:`~.ApplyRule` simply + parameters. Should both of these be omitted, then :class:`~.ApplyRule` simply tags which URL patterns the given Page Object is expected to be used. + As much as possible, the ``to_return`` parameter should capture the Item Class + that the Page Object is capable of returning. Before passing it to :class:`~.ApplyRule`, + the ``to_return`` value is primarily derived from the return class specified + from Page Objects that are subclasses of :class:`~.ItemPage`. However, a + special case exists when a Page Object returns a ``dict`` as an item but then + the rule should have ``to_return=None`` and **NOT** ``to_return=dict``. + More information regarding its usage in :ref:`intro-overrides`. .. tip:: From f354c4aad1b1c2d3616193c594bef150c39a59a7 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Tue, 18 Oct 2022 18:17:00 +0800 Subject: [PATCH 31/46] improve docs and code comments --- docs/advanced/fields.rst | 22 +++---- docs/intro/overrides.rst | 92 ++++++++++++++++++------------ tests/po_lib_to_return/__init__.py | 8 +-- tests/test_rules.py | 2 +- web_poet/pages.py | 2 +- web_poet/rules.py | 6 +- 6 files changed, 76 insertions(+), 56 deletions(-) diff --git a/docs/advanced/fields.rst b/docs/advanced/fields.rst index 3b499a2b..25467819 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -185,7 +185,7 @@ Item classes ------------ In all previous examples, ``to_item`` methods are returning ``dict`` -instances. It is common to use item classes (e.g. dataclasses or +instances. It is common to use Item Classes (e.g. dataclasses or attrs instances) instead of unstructured dicts to hold the data: .. code-block:: python @@ -209,7 +209,7 @@ attrs instances) instead of unstructured dicts to hold the data: ) :mod:`web_poet.fields` supports it, by allowing to parametrize -:class:`~.ItemPage` with an item class: +:class:`~.ItemPage` with an Item Class: .. code-block:: python @@ -217,13 +217,13 @@ attrs instances) instead of unstructured dicts to hold the data: class ProductPage(ItemPage[Product]): # ... -When :class:`~.ItemPage` is parametrized with an item class, +When :class:`~.ItemPage` is parametrized with an Item Class, its ``to_item()`` method starts to return item instances, instead of ``dict`` instances. In the example above ``ProductPage.to_item`` method returns ``Product`` instances. -Defining an Item class may be an overkill if you only have a single Page Object, -but item classes are of a great help when +Defining an Item Class may be an overkill if you only have a single Page Object, +but Item Classes are of a great help when * you need to extract data in the same format from multiple websites, or * if you want to define the schema upfront. @@ -231,7 +231,7 @@ but item classes are of a great help when Error prevention ~~~~~~~~~~~~~~~~ -Item classes play particularly well with the +Item Classes play particularly well with the :func:`@field <web_poet.fields.field>` decorator, preventing some of the errors, which may happen if results are plain "dicts". @@ -256,7 +256,7 @@ Consider the following badly written page object: def nane(self): return self.response.css(".name").get() -Because the ``Product`` item class is used, a typo ("nane" instead of "name") +Because the ``Product`` Item Class is used, a typo ("nane" instead of "name") is detected at runtime: the creation of a ``Product`` instance would fail with a ``TypeError``, because of the unexpected keyword argument "nane". @@ -265,7 +265,7 @@ detected: the ``price`` argument is required, but there is no extraction method this attribute, so ``Product.__init__`` will raise another ``TypeError``, indicating that a required argument is missing. -Without an item class, none of these errors are detected. +Without an Item Class, none of these errors are detected. Changing Item Class ~~~~~~~~~~~~~~~~~~~ @@ -335,7 +335,7 @@ to the item: # ... Note how :class:`~.Returns` is used as one of the base classes of -``CustomFooPage``; it allows to change the item class returned by a page object. +``CustomFooPage``; it allows to change the Item Class returned by a page object. Removing fields (as well as renaming) is a bit more tricky. @@ -370,13 +370,13 @@ is passed, and ``name`` is the only field ``CustomItem`` supports. To recap: -* Use ``Returns[NewItemType]`` to change the item class in a subclass. +* Use ``Returns[NewItemType]`` to change the Item Class in a subclass. * Don't use ``skip_nonitem_fields=True`` when your Page Object corresponds to an item exactly, or when you're only adding fields. This is a safe approach, which allows to detect typos in field names, even for optional fields. * Use ``skip_nonitem_fields=True`` when it's possible for the Page Object - to contain more ``@fields`` than defined in the item class, e.g. because + to contain more ``@fields`` than defined in the Item Class, e.g. because Page Object is inherited from some other base Page Object. Caching diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 92660efa..452a4f7d 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -13,7 +13,7 @@ could declare that for a given set of URL patterns, a specific Page Object must be used instead of another Page Object. The :class:`~.ApplyRule` also supports pointing to the item returned by a specific -Page Object if it both matches the URL pattern and the item class specified in the +Page Object if it both matches the URL pattern and the Item Class specified in the rule. This enables **web-poet** to be used effectively by other frameworks like @@ -55,8 +55,8 @@ Let's see this in action by declaring the Overrides in the Page Objects below. Creating Overrides ------------------ -To simplify the code examples in the next few subsections, let's consider that -these Item Classes has been predefined: +To simplify the code examples in the next few subsections, let's assume that +these Item Classes have been predefined: .. code-block:: python @@ -91,26 +91,23 @@ Let's take a look at how the following code is structured: @handle_urls("some.example", instead_of=GenericProductPage) class ExampleProductPage(WebPage): - def to_item(self) -> Product: - ... # more specific parsing + ... # more specific parsing @handle_urls("another.example", instead_of=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(WebPage): - def to_item(self) -> Product: - ... # more specific parsing + ... # more specific parsing @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage): - def to_item(self) -> SimilarProduct: - ... # more specific parsing + ... # more specific parsing The code above declares that: - - Page Objects return ``Product`` and ``SimilarProduct`` item classes. Returning - item classes is a preferred approach as explained in the :ref:`web-poet-fields` - section. + - The Page Objects return ``Product`` and ``SimilarProduct`` Item Classes. + Returning Item Classes is a preferred approach as explained in the + :ref:`web-poet-fields` section. - For sites that match the ``some.example`` pattern, ``ExampleProductPage`` would be used instead of ``GenericProductPage``. - The same is true for ``DualExampleProductPage`` where it is used @@ -124,9 +121,10 @@ The code above declares that: - :sub:`(match) https://www.uk.dual.example/store/?pid=892` - :sub:`(NO match) https://www.uk.dual.example/new-offers/fitness/?pid=892` - - On the other hand, ``AnotherExampleProductPage`` is only used instead of - ``GenericProductPage`` when we're handling pages from ``another.example`` - that doesn't contain ``/digital-goods/`` in its URL path. + - On the other hand, ``AnotherExampleProductPage`` is used instead of + ``GenericProductPage`` when we're handling pages that match with the + ``another.example`` URL Pattern which doesn't contain ``/digital-goods/`` + in its URL path. .. tip:: @@ -156,20 +154,17 @@ the following: @handle_urls("some.example") class ExampleProductPage(WebPage[Product]): - def to_item(self) -> Product: - ... # more specific parsing + ... # more specific parsing @handle_urls("another.example", exclude="/digital-goods/") class AnotherExampleProductPage(WebPage[Product]): - def to_item(self) -> Product: - ... # more specific parsing + ... # more specific parsing - @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"], instead_of=GenericProductPage) + @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"]) class DualExampleProductPage(WebPage[Product]): - def to_item(self) -> SimilarProduct: - ... # more specific parsing + ... # more specific parsing Let's break this example down: @@ -178,24 +173,24 @@ Let's break this example down: ``Product``) from the respective Page Object. For advanced use cases, you can pass the ``to_return`` parameter directly to the ``@handle_urls`` decorator where it replaces any derived values. - - The ``instead_of`` parameter has been removed in lieu of the derived Item - Class from the Page Object which is passed as a ``to_return`` parameter - when creating the :class:`~.ApplyRule`. This means that: + - The ``instead_of`` parameter can be omitted in lieu of the derived Item + Class from the Page Object which becomes the ``to_return`` attribute in + :class:`~.ApplyRule` instances. This means that: - If a ``Product`` Item Class is requested for URLs matching with the "some.example" pattern, then the ``Product`` Item Class would come from the ``to_item()`` method of ``ExampleProductPage``. - - Similarly, if the URL matches with "another.example" without the - "/digital-goods/" path, then the ``Product`` Item Class comes from the - ``AnotherExampleProductPage`` Page Object. + - Similarly, if a page with a URL matches with "another.example" without + the "/digital-goods/" path, then the ``Product`` Item Class comes from + the ``AnotherExampleProductPage`` Page Object. - However, if a ``Product`` Item Class is requested matching with the URL pattern of "dual.example/shop/?product=*", a ``SimilarProduct`` Item Class is returned by the ``DualExampleProductPage``'s ``to_item()`` method instead. This has the same concept as with the Page Object Overrides but the context changes -from the Page Object into the Item Classes returned by the ``to_item()`` method -instead. +from using a Page Object into Item Classes instead (returned by the ``to_item()`` +method). The upside here is that you're able to directly access the item extracted by the Page Object for the given site. There's no need to directly call the ``to_item()`` @@ -226,20 +221,45 @@ either contexts of Page Objects and Item Classes. @handle_urls("some.example", instead_of=GenericProductPage, to_return=Product) class ExampleProductPage(WebPage): - def to_item(self) -> Product: - ... # more specific parsing + ... # more specific parsing @handle_urls("another.example", instead_of=GenericProductPage, exclude="/digital-goods/") class AnotherExampleProductPage(WebPage[Product]): - def to_item(self) -> Product: - ... # more specific parsing + ... # more specific parsing @handle_urls(["dual.example/shop/?product=*", "uk.dual.example/store/?pid=*"], instead_of=GenericProductPage) class DualExampleProductPage(WebPage[SimilarProduct]): - def to_item(self) -> SimilarProduct: - ... # more specific parsing + ... # more specific parsing + +To recap some previously mentioned behaviors: + + - In the ``@handle_urls`` decorator for ``ExampleProductPage``, we needed to + pass the ``to_return`` parameter since the Page Object didn't specify the + Item Class that it returns (i.e. ``class ExampleProductPage(WebPage[Product])``). + If it did, we could omit the ``to_return`` parameter entirely since it's + able to automatically derive the Item Class. + - The recommended way is to avoid using the ``to_return`` parameter in + ``@handle_urls`` entirely (similar to how ``AnotherExampleProductPage`` + and ``DualExampleProductPage`` does it). + +.. warning:: + + Only use the ``to_return`` parameter in the ``@handle_urls`` decorator for + more advanced use cases since it presents the risk of having the Item Class + returned by the Page Object diverge with the actual ``to_return`` value in + the :class:`~.ApplyRule`. For example: + + .. code-block:: python + + @handle_urls("some.example", to_return=DifferentProduct) + class ProductPage(WebPage[Product]) + ... # more specific parsing + + The Page Object returns a ``Product`` item but the created :class:`~.ApplyRule` + has ``to_return=DifferentProduct``. + See the next :ref:`retrieving-overrides` section to observe what are the actual :class:`~.ApplyRule` that were created by the ``@handle_urls`` decorators. diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index eb05241a..17d2e69a 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -75,7 +75,7 @@ def name(self) -> str: @handle_urls("example.com", instead_of=ProductPage) class SimilarProductPage(ProductPage, Returns[ProductSimilar]): """A custom PO inheriting from a base PO returning the same fields but in - a different item class. + a different Item Class. """ expected_instead_of = ProductPage @@ -87,7 +87,7 @@ class SimilarProductPage(ProductPage, Returns[ProductSimilar]): @handle_urls("example.com", instead_of=ProductPage) class MoreProductPage(ProductPage, Returns[ProductMoreFields]): """A custom PO inheriting from a base PO returning more items using a - different item class. + different Item Class. """ expected_instead_of = ProductPage @@ -105,7 +105,7 @@ class LessProductPage( ProductPage, Returns[ProductFewerFields], skip_nonitem_fields=True ): """A custom PO inheriting from a base PO returning less items using a - different item class. + different Item Class. """ expected_instead_of = ProductPage @@ -121,7 +121,7 @@ def brand(self) -> str: @handle_urls("example.com", instead_of=ProductPage, to_return=ProductSimilar) class CustomProductPage(ProductPage, Returns[Product]): """A custom PO inheriting from a base PO returning the same fields but in - a different item class. + a different Item Class. This PO is the same with ``SimilarProductPage`` but passes a ``to_return`` in the ``@handle_urls`` decorator. diff --git a/tests/test_rules.py b/tests/test_rules.py index ea816a26..dd9826dc 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -81,7 +81,7 @@ def test_apply_rule_uniqueness() -> None: instead_of=POTopLevelOverriden2, to_return=ProductSimilar, ) - # A different item class results in different hash. + # A different Item Class results in different hash. assert hash(rule1) != hash(rule2) diff --git a/web_poet/pages.py b/web_poet/pages.py index 77b39af3..f54e86ba 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -41,7 +41,7 @@ def is_injectable(cls: typing.Any) -> bool: class Returns(typing.Generic[ItemT]): - """Inherit from this generic mixin to change the item class used by + """Inherit from this generic mixin to change the Item Class used by :class:`~.ItemPage`""" @property diff --git a/web_poet/rules.py b/web_poet/rules.py index fdd5dbd0..43f8c462 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -40,7 +40,7 @@ class ApplyRule: pattern represented by the ``for_patterns`` attribute is matched. * ``instead_of`` - *(optional)* The Page Object that will be **replaced** with the Page Object specified via the ``use`` parameter. - * ``to_return`` - *(optional)* The Item class that marks the Page Object + * ``to_return`` - *(optional)* The Item Class that marks the Page Object to be **used** which is capable of returning that Item Class. * ``meta`` - *(optional)* Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. @@ -92,7 +92,7 @@ class PageObjectRegistry(dict): from web_poet import handle_urls, default_registry, WebPage @handle_urls("example.com", instead_of=ProductPageObject) - class ExampleComProductPage(WebPage): + class ExampleComProductPage(WebPage[ProductItem]): ... override_rules = default_registry.get_rules() @@ -160,7 +160,7 @@ def handle_urls( of the `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more information about them. - This decorator is able to derive the item class returned by the Page + This decorator is able to derive the Item Class returned by the Page Object (see :ref:`item-class-example` section for some examples). This is important since it marks what type of item the Page Object is capable of returning for the given URL patterns. For certain advanced cases, you can From bce97be36bc4c6a6e2ca5889d3a2f2c2e440fef0 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 19 Oct 2022 13:33:21 +0800 Subject: [PATCH 32/46] improve docstring of 'search_rules()' --- web_poet/rules.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index 43f8c462..a1cace39 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -241,15 +241,17 @@ def get_overrides(self) -> List[ApplyRule]: return self.get_rules() def search_rules(self, **kwargs) -> List[ApplyRule]: - """Return any :class:`ApplyRule` that has any of its attributes - match the rules inside the registry. + """Return any :class:`ApplyRule` from the registry that matches with all + of the provided attributes. Sample usage: .. code-block:: python rules = registry.search_rules(use=ProductPO, instead_of=GenericPO) - print(len(rules)) # 1 + print(len(rules)) # 1 + print(rules[0].use) # ProductPO + print(rules[0].instead_of) # GenericPO """ From 4e00ea8c22054adfbb858e9160eb1fd6af9f1a82 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevin@zyte.com> Date: Tue, 25 Oct 2022 14:15:52 +0800 Subject: [PATCH 33/46] Improve the docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves <adrian@chaves.io> --- CHANGELOG.rst | 2 +- docs/advanced/fields.rst | 2 +- docs/intro/overrides.rst | 36 ++++++++++++++++++++---------------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c6a79560..c283579a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,7 +16,7 @@ TBD * Modify the call signature and behavior of ``handle_urls``: - * New ``instead_of`` parameter which does the same thing with ``overrides``. + * New ``instead_of`` parameter which does the same thing as ``overrides``. * The old ``overrides`` parameter is not required anymore as it's set for deprecation. * It sets a ``to_return`` parameter when creating ``ApplyRule`` based on the diff --git a/docs/advanced/fields.rst b/docs/advanced/fields.rst index 25467819..85073d04 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -181,7 +181,7 @@ processing, so it's preferable to use field processors instead. .. _item-classes: -Item classes +Item Classes ------------ In all previous examples, ``to_item`` methods are returning ``dict`` diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 452a4f7d..c58f6233 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -122,9 +122,9 @@ The code above declares that: - :sub:`(NO match) https://www.uk.dual.example/new-offers/fitness/?pid=892` - On the other hand, ``AnotherExampleProductPage`` is used instead of - ``GenericProductPage`` when we're handling pages that match with the - ``another.example`` URL Pattern which doesn't contain ``/digital-goods/`` - in its URL path. + ``GenericProductPage`` when we're handling pages that match the + ``another.example`` URL Pattern, which doesn't contain + ``/digital-goods/`` in its URL path. .. tip:: @@ -169,10 +169,8 @@ the following: Let's break this example down: - The URL patterns are exactly the same as with the previous code example. - - The ``@handle_urls`` is able to derive the returned Item Class (i.e. - ``Product``) from the respective Page Object. For advanced use cases, you - can pass the ``to_return`` parameter directly to the ``@handle_urls`` - decorator where it replaces any derived values. + - The ``@handle_urls`` decorator determines the Item Class to return (i.e. + ``Product``) from the decorated Page Object. - The ``instead_of`` parameter can be omitted in lieu of the derived Item Class from the Page Object which becomes the ``to_return`` attribute in :class:`~.ApplyRule` instances. This means that: @@ -188,17 +186,23 @@ Let's break this example down: Item Class is returned by the ``DualExampleProductPage``'s ``to_item()`` method instead. -This has the same concept as with the Page Object Overrides but the context changes -from using a Page Object into Item Classes instead (returned by the ``to_item()`` -method). +Specifying the Item Class that a Page Object returns makes it possible for +web-poet frameworks to make Page Object usage transparent to end users. -The upside here is that you're able to directly access the item extracted by the -Page Object for the given site. There's no need to directly call the ``to_item()`` -method to receive it. +For example, a web-poet framework could implement a function like: -However, one downside to this approach is that you lose access to the actual Page -Object. Aside from the item extracted by the Page Object, there might be some -other convenience methods or other data from it that you want to access. +.. code-block:: python + + item = get_item(url, item_class=Product) + +Here there is no reference to the Page Object being used underneath, you only +need to indicate the desired Item Class, and the web-poet framework +automatically determines the Page Object to use based on the specified URL and +the specified Item Class. + +Note, however, that web-poet frameworks are encouraged to also allow getting a +Page Object instead of an Item Class instance, for scenarios where end users +wish access to Page Object attributes and methods. .. _combination: From 59381a59e7797d606a7d54926fca8a818cb5a72c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Tue, 25 Oct 2022 14:57:01 +0800 Subject: [PATCH 34/46] add reference link to Page Objects in Overrides tutorial --- docs/intro/overrides.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index c58f6233..8ea13b53 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -4,9 +4,9 @@ Overrides ========= Overrides are rules represented by a list of :class:`~.ApplyRule` which -associates which URL patterns a particular Page Object would be used. The URL -matching rules is handled by another library called -`url-matcher <https://url-matcher.readthedocs.io>`_. +associates which URL patterns a particular Page Object (see :ref:`Page Objects +introduced here <from-ground-up>`) would be used. The URL matching rules is +handled by another library called `url-matcher <https://url-matcher.readthedocs.io>`_. Using such rules establishes the core concept of Overrides wherein a developer could declare that for a given set of URL patterns, a specific Page Object must From 076e7bb5ae74b700789ea8a2e74882d66253747c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 26 Oct 2022 14:35:33 +0800 Subject: [PATCH 35/46] remove mention of 'to_return' in @handle_url doc examples --- docs/intro/overrides.rst | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 8ea13b53..25c825c5 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -223,8 +223,8 @@ either contexts of Page Objects and Item Classes. return Product(product_title=self.css("title::text").get()) - @handle_urls("some.example", instead_of=GenericProductPage, to_return=Product) - class ExampleProductPage(WebPage): + @handle_urls("some.example", instead_of=GenericProductPage) + class ExampleProductPage(WebPage[Product]): ... # more specific parsing @@ -237,34 +237,6 @@ either contexts of Page Objects and Item Classes. class DualExampleProductPage(WebPage[SimilarProduct]): ... # more specific parsing -To recap some previously mentioned behaviors: - - - In the ``@handle_urls`` decorator for ``ExampleProductPage``, we needed to - pass the ``to_return`` parameter since the Page Object didn't specify the - Item Class that it returns (i.e. ``class ExampleProductPage(WebPage[Product])``). - If it did, we could omit the ``to_return`` parameter entirely since it's - able to automatically derive the Item Class. - - The recommended way is to avoid using the ``to_return`` parameter in - ``@handle_urls`` entirely (similar to how ``AnotherExampleProductPage`` - and ``DualExampleProductPage`` does it). - -.. warning:: - - Only use the ``to_return`` parameter in the ``@handle_urls`` decorator for - more advanced use cases since it presents the risk of having the Item Class - returned by the Page Object diverge with the actual ``to_return`` value in - the :class:`~.ApplyRule`. For example: - - .. code-block:: python - - @handle_urls("some.example", to_return=DifferentProduct) - class ProductPage(WebPage[Product]) - ... # more specific parsing - - The Page Object returns a ``Product`` item but the created :class:`~.ApplyRule` - has ``to_return=DifferentProduct``. - - See the next :ref:`retrieving-overrides` section to observe what are the actual :class:`~.ApplyRule` that were created by the ``@handle_urls`` decorators. From 1419f7a30aeeda580c9d3560de016bfde00b598f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 26 Oct 2022 15:26:06 +0800 Subject: [PATCH 36/46] improve tests --- tests/po_lib_to_return/__init__.py | 22 ++++++ tests/test_rules.py | 113 ++++++++++++++++++++++++++--- 2 files changed, 125 insertions(+), 10 deletions(-) diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index 17d2e69a..e87a0f18 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -10,6 +10,12 @@ class Product: price: float +@attrs.define +class ProductSeparate: + name: str + price: float + + @attrs.define class ProductSimilar: name: str @@ -72,6 +78,22 @@ def name(self) -> str: return "improved name" +@handle_urls("example.com", instead_of=ProductPage) +class SeparateProductPage(ItemPage[ProductSeparate]): + """Same case as with ``ImprovedProductPage`` but it doesn't inherit from + ``ProductPage``. + """ + + expected_instead_of = ProductPage + expected_patterns = Patterns(["example.com"]) + expected_to_return = ProductSeparate + expected_meta = {} + + @field + def name(self) -> str: + return "separate name" + + @handle_urls("example.com", instead_of=ProductPage) class SimilarProductPage(ProductPage, Returns[ProductSimilar]): """A custom PO inheriting from a base PO returning the same fields but in diff --git a/tests/test_rules.py b/tests/test_rules.py index dd9826dc..ffb7a348 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -2,7 +2,12 @@ import pytest from url_matcher import Patterns -from tests.po_lib import POTopLevel1, POTopLevel2, POTopLevelOverriden2 +from tests.po_lib import ( + POTopLevel1, + POTopLevel2, + POTopLevelOverriden1, + POTopLevelOverriden2, +) from tests.po_lib.a_module import POModule, POModuleOverriden from tests.po_lib.nested_package import PONestedPkg from tests.po_lib.nested_package.a_nested_module import PONestedModule @@ -17,6 +22,7 @@ Product, ProductPage, ProductSimilar, + SeparateProductPage, SimilarProductPage, SomePage, ) @@ -42,6 +48,7 @@ PONestedPkg, PONestedModule, ProductPage, + SeparateProductPage, SimilarProductPage, SomePage, } @@ -53,17 +60,18 @@ def test_apply_rule_uniqueness() -> None: """ patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) + patterns_b = Patterns(include=["example.com/b"]) rule1 = ApplyRule( for_patterns=patterns, use=POTopLevel1, - instead_of=POTopLevelOverriden2, + instead_of=POTopLevelOverriden1, meta={"key_1": 1}, ) rule2 = ApplyRule( for_patterns=patterns, use=POTopLevel1, - instead_of=POTopLevelOverriden2, + instead_of=POTopLevelOverriden1, meta={"key_2": 2}, ) # The ``meta`` parameter is ignored in the hash. @@ -72,18 +80,33 @@ def test_apply_rule_uniqueness() -> None: rule1 = ApplyRule( for_patterns=patterns, use=POTopLevel1, - instead_of=POTopLevelOverriden2, + instead_of=POTopLevelOverriden1, to_return=Product, ) rule2 = ApplyRule( for_patterns=patterns, use=POTopLevel1, - instead_of=POTopLevelOverriden2, + instead_of=POTopLevelOverriden1, to_return=ProductSimilar, ) # A different Item Class results in different hash. assert hash(rule1) != hash(rule2) + rule1 = ApplyRule( + for_patterns=patterns, + use=POTopLevel1, + instead_of=POTopLevelOverriden1, + to_return=Product, + ) + rule2 = ApplyRule( + for_patterns=patterns_b, + use=POTopLevel2, + instead_of=POTopLevelOverriden2, + to_return=ProductSimilar, + ) + # Totally different params affect the hash completely + assert hash(rule1) != hash(rule2) + def test_apply_rule_immutability() -> None: patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) @@ -91,11 +114,17 @@ def test_apply_rule_immutability() -> None: rule = ApplyRule( for_patterns=patterns, use=POTopLevel1, - instead_of=POTopLevelOverriden2, + instead_of=POTopLevelOverriden1, ) with pytest.raises(attrs.exceptions.FrozenInstanceError): - rule.use = POModule # type: ignore[misc] + rule.use = Patterns(include=["example.com/"]) # type: ignore[misc] + + with pytest.raises(attrs.exceptions.FrozenInstanceError): + rule.use = POTopLevel2 # type: ignore[misc] + + with pytest.raises(attrs.exceptions.FrozenInstanceError): + rule.use = POTopLevelOverriden2 # type: ignore[misc] def test_apply_rule_converter_on_pattern() -> None: @@ -171,6 +200,10 @@ def test_registry_get_overrides_deprecation() -> None: # It should still work as usual assert len(rules) == len(default_registry.get_rules()) + # but the rules from ``.get_overrides()`` should return ``ApplyRule`` and + # not the old ``OverrideRule``. + assert all([r for r in rules if isinstance(r, ApplyRule)]) + def test_consume_module_not_existing() -> None: with pytest.raises(ImportError): @@ -200,8 +233,22 @@ def test_registry_search_rules() -> None: # param: to_return rules = default_registry.search_rules(to_return=Product) - assert len(rules) == 3 - assert all([r for r in rules if r.use == Product]) + assert rules == [ + ApplyRule("example.com", use=ProductPage, to_return=Product), + ApplyRule( + "example.com", + use=ImprovedProductPage, + instead_of=ProductPage, + to_return=Product, + ), + ApplyRule( + "example.com", + # mypy complains here since it's expecting a container class when + # declared, i.e, ``ItemPage[SomeItem]`` + use=CustomProductPageDataTypeOnly, # type: ignore[arg-type] + to_return=Product, + ), + ] # params: to_return and use rules = default_registry.search_rules(to_return=Product, use=ImprovedProductPage) @@ -223,6 +270,10 @@ def test_registry_search_overrides_deprecation() -> None: assert len(rules) == 1 assert rules[0].use == POTopLevel2 + # The rules from ``.get_overrides()`` should return ``ApplyRule`` and + # not the old ``OverrideRule``. + assert isinstance(rules[0], ApplyRule) + def test_from_apply_rules() -> None: rules = [ @@ -239,7 +290,7 @@ def test_from_apply_rules() -> None: assert default_registry.get_rules() != rules -def test_from_override_rules_deprecation() -> None: +def test_from_override_rules_deprecation_using_ApplyRule() -> None: rules = [ ApplyRule( for_patterns=Patterns(include=["sample.com"]), @@ -259,7 +310,29 @@ def test_from_override_rules_deprecation() -> None: assert default_registry.get_rules() != rules +def test_from_override_rules_deprecation_using_OverrideRule() -> None: + rules = [ + OverrideRule( + for_patterns=Patterns(include=["sample.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + ) + ] + + msg = ( + "The 'from_override_rules' method is deprecated. " + "Use 'from_apply_rules' instead." + ) + with pytest.warns(DeprecationWarning, match=msg): + registry = PageObjectRegistry.from_override_rules(rules) + + assert registry.get_rules() == rules + assert default_registry.get_rules() != rules + + def test_handle_urls_deprecation() -> None: + before_count = len(default_registry.get_rules()) + msg = ( "The 'overrides' parameter in @handle_urls is deprecated. Use the " "'instead_of' parameter." @@ -270,6 +343,26 @@ def test_handle_urls_deprecation() -> None: class PageWithDeprecatedOverrides: ... + # Despite the deprecation, it should still properly add the rule in the + # registry. + after_count = len(default_registry.get_rules()) + assert after_count == before_count + 1 + + # The added rule should have its deprecated 'overrides' parameter converted + # into the new 'instead_of' parameter. + rules = default_registry.search_rules( + instead_of=CustomProductPage, use=PageWithDeprecatedOverrides + ) + assert rules == [ + ApplyRule( + "example.com", + instead_of=CustomProductPage, + # mypy complains here since it's expecting a container class when + # declared, i.e, ``ItemPage[SomeItem]`` + use=PageWithDeprecatedOverrides, # type: ignore[arg-type] + ) + ] + def test_override_rule_deprecation() -> None: msg = ( From 776cf0dd16a466504e64875307da86d4ee2a6115 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 26 Oct 2022 18:43:25 +0800 Subject: [PATCH 37/46] improve docstrings and warning messages --- web_poet/rules.py | 59 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index a1cace39..e47b4366 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -47,14 +47,54 @@ class ApplyRule: The main functionality of this class lies in the ``instead_of`` and ``to_return`` parameters. Should both of these be omitted, then :class:`~.ApplyRule` simply - tags which URL patterns the given Page Object is expected to be used. - - As much as possible, the ``to_return`` parameter should capture the Item Class - that the Page Object is capable of returning. Before passing it to :class:`~.ApplyRule`, - the ``to_return`` value is primarily derived from the return class specified - from Page Objects that are subclasses of :class:`~.ItemPage`. However, a - special case exists when a Page Object returns a ``dict`` as an item but then - the rule should have ``to_return=None`` and **NOT** ``to_return=dict``. + tags which URL patterns the given Page Object defined in ``use`` is expected + to be used. It works as: + + 1. Given a URL, match it against the ``for_patterns`` from the registry + rules. + 2. This could give us a collection of rules. We need to select one based + on the highest priority set by `url-matcher`_. + 3. When a single rule has been selected, use the the Page Object specified + in its ``use`` parameter. + + If ``instead_of=None``, this simply means that the Page Object assigned in + the ``use`` parameter will be utilized for all URLs matching the URL pattern + in ``for_patterns``. However, if ``instead_of=ReplacedPageObject``, then it + adds the expectation that the ``ReplacedPageObject`` wouldn't be used for + the given URLs matching ``for_patterns`` since the Page Object in ``use`` + will replace it. It works as: + + 1. Suppose that we have a rule that has ``use=ReplacedPageObject`` which + we want to use against a URL that matches against ``for_patterns``. + 2. Before using it, all of the rules from the registry must be checked if + other rules has ``instead_of=ReplacedPageObject`` and matches the + URL patterns in ``for_patterns``. + 3. If there are, these rules supersedes the original rule from #1. + 4. After selecting one based on the highest priority set by `url-matcher`_, + the Page Object declared in ``use`` should be used instead of + ``ReplacedPageObject``. + + The ``to_return`` parameter should capture the Item Class that the Page Object + is capable of returning. Before passing it to :class:`~.ApplyRule`, the + ``to_return`` value is primarily derived from the return class specified + from Page Objects that are subclasses of :class:`~.ItemPage` (see this + :ref:`example <item-class-example>`). However, a special case exists when a + Page Object returns a ``dict`` as an item but then the rule should have + ``to_return=None`` and **NOT** ``to_return=dict``. + + The ``to_return`` parameter is used as a shortcut to directly retrieve the + item from the Page Object to be used for a given URL. It works as: + + 1. Given a URL and and Item Class that we want, match it respectively + against ``for_patterns`` and ``to_return`` from the registry rules. + 2. This could give us a collection of rules. We need to select one based + on the highest priority set by `url-matcher`_. + 3. When a single rule has been selected, create an instance of the Page + Object specified in its ``use`` parameter. + 4. Finally, call the ``.to_item()`` method of the Page Object to retrieve + an instance of the Item Class. + + Using the ``to_return`` parameter basically adds the convenient step #4 above. More information regarding its usage in :ref:`intro-overrides`. @@ -192,7 +232,8 @@ def wrapper(cls): if overrides is not None: msg = ( "The 'overrides' parameter in @handle_urls is deprecated. " - "Use the 'instead_of' parameter." + "Use the 'instead_of' parameter instead. If both 'instead_of' " + "and 'overrides' are provided, the latter is ignored." ) warnings.warn(msg, DeprecationWarning, stacklevel=2) From 42bd1233098ec752f7a81313eb95ca9d798143f6 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevin@zyte.com> Date: Wed, 26 Oct 2022 18:45:33 +0800 Subject: [PATCH 38/46] Fix test case when ensuring that ApplyRule is frozen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrián Chaves <adrian@chaves.io> --- tests/test_rules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index ffb7a348..68781558 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -118,13 +118,13 @@ def test_apply_rule_immutability() -> None: ) with pytest.raises(attrs.exceptions.FrozenInstanceError): - rule.use = Patterns(include=["example.com/"]) # type: ignore[misc] + rule.for_patterns = Patterns(include=["example.com/"]) # type: ignore[misc] with pytest.raises(attrs.exceptions.FrozenInstanceError): rule.use = POTopLevel2 # type: ignore[misc] with pytest.raises(attrs.exceptions.FrozenInstanceError): - rule.use = POTopLevelOverriden2 # type: ignore[misc] + rule.instead_of = POTopLevelOverriden2 # type: ignore[misc] def test_apply_rule_converter_on_pattern() -> None: From 36cd866263f2081fa221a00c218568c5a4026fd1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 26 Oct 2022 18:54:17 +0800 Subject: [PATCH 39/46] update tests to check each param change on hash() --- tests/test_rules.py | 49 +++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 68781558..cdebe78b 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -77,35 +77,28 @@ def test_apply_rule_uniqueness() -> None: # The ``meta`` parameter is ignored in the hash. assert hash(rule1) == hash(rule2) - rule1 = ApplyRule( - for_patterns=patterns, - use=POTopLevel1, - instead_of=POTopLevelOverriden1, - to_return=Product, - ) - rule2 = ApplyRule( - for_patterns=patterns, - use=POTopLevel1, - instead_of=POTopLevelOverriden1, - to_return=ProductSimilar, - ) - # A different Item Class results in different hash. - assert hash(rule1) != hash(rule2) + params = [ + { + "for_patterns": patterns, + "use": POTopLevel1, + "instead_of": POTopLevelOverriden1, + "to_return": Product, + }, + { + "for_patterns": patterns_b, + "use": POTopLevel2, + "instead_of": POTopLevelOverriden2, + "to_return": ProductSimilar, + }, + ] - rule1 = ApplyRule( - for_patterns=patterns, - use=POTopLevel1, - instead_of=POTopLevelOverriden1, - to_return=Product, - ) - rule2 = ApplyRule( - for_patterns=patterns_b, - use=POTopLevel2, - instead_of=POTopLevelOverriden2, - to_return=ProductSimilar, - ) - # Totally different params affect the hash completely - assert hash(rule1) != hash(rule2) + for change in params[0].keys(): + # Changing any one of the params should result in a hash mismatch + rule1 = ApplyRule(**params[0]) + kwargs = params[0].copy() + kwargs.update({change: params[1][change]}) + rule2 = ApplyRule(**kwargs) + assert hash(rule1) != hash(rule2) def test_apply_rule_immutability() -> None: From 383b4f74261388d149138804f4d0d41791f22691 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal <kevinoxy@gmail.com> Date: Wed, 26 Oct 2022 23:16:49 +0800 Subject: [PATCH 40/46] update 'Item Class' to 'item class' --- docs/advanced/fields.rst | 22 ++++++++++---------- docs/intro/overrides.rst | 32 +++++++++++++++--------------- tests/po_lib_to_return/__init__.py | 8 ++++---- web_poet/pages.py | 2 +- web_poet/rules.py | 14 ++++++------- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/docs/advanced/fields.rst b/docs/advanced/fields.rst index 85073d04..64e39619 100644 --- a/docs/advanced/fields.rst +++ b/docs/advanced/fields.rst @@ -185,7 +185,7 @@ Item Classes ------------ In all previous examples, ``to_item`` methods are returning ``dict`` -instances. It is common to use Item Classes (e.g. dataclasses or +instances. It is common to use item classes (e.g. dataclasses or attrs instances) instead of unstructured dicts to hold the data: .. code-block:: python @@ -209,7 +209,7 @@ attrs instances) instead of unstructured dicts to hold the data: ) :mod:`web_poet.fields` supports it, by allowing to parametrize -:class:`~.ItemPage` with an Item Class: +:class:`~.ItemPage` with an item class: .. code-block:: python @@ -217,13 +217,13 @@ attrs instances) instead of unstructured dicts to hold the data: class ProductPage(ItemPage[Product]): # ... -When :class:`~.ItemPage` is parametrized with an Item Class, +When :class:`~.ItemPage` is parametrized with an item class, its ``to_item()`` method starts to return item instances, instead of ``dict`` instances. In the example above ``ProductPage.to_item`` method returns ``Product`` instances. -Defining an Item Class may be an overkill if you only have a single Page Object, -but Item Classes are of a great help when +Defining an item class may be an overkill if you only have a single Page Object, +but item classes are of a great help when * you need to extract data in the same format from multiple websites, or * if you want to define the schema upfront. @@ -231,7 +231,7 @@ but Item Classes are of a great help when Error prevention ~~~~~~~~~~~~~~~~ -Item Classes play particularly well with the +Item classes play particularly well with the :func:`@field <web_poet.fields.field>` decorator, preventing some of the errors, which may happen if results are plain "dicts". @@ -256,7 +256,7 @@ Consider the following badly written page object: def nane(self): return self.response.css(".name").get() -Because the ``Product`` Item Class is used, a typo ("nane" instead of "name") +Because the ``Product`` item class is used, a typo ("nane" instead of "name") is detected at runtime: the creation of a ``Product`` instance would fail with a ``TypeError``, because of the unexpected keyword argument "nane". @@ -265,7 +265,7 @@ detected: the ``price`` argument is required, but there is no extraction method this attribute, so ``Product.__init__`` will raise another ``TypeError``, indicating that a required argument is missing. -Without an Item Class, none of these errors are detected. +Without an item class, none of these errors are detected. Changing Item Class ~~~~~~~~~~~~~~~~~~~ @@ -335,7 +335,7 @@ to the item: # ... Note how :class:`~.Returns` is used as one of the base classes of -``CustomFooPage``; it allows to change the Item Class returned by a page object. +``CustomFooPage``; it allows to change the item class returned by a page object. Removing fields (as well as renaming) is a bit more tricky. @@ -370,13 +370,13 @@ is passed, and ``name`` is the only field ``CustomItem`` supports. To recap: -* Use ``Returns[NewItemType]`` to change the Item Class in a subclass. +* Use ``Returns[NewItemType]`` to change the item class in a subclass. * Don't use ``skip_nonitem_fields=True`` when your Page Object corresponds to an item exactly, or when you're only adding fields. This is a safe approach, which allows to detect typos in field names, even for optional fields. * Use ``skip_nonitem_fields=True`` when it's possible for the Page Object - to contain more ``@fields`` than defined in the Item Class, e.g. because + to contain more ``@fields`` than defined in the item class, e.g. because Page Object is inherited from some other base Page Object. Caching diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 25c825c5..1831bfa4 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -13,7 +13,7 @@ could declare that for a given set of URL patterns, a specific Page Object must be used instead of another Page Object. The :class:`~.ApplyRule` also supports pointing to the item returned by a specific -Page Object if it both matches the URL pattern and the Item Class specified in the +Page Object if it both matches the URL pattern and the item class specified in the rule. This enables **web-poet** to be used effectively by other frameworks like @@ -56,7 +56,7 @@ Creating Overrides ------------------ To simplify the code examples in the next few subsections, let's assume that -these Item Classes have been predefined: +these item classes have been predefined: .. code-block:: python @@ -105,8 +105,8 @@ Let's take a look at how the following code is structured: The code above declares that: - - The Page Objects return ``Product`` and ``SimilarProduct`` Item Classes. - Returning Item Classes is a preferred approach as explained in the + - The Page Objects return ``Product`` and ``SimilarProduct`` item classes. + Returning item classes is a preferred approach as explained in the :ref:`web-poet-fields` section. - For sites that match the ``some.example`` pattern, ``ExampleProductPage`` would be used instead of ``GenericProductPage``. @@ -138,7 +138,7 @@ Item Class ~~~~~~~~~~ An alternative approach for the Page Object Overrides example above is to specify -the returned Item Class. For example, we could change the previous example into +the returned item class. For example, we could change the previous example into the following: @@ -169,24 +169,24 @@ the following: Let's break this example down: - The URL patterns are exactly the same as with the previous code example. - - The ``@handle_urls`` decorator determines the Item Class to return (i.e. + - The ``@handle_urls`` decorator determines the item class to return (i.e. ``Product``) from the decorated Page Object. - The ``instead_of`` parameter can be omitted in lieu of the derived Item Class from the Page Object which becomes the ``to_return`` attribute in :class:`~.ApplyRule` instances. This means that: - - If a ``Product`` Item Class is requested for URLs matching with the - "some.example" pattern, then the ``Product`` Item Class would come from + - If a ``Product`` item class is requested for URLs matching with the + "some.example" pattern, then the ``Product`` item class would come from the ``to_item()`` method of ``ExampleProductPage``. - Similarly, if a page with a URL matches with "another.example" without - the "/digital-goods/" path, then the ``Product`` Item Class comes from + the "/digital-goods/" path, then the ``Product`` item class comes from the ``AnotherExampleProductPage`` Page Object. - - However, if a ``Product`` Item Class is requested matching with the URL + - However, if a ``Product`` item class is requested matching with the URL pattern of "dual.example/shop/?product=*", a ``SimilarProduct`` - Item Class is returned by the ``DualExampleProductPage``'s ``to_item()`` + item class is returned by the ``DualExampleProductPage``'s ``to_item()`` method instead. -Specifying the Item Class that a Page Object returns makes it possible for +Specifying the item class that a Page Object returns makes it possible for web-poet frameworks to make Page Object usage transparent to end users. For example, a web-poet framework could implement a function like: @@ -196,12 +196,12 @@ For example, a web-poet framework could implement a function like: item = get_item(url, item_class=Product) Here there is no reference to the Page Object being used underneath, you only -need to indicate the desired Item Class, and the web-poet framework +need to indicate the desired item class, and the web-poet framework automatically determines the Page Object to use based on the specified URL and -the specified Item Class. +the specified item class. Note, however, that web-poet frameworks are encouraged to also allow getting a -Page Object instead of an Item Class instance, for scenarios where end users +Page Object instead of an item class instance, for scenarios where end users wish access to Page Object attributes and methods. @@ -211,7 +211,7 @@ Combination ~~~~~~~~~~~ Of course, you can use the combination of both which enables you to specify in -either contexts of Page Objects and Item Classes. +either contexts of Page Objects and item classes. .. code-block:: python diff --git a/tests/po_lib_to_return/__init__.py b/tests/po_lib_to_return/__init__.py index e87a0f18..935e974e 100644 --- a/tests/po_lib_to_return/__init__.py +++ b/tests/po_lib_to_return/__init__.py @@ -97,7 +97,7 @@ def name(self) -> str: @handle_urls("example.com", instead_of=ProductPage) class SimilarProductPage(ProductPage, Returns[ProductSimilar]): """A custom PO inheriting from a base PO returning the same fields but in - a different Item Class. + a different item class. """ expected_instead_of = ProductPage @@ -109,7 +109,7 @@ class SimilarProductPage(ProductPage, Returns[ProductSimilar]): @handle_urls("example.com", instead_of=ProductPage) class MoreProductPage(ProductPage, Returns[ProductMoreFields]): """A custom PO inheriting from a base PO returning more items using a - different Item Class. + different item class. """ expected_instead_of = ProductPage @@ -127,7 +127,7 @@ class LessProductPage( ProductPage, Returns[ProductFewerFields], skip_nonitem_fields=True ): """A custom PO inheriting from a base PO returning less items using a - different Item Class. + different item class. """ expected_instead_of = ProductPage @@ -143,7 +143,7 @@ def brand(self) -> str: @handle_urls("example.com", instead_of=ProductPage, to_return=ProductSimilar) class CustomProductPage(ProductPage, Returns[Product]): """A custom PO inheriting from a base PO returning the same fields but in - a different Item Class. + a different item class. This PO is the same with ``SimilarProductPage`` but passes a ``to_return`` in the ``@handle_urls`` decorator. diff --git a/web_poet/pages.py b/web_poet/pages.py index f54e86ba..77b39af3 100644 --- a/web_poet/pages.py +++ b/web_poet/pages.py @@ -41,7 +41,7 @@ def is_injectable(cls: typing.Any) -> bool: class Returns(typing.Generic[ItemT]): - """Inherit from this generic mixin to change the Item Class used by + """Inherit from this generic mixin to change the item class used by :class:`~.ItemPage`""" @property diff --git a/web_poet/rules.py b/web_poet/rules.py index e47b4366..47564e96 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -40,8 +40,8 @@ class ApplyRule: pattern represented by the ``for_patterns`` attribute is matched. * ``instead_of`` - *(optional)* The Page Object that will be **replaced** with the Page Object specified via the ``use`` parameter. - * ``to_return`` - *(optional)* The Item Class that marks the Page Object - to be **used** which is capable of returning that Item Class. + * ``to_return`` - *(optional)* The item class that marks the Page Object + to be **used** which is capable of returning that item class. * ``meta`` - *(optional)* Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. @@ -74,7 +74,7 @@ class ApplyRule: the Page Object declared in ``use`` should be used instead of ``ReplacedPageObject``. - The ``to_return`` parameter should capture the Item Class that the Page Object + The ``to_return`` parameter should capture the item class that the Page Object is capable of returning. Before passing it to :class:`~.ApplyRule`, the ``to_return`` value is primarily derived from the return class specified from Page Objects that are subclasses of :class:`~.ItemPage` (see this @@ -85,14 +85,14 @@ class ApplyRule: The ``to_return`` parameter is used as a shortcut to directly retrieve the item from the Page Object to be used for a given URL. It works as: - 1. Given a URL and and Item Class that we want, match it respectively + 1. Given a URL and and item class that we want, match it respectively against ``for_patterns`` and ``to_return`` from the registry rules. 2. This could give us a collection of rules. We need to select one based on the highest priority set by `url-matcher`_. 3. When a single rule has been selected, create an instance of the Page Object specified in its ``use`` parameter. 4. Finally, call the ``.to_item()`` method of the Page Object to retrieve - an instance of the Item Class. + an instance of the item class. Using the ``to_return`` parameter basically adds the convenient step #4 above. @@ -200,7 +200,7 @@ def handle_urls( of the `url-matcher <https://url-matcher.readthedocs.io/>`_ package for more information about them. - This decorator is able to derive the Item Class returned by the Page + This decorator is able to derive the item class returned by the Page Object (see :ref:`item-class-example` section for some examples). This is important since it marks what type of item the Page Object is capable of returning for the given URL patterns. For certain advanced cases, you can @@ -216,7 +216,7 @@ def handle_urls( :param include: The URLs that should be handled by the decorated Page Object. :param instead_of: The Page Object that should be `replaced`. - :param to_return: The Item Class holding the data returned by the Page Object. + :param to_return: The item class holding the data returned by the Page Object. This could be omitted as it could be derived from the ``Returns[ItemClass]`` or ``ItemPage[ItemClass]`` declaration of the Page Object. See :ref:`item-classes` section. Code example in :ref:`combination` subsection. From 4755ce977dc95beca2523d62c8ee299cf5464fdb Mon Sep 17 00:00:00 2001 From: Mikhail Korobov <kmike84@gmail.com> Date: Thu, 27 Oct 2022 01:21:47 +0500 Subject: [PATCH 41/46] add an Overview section to the Overrides docs; rename them to Apply Rules --- docs/intro/overrides.rst | 119 ++++++++++++++++++++++++++++++++++----- 1 file changed, 106 insertions(+), 13 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 1831bfa4..8a882780 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -1,23 +1,116 @@ .. _intro-overrides: +Apply Rules +=========== + +Overview +-------- + +@handle_urls +~~~~~~~~~~~~ + +web-poet provides a :func:`~.handle_urls` decorator, which allows to +declare how the page objects can be used (applied): + +* for which websites / URL patterns they work, +* which data type (item classes) they can return, +* which page objects can they replace (override; more on this later). + +.. code-block:: python + + from web_poet import ItemPage, handle_urls + from my_items import MyItem + + @handle_urls("example.com") + class MyPage(ItemPage[MyItem]): + # ... + + +``handle_urls("example.com")`` can serve as a documentation, but it also enables +getting the information about page objects programmatically. +The information about all page objects decorated with +:func:`~.handle_urls` is stored in ``web_poet.default_registry``, which is +an instance of :class:`~.PageObjectRegistry`. In the example above the +following :class:`~.ApplyRule` is added to the registry: + +.. code-block:: + + ApplyRule( + for_patterns=Patterns(include=('example.com',), exclude=(), priority=500), + use=<class 'MyPage'>, + instead_of=None, + to_return=<class 'MyItem'>, + meta={} + ) + +Note how ``rule.to_return`` is set to ``MyItem`` automatically. +This can be used by libraries like `scrapy-poet`_. For example, +if a spider needs to extract ``MyItem`` from some page on the ``example.com`` +website, `scrapy-poet`_ now knows that ``MyPage`` page object can be used. + +.. _scrapy-poet: https://scrapy-poet.readthedocs.io + +Specifying the URL patterns +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~handle_urls` decorator uses url-matcher_ library to define the +URL rules. Some examples: + +.. code-block:: python + + # page object can be applied on any URL from the example.com domain, + # or from any of its subdomains + @handle_urls("example.com") + + # page object can be applied on example.com pages under /products/ path + @handle_urls("example.com/products/") + + # page object can be applied on any URL from example.com, but only if + # it contains "productId=..." in the query string + @handle_urls("example.com?productId=*") + +Please consult with the url-matcher_ documentation for more; it is pretty +flexible. It is possible to exclude patterns, use wildcards, require certain +query parameters to be present and ignore others, etc.; +unlike regexes, this mini-language "understands" the URL structure. + +.. _url-matcher: https://url-matcher.readthedocs.io + Overrides -========= +~~~~~~~~~ + +:func:`~.handle_urls` can be used to declare that a particular Page Object +could (and should) be used *instead of* some other Page Object on +certain URL patterns: + +.. code-block:: python + + from web_poet import ItemPage, handle_urls + from my_items import Product + from my_pages import DefaultProductPage + + @handle_urls("site1.example.com", instead_of=DefaultProductPage) + class Site1ProductPage(ItemPage[Product]): + # ... + + @handle_urls("site2.example.com", instead_of=DefaultProductPage) + class Site2ProductPage(ItemPage[Product]): + # ... -Overrides are rules represented by a list of :class:`~.ApplyRule` which -associates which URL patterns a particular Page Object (see :ref:`Page Objects -introduced here <from-ground-up>`) would be used. The URL matching rules is -handled by another library called `url-matcher <https://url-matcher.readthedocs.io>`_. +This concept is a bit more advanced than the basic ``handle_urls`` usage +("this Page Object can return MyItem on example.com website"). -Using such rules establishes the core concept of Overrides wherein a developer -could declare that for a given set of URL patterns, a specific Page Object must -be used instead of another Page Object. +A common use case is a "generic", or a "template" spider, which uses some +default implementation of the extraction, and allows to replace it +("override") on specific websites or URL patterns. -The :class:`~.ApplyRule` also supports pointing to the item returned by a specific -Page Object if it both matches the URL pattern and the item class specified in the -rule. +This default (``DefaultProductPage`` in the example) can be based on +semantic markup, Machine Learning, heuristics, or just be empty. Page Objects which +can be used instead of the default (``Site1ProductPage``, ``Site2ProductPage``) +are commonly written using XPath or CSS selectors, with website-specific rules. -This enables **web-poet** to be used effectively by other frameworks like -`scrapy-poet <https://scrapy-poet.readthedocs.io>`_. +Libraries like scrapy-poet_ allows to create such "generic" spiders by +using the information declared via ``handle_urls(..., instead_of=...)``. Example Use Case ---------------- From 7240c9e37462d02671bb5f06f7ef9281514de76a Mon Sep 17 00:00:00 2001 From: Mikhail Korobov <kmike84@gmail.com> Date: Thu, 27 Oct 2022 01:22:06 +0500 Subject: [PATCH 42/46] bump Sphinx version --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 5c5d4d38..0ff12812 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,2 +1,2 @@ -Sphinx==5.0.1 +Sphinx==5.3.0 sphinx-rtd-theme==1.0.0 From 85b9b7b5c03bbf5c9adfd82772b9cb5defc85428 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov <kmike84@gmail.com> Date: Thu, 27 Oct 2022 01:43:14 +0500 Subject: [PATCH 43/46] simplify ApplyRule docstring I've removed the incorrect parts, and also removed some of the correct parts :) It seems this kind of documentation belongs somewhere else - it describes how the information stored in ApplyRule is used by other components. --- web_poet/rules.py | 64 +++++++++++------------------------------------ 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index 47564e96..5df962ff 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -40,61 +40,27 @@ class ApplyRule: pattern represented by the ``for_patterns`` attribute is matched. * ``instead_of`` - *(optional)* The Page Object that will be **replaced** with the Page Object specified via the ``use`` parameter. - * ``to_return`` - *(optional)* The item class that marks the Page Object - to be **used** which is capable of returning that item class. + * ``to_return`` - *(optional)* The item class which the **used** + Page Object is capable of returning. * ``meta`` - *(optional)* Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. The main functionality of this class lies in the ``instead_of`` and ``to_return`` parameters. Should both of these be omitted, then :class:`~.ApplyRule` simply tags which URL patterns the given Page Object defined in ``use`` is expected - to be used. It works as: - - 1. Given a URL, match it against the ``for_patterns`` from the registry - rules. - 2. This could give us a collection of rules. We need to select one based - on the highest priority set by `url-matcher`_. - 3. When a single rule has been selected, use the the Page Object specified - in its ``use`` parameter. - - If ``instead_of=None``, this simply means that the Page Object assigned in - the ``use`` parameter will be utilized for all URLs matching the URL pattern - in ``for_patterns``. However, if ``instead_of=ReplacedPageObject``, then it - adds the expectation that the ``ReplacedPageObject`` wouldn't be used for - the given URLs matching ``for_patterns`` since the Page Object in ``use`` - will replace it. It works as: - - 1. Suppose that we have a rule that has ``use=ReplacedPageObject`` which - we want to use against a URL that matches against ``for_patterns``. - 2. Before using it, all of the rules from the registry must be checked if - other rules has ``instead_of=ReplacedPageObject`` and matches the - URL patterns in ``for_patterns``. - 3. If there are, these rules supersedes the original rule from #1. - 4. After selecting one based on the highest priority set by `url-matcher`_, - the Page Object declared in ``use`` should be used instead of - ``ReplacedPageObject``. - - The ``to_return`` parameter should capture the item class that the Page Object - is capable of returning. Before passing it to :class:`~.ApplyRule`, the - ``to_return`` value is primarily derived from the return class specified - from Page Objects that are subclasses of :class:`~.ItemPage` (see this - :ref:`example <item-class-example>`). However, a special case exists when a - Page Object returns a ``dict`` as an item but then the rule should have - ``to_return=None`` and **NOT** ``to_return=dict``. - - The ``to_return`` parameter is used as a shortcut to directly retrieve the - item from the Page Object to be used for a given URL. It works as: - - 1. Given a URL and and item class that we want, match it respectively - against ``for_patterns`` and ``to_return`` from the registry rules. - 2. This could give us a collection of rules. We need to select one based - on the highest priority set by `url-matcher`_. - 3. When a single rule has been selected, create an instance of the Page - Object specified in its ``use`` parameter. - 4. Finally, call the ``.to_item()`` method of the Page Object to retrieve - an instance of the item class. - - Using the ``to_return`` parameter basically adds the convenient step #4 above. + to be used on. + + When ``to_return`` is not None (e.g. ``to_return=MyItem``), + the Page Object in ``use`` is declared as capable of returning a certain + item class (``MyItem``). + + When ``instead_of`` is not None (e.g. ``instead_of=ReplacedPageObject``), + the rule adds an expectation that the ``ReplacedPageObject`` wouldn't + be used for the URLs matching ``for_patterns``, since the Page Object + in ``use`` will replace it. + + If there are multuple rules which match a certain URL, the rule + to apply is picked based on the priorities set in ``for_patterns``. More information regarding its usage in :ref:`intro-overrides`. From ddaed2f444cd58e6389df9d0ae60cdae77ec1d1e Mon Sep 17 00:00:00 2001 From: Mikhail Korobov <kmike84@gmail.com> Date: Thu, 27 Oct 2022 02:08:26 +0500 Subject: [PATCH 44/46] typo fix --- docs/intro/overrides.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index 8a882780..b6239433 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -109,7 +109,7 @@ semantic markup, Machine Learning, heuristics, or just be empty. Page Objects wh can be used instead of the default (``Site1ProductPage``, ``Site2ProductPage``) are commonly written using XPath or CSS selectors, with website-specific rules. -Libraries like scrapy-poet_ allows to create such "generic" spiders by +Libraries like scrapy-poet_ allow to create such "generic" spiders by using the information declared via ``handle_urls(..., instead_of=...)``. Example Use Case From 660a8cd30bf4d2ed6ea011e074d25227140e08fc Mon Sep 17 00:00:00 2001 From: Mikhail Korobov <kmike84@gmail.com> Date: Thu, 27 Oct 2022 10:57:30 +0500 Subject: [PATCH 45/46] Apply suggestions from code review Co-authored-by: Kevin Lloyd Bernal <kevin@zyte.com> --- docs/intro/overrides.rst | 8 ++++---- web_poet/rules.py | 7 ++++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index b6239433..eb742e87 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -30,7 +30,7 @@ declare how the page objects can be used (applied): getting the information about page objects programmatically. The information about all page objects decorated with :func:`~.handle_urls` is stored in ``web_poet.default_registry``, which is -an instance of :class:`~.PageObjectRegistry`. In the example above the +an instance of :class:`~.PageObjectRegistry`. In the example above, the following :class:`~.ApplyRule` is added to the registry: .. code-block:: @@ -39,7 +39,7 @@ following :class:`~.ApplyRule` is added to the registry: for_patterns=Patterns(include=('example.com',), exclude=(), priority=500), use=<class 'MyPage'>, instead_of=None, - to_return=<class 'MyItem'>, + to_return=<class 'my_items.MyItem'>, meta={} ) @@ -98,13 +98,13 @@ certain URL patterns: # ... This concept is a bit more advanced than the basic ``handle_urls`` usage -("this Page Object can return MyItem on example.com website"). +("this Page Object can return ``MyItem`` on example.com website"). A common use case is a "generic", or a "template" spider, which uses some default implementation of the extraction, and allows to replace it ("override") on specific websites or URL patterns. -This default (``DefaultProductPage`` in the example) can be based on +This default page extraction (``DefaultProductPage`` in the example) can be based on semantic markup, Machine Learning, heuristics, or just be empty. Page Objects which can be used instead of the default (``Site1ProductPage``, ``Site2ProductPage``) are commonly written using XPath or CSS selectors, with website-specific rules. diff --git a/web_poet/rules.py b/web_poet/rules.py index 5df962ff..b1fd5fbc 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -41,7 +41,8 @@ class ApplyRule: * ``instead_of`` - *(optional)* The Page Object that will be **replaced** with the Page Object specified via the ``use`` parameter. * ``to_return`` - *(optional)* The item class which the **used** - Page Object is capable of returning. + * ``to_return`` - *(optional)* The item class that the Page Object specified + in ``use`` is capable of returning. * ``meta`` - *(optional)* Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. @@ -52,14 +53,14 @@ class ApplyRule: When ``to_return`` is not None (e.g. ``to_return=MyItem``), the Page Object in ``use`` is declared as capable of returning a certain - item class (``MyItem``). + item class (i.e. ``MyItem``). When ``instead_of`` is not None (e.g. ``instead_of=ReplacedPageObject``), the rule adds an expectation that the ``ReplacedPageObject`` wouldn't be used for the URLs matching ``for_patterns``, since the Page Object in ``use`` will replace it. - If there are multuple rules which match a certain URL, the rule + If there are multiple rules which match a certain URL, the rule to apply is picked based on the priorities set in ``for_patterns``. More information regarding its usage in :ref:`intro-overrides`. From 3e23c514bdc47010f6c274a0d1994fa160a15b6e Mon Sep 17 00:00:00 2001 From: Mikhail Korobov <kmike84@gmail.com> Date: Thu, 27 Oct 2022 11:21:07 +0500 Subject: [PATCH 46/46] mention str -> Patterns conversion --- docs/intro/overrides.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/intro/overrides.rst b/docs/intro/overrides.rst index eb742e87..8431ef95 100644 --- a/docs/intro/overrides.rst +++ b/docs/intro/overrides.rst @@ -69,10 +69,12 @@ URL rules. Some examples: # it contains "productId=..." in the query string @handle_urls("example.com?productId=*") -Please consult with the url-matcher_ documentation for more; it is pretty -flexible. It is possible to exclude patterns, use wildcards, require certain -query parameters to be present and ignore others, etc.; -unlike regexes, this mini-language "understands" the URL structure. +The string passed to :func:`~.handle_urls` is converted to +a :class:`url_matcher.matcher.Patterns` instance. Please consult +with the url-matcher_ documentation to learn more about the possible rules; +it is pretty flexible. You can exclude patterns, use wildcards, +require certain query parameters to be present and ignore others, etc. +Unlike regexes, this mini-language "understands" the URL structure. .. _url-matcher: https://url-matcher.readthedocs.io