From f4d2f408def4270583ac6ee76e7619bfadfba4bc Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 6 Jan 2023 15:10:43 +0800 Subject: [PATCH 01/16] move some registry functionalities from scrapy-poet --- tests/test_rules.py | 98 ++++++++++++++++++++++++++++++++++++++++++--- web_poet/rules.py | 84 ++++++++++++++++++++++++++++++++++---- 2 files changed, 170 insertions(+), 12 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 6092cc2f..066ad94c 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -1,3 +1,5 @@ +import warnings + import attrs import pytest from url_matcher import Patterns @@ -9,8 +11,11 @@ POTopLevelOverriden2, ) from tests.po_lib.a_module import POModule, POModuleOverriden -from tests.po_lib.nested_package import PONestedPkg -from tests.po_lib.nested_package.a_nested_module import PONestedModule +from tests.po_lib.nested_package import PONestedPkg, PONestedPkgOverriden +from tests.po_lib.nested_package.a_nested_module import ( + PONestedModule, + PONestedModuleOverriden, +) from tests.po_lib_sub import POLibSub from tests.po_lib_to_return import ( CustomProductPage, @@ -20,7 +25,10 @@ LessProductPage, MoreProductPage, Product, + ProductFewerFields, + ProductMoreFields, ProductPage, + ProductSeparate, ProductSimilar, SeparateProductPage, SimilarProductPage, @@ -292,7 +300,7 @@ def test_registry_search_overrides_deprecation() -> None: def test_init_rules() -> None: rules = ( ApplyRule( - for_patterns=Patterns(include=["sample.com"]), + for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, ), @@ -305,10 +313,90 @@ def test_init_rules() -> None: assert default_registry.get_rules() != rules +def test_add_rule() -> None: + registry = RulesRegistry() + + # Basic case of adding a rule + rule_1 = ApplyRule( + for_patterns=Patterns(include=["example.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden1, + to_return=Product, + ) + registry.add_rule(rule_1) + assert registry.get_rules() == [rule_1] + + # Adding a second rule should not emit a warning as long as both the URL + # pattern and `.to_return` value is not the same. + rule_2 = ApplyRule( + for_patterns=Patterns(include=["example.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + to_return=ProductSimilar, + ) + with warnings.catch_warnings(record=True) as warnings_emitted: + registry.add_rule(rule_2) + assert not warnings_emitted + assert registry.get_rules() == [rule_1, rule_2] + + # Warnings should be raised for this case since it's the same URL pattern + # and `.to_return` value from one of the past rules. + rule_3 = ApplyRule( + for_patterns=Patterns(include=["example.com"]), + use=POTopLevel1, + instead_of=POTopLevelOverriden2, + to_return=Product, + ) + expected_msg = ( + r"Similar URL patterns .+? were declared earlier that use " + r"to_return=." + ) + with pytest.warns(UserWarning, match=expected_msg): + registry.add_rule(rule_3) + assert registry.get_rules() == [rule_1, rule_2, rule_3] + + +def test_overrides_for() -> None: + # FIXME: Flaky + + assert default_registry.overrides_for("https://example.com") == { + POTopLevelOverriden1: POTopLevel1, + POTopLevelOverriden2: POTopLevel2, + POModuleOverriden: POModule, + PONestedPkgOverriden: PONestedPkg, + PONestedModuleOverriden: PONestedModule, + ProductPage: CustomProductPageNoReturns, + } + + assert default_registry.overrides_for("https://example.org") == { + PONestedModuleOverriden: PONestedModule, + PONestedPkgOverriden: PONestedPkg, + } + + +def test_page_object_for() -> None: + # FIXME: Flaky + + assert default_registry.page_object_for("https://example.com") == { + ProductSimilar: SimilarProductPage, + Product: CustomProductPageDataTypeOnly, + ProductSeparate: SeparateProductPage, + ProductFewerFields: LessProductPage, + ProductMoreFields: MoreProductPage, + } + + assert not default_registry.page_object_for("https://example.org") + + +# TODO: test for updating the hash + +# TODO: Add the other new tests from scrapy-poet + + def test_from_override_rules_deprecation_using_ApplyRule() -> None: rules = [ ApplyRule( - for_patterns=Patterns(include=["sample.com"]), + for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, ) @@ -325,7 +413,7 @@ def test_from_override_rules_deprecation_using_ApplyRule() -> None: def test_from_override_rules_deprecation_using_OverrideRule() -> None: rules = [ OverrideRule( - for_patterns=Patterns(include=["sample.com"]), + for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, ) diff --git a/web_poet/rules.py b/web_poet/rules.py index 7fe42088..41aed8d5 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -4,14 +4,15 @@ import importlib.util import pkgutil import warnings -from collections import deque +from collections import defaultdict, deque from operator import attrgetter -from typing import Any, Dict, Iterable, List, Optional, Type, TypeVar, Union +from typing import Any, Dict, Iterable, List, Mapping, Optional, Type, TypeVar, Union import attrs -from url_matcher import Patterns +from url_matcher import Patterns, URLMatcher from web_poet._typing import get_item_cls +from web_poet.page_inputs.url import _Url from web_poet.pages import ItemPage from web_poet.utils import _create_deprecated_class, as_list, str_to_pattern @@ -115,9 +116,52 @@ class ExampleComProductPage(WebPage[Product]): """ def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): - self._rules: List[ApplyRule] = [] + self._rules: Dict[int, ApplyRule] = {} + self.overrides_matcher: Dict[Type[ItemPage], URLMatcher] = defaultdict( + URLMatcher + ) + self.item_matcher: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) + if rules is not None: - self._rules = list(rules) + for rule in rules: + self.add_rule(rule) + + def add_rule(self, rule: ApplyRule) -> None: + """Registers an :class:`web_poet.rules.ApplyRule` instance.""" + rule_id = hash(rule) + + # A common case when a page object subclasses another one with the same + # URL pattern. + matched = self.item_matcher.get(rule.to_return) + if matched: + pattern_dupes = [ + pattern + for pattern in matched.patterns.values() + if pattern == rule.for_patterns + ] + if pattern_dupes: + rules = [ + r + for p in pattern_dupes + for r in self.search(for_patterns=p, to_return=rule.to_return) + ] + warnings.warn( + f"Similar URL patterns {pattern_dupes} were declared earlier " + f"that use to_return={rule.to_return}. The first, highest-priority " + f"rule added to SCRAPY_POET_REGISTRY will be used when matching " + f"against URLs. Consider updating the priority of these rules: " + f"{rules}." + ) + + if rule.instead_of: + self.overrides_matcher[rule.instead_of].add_or_update( + rule_id, rule.for_patterns + ) + if rule.to_return: + self.item_matcher[rule.to_return].add_or_update(rule_id, rule.for_patterns) + + # TODO: test removing the rule + self._rules[rule_id] = rule @classmethod def from_override_rules( @@ -199,7 +243,7 @@ def wrapper(cls): to_return=to_return or get_item_cls(cls), meta=kwargs, ) - self._rules.append(rule) + self.add_rule(rule) return cls return wrapper @@ -214,7 +258,7 @@ def get_rules(self) -> List[ApplyRule]: beforehand to recursively import all submodules which contains the ``@handle_urls`` decorators from external Page Objects. """ - return self._rules[:] + return list(self._rules.values()) def get_overrides(self) -> List[ApplyRule]: """Deprecated, use :meth:`~.RulesRegistry.get_rules` instead.""" @@ -259,6 +303,32 @@ def search_overrides(self, **kwargs) -> List[ApplyRule]: warnings.warn(msg, DeprecationWarning, stacklevel=2) return self.search(**kwargs) + def _rules_for_url( + self, url: Union[_Url, str], url_matcher: Dict[Any, URLMatcher] + ) -> Mapping[Type, Type[ItemPage]]: + result: Dict[Type, Type[ItemPage]] = {} + + for target, matcher in url_matcher.items(): + rule_id = matcher.match(url) + if rule_id is not None: + result[target] = self._rules[rule_id].use + + return result + + def overrides_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage]]: + """Finds all of the page objects associated with the given URL and + returns a Mapping where the 'key' represents the page object that is + **overridden** by the page object in 'value'. + """ + return self._rules_for_url(url, self.overrides_matcher) + + def page_object_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage]]: + """Finds all of the page objects associated with the given URL and + returns a Mapping where the 'key' represents the item class that is + **produced** by the page object in 'value'. + """ + return self._rules_for_url(url, self.item_matcher) + def _walk_module(module: str) -> Iterable: """Return all modules from a module recursively. From f2f7177c81b1e9bfdc86de01289755d062d49ede Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 6 Jan 2023 16:37:42 +0800 Subject: [PATCH 02/16] fix non-deterministic matching when URL patterns are tied --- tests/test_rules.py | 6 +++--- web_poet/rules.py | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 066ad94c..df2ae7d7 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -357,7 +357,7 @@ def test_add_rule() -> None: def test_overrides_for() -> None: - # FIXME: Flaky + # TODO: Test with RequestUrl/ResponseUrl as well assert default_registry.overrides_for("https://example.com") == { POTopLevelOverriden1: POTopLevel1, @@ -375,10 +375,10 @@ def test_overrides_for() -> None: def test_page_object_for() -> None: - # FIXME: Flaky + # TODO: Test with RequestUrl/ResponseUrl as well assert default_registry.page_object_for("https://example.com") == { - ProductSimilar: SimilarProductPage, + ProductSimilar: CustomProductPageNoReturns, Product: CustomProductPageDataTypeOnly, ProductSeparate: SeparateProductPage, ProductFewerFields: LessProductPage, diff --git a/web_poet/rules.py b/web_poet/rules.py index 41aed8d5..4153522c 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -122,14 +122,23 @@ def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): ) self.item_matcher: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) + # Ensures that URLMatcher is deterministic in returning a rule when + # matching. Currently, `URLMatcher._sort_domain` has this sorting + # criteria: + # * Priority (descending) + # * Sorted list of includes for this domain (descending) + # * Rule identifier (descending) + # This means that if the priority and domain are the same, the last tie + # breaker woulud be the "Rule identifier", this means we can base it on + # when a given rule was added to the registry, i.e. a counter. + self._rule_counter = 0 + if rules is not None: for rule in rules: self.add_rule(rule) def add_rule(self, rule: ApplyRule) -> None: """Registers an :class:`web_poet.rules.ApplyRule` instance.""" - rule_id = hash(rule) - # A common case when a page object subclasses another one with the same # URL pattern. matched = self.item_matcher.get(rule.to_return) @@ -153,6 +162,9 @@ def add_rule(self, rule: ApplyRule) -> None: f"{rules}." ) + self._rule_counter += 1 + rule_id = self._rule_counter + if rule.instead_of: self.overrides_matcher[rule.instead_of].add_or_update( rule_id, rule.for_patterns From 3891a04fd8e8c579775b82eefed5a076e7fb094c Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 6 Jan 2023 16:46:02 +0800 Subject: [PATCH 03/16] ensure RequestUrl/ResponseUrl works --- tests/test_rules.py | 56 ++++++++++++++++++++------------------------- web_poet/rules.py | 2 ++ 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index df2ae7d7..5ec76ea5 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -42,6 +42,7 @@ default_registry, handle_urls, ) +from web_poet.page_inputs.url import RequestUrl, ResponseUrl POS = { CustomProductPage, @@ -357,40 +358,33 @@ def test_add_rule() -> None: def test_overrides_for() -> None: - # TODO: Test with RequestUrl/ResponseUrl as well - - assert default_registry.overrides_for("https://example.com") == { - POTopLevelOverriden1: POTopLevel1, - POTopLevelOverriden2: POTopLevel2, - POModuleOverriden: POModule, - PONestedPkgOverriden: PONestedPkg, - PONestedModuleOverriden: PONestedModule, - ProductPage: CustomProductPageNoReturns, - } - - assert default_registry.overrides_for("https://example.org") == { - PONestedModuleOverriden: PONestedModule, - PONestedPkgOverriden: PONestedPkg, - } + for cls in [str, RequestUrl, ResponseUrl]: + assert default_registry.overrides_for(cls("https://example.com")) == { + POTopLevelOverriden1: POTopLevel1, + POTopLevelOverriden2: POTopLevel2, + POModuleOverriden: POModule, + PONestedPkgOverriden: PONestedPkg, + PONestedModuleOverriden: PONestedModule, + ProductPage: CustomProductPageNoReturns, + } + + assert default_registry.overrides_for(cls("https://example.org")) == { + PONestedModuleOverriden: PONestedModule, + PONestedPkgOverriden: PONestedPkg, + } def test_page_object_for() -> None: - # TODO: Test with RequestUrl/ResponseUrl as well - - assert default_registry.page_object_for("https://example.com") == { - ProductSimilar: CustomProductPageNoReturns, - Product: CustomProductPageDataTypeOnly, - ProductSeparate: SeparateProductPage, - ProductFewerFields: LessProductPage, - ProductMoreFields: MoreProductPage, - } - - assert not default_registry.page_object_for("https://example.org") - - -# TODO: test for updating the hash - -# TODO: Add the other new tests from scrapy-poet + for cls in [str, RequestUrl, ResponseUrl]: + assert default_registry.page_object_for(cls("https://example.com")) == { + ProductSimilar: CustomProductPageNoReturns, + Product: CustomProductPageDataTypeOnly, + ProductSeparate: SeparateProductPage, + ProductFewerFields: LessProductPage, + ProductMoreFields: MoreProductPage, + } + + assert not default_registry.page_object_for(cls("https://example.org")) def test_from_override_rules_deprecation_using_ApplyRule() -> None: diff --git a/web_poet/rules.py b/web_poet/rules.py index 4153522c..3c4af98a 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -320,6 +320,8 @@ def _rules_for_url( ) -> Mapping[Type, Type[ItemPage]]: result: Dict[Type, Type[ItemPage]] = {} + url = str(url) + for target, matcher in url_matcher.items(): rule_id = matcher.match(url) if rule_id is not None: From 513235de81052b7cbc7339f6cca6e1e3d5a85204 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 6 Jan 2023 16:52:14 +0800 Subject: [PATCH 04/16] rename variables for clarity --- web_poet/rules.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index 3c4af98a..9fd1f08b 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -117,10 +117,10 @@ class ExampleComProductPage(WebPage[Product]): def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): self._rules: Dict[int, ApplyRule] = {} - self.overrides_matcher: Dict[Type[ItemPage], URLMatcher] = defaultdict( + self._overrides_matchers: Dict[Type[ItemPage], URLMatcher] = defaultdict( URLMatcher ) - self.item_matcher: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) + self._item_matchers: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) # Ensures that URLMatcher is deterministic in returning a rule when # matching. Currently, `URLMatcher._sort_domain` has this sorting @@ -141,7 +141,7 @@ def add_rule(self, rule: ApplyRule) -> None: """Registers an :class:`web_poet.rules.ApplyRule` instance.""" # A common case when a page object subclasses another one with the same # URL pattern. - matched = self.item_matcher.get(rule.to_return) + matched = self._item_matchers.get(rule.to_return) if matched: pattern_dupes = [ pattern @@ -166,11 +166,13 @@ def add_rule(self, rule: ApplyRule) -> None: rule_id = self._rule_counter if rule.instead_of: - self.overrides_matcher[rule.instead_of].add_or_update( + self._overrides_matchers[rule.instead_of].add_or_update( rule_id, rule.for_patterns ) if rule.to_return: - self.item_matcher[rule.to_return].add_or_update(rule_id, rule.for_patterns) + self._item_matchers[rule.to_return].add_or_update( + rule_id, rule.for_patterns + ) # TODO: test removing the rule self._rules[rule_id] = rule @@ -316,13 +318,13 @@ def search_overrides(self, **kwargs) -> List[ApplyRule]: return self.search(**kwargs) def _rules_for_url( - self, url: Union[_Url, str], url_matcher: Dict[Any, URLMatcher] + self, url: Union[_Url, str], matchers: Dict[Any, URLMatcher] ) -> Mapping[Type, Type[ItemPage]]: result: Dict[Type, Type[ItemPage]] = {} url = str(url) - for target, matcher in url_matcher.items(): + for target, matcher in matchers.items(): rule_id = matcher.match(url) if rule_id is not None: result[target] = self._rules[rule_id].use @@ -334,14 +336,14 @@ def overrides_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage]]: returns a Mapping where the 'key' represents the page object that is **overridden** by the page object in 'value'. """ - return self._rules_for_url(url, self.overrides_matcher) + return self._rules_for_url(url, self._overrides_matchers) def page_object_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage]]: """Finds all of the page objects associated with the given URL and returns a Mapping where the 'key' represents the item class that is **produced** by the page object in 'value'. """ - return self._rules_for_url(url, self.item_matcher) + return self._rules_for_url(url, self._item_matchers) def _walk_module(module: str) -> Iterable: From c53249cfb8372dd520e26cb5b40f72dfa2aa0080 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 9 Jan 2023 16:54:56 +0800 Subject: [PATCH 05/16] fix typing and comments --- web_poet/rules.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index 9fd1f08b..b2348538 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -123,14 +123,14 @@ def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): self._item_matchers: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) # Ensures that URLMatcher is deterministic in returning a rule when - # matching. Currently, `URLMatcher._sort_domain` has this sorting - # criteria: + # matching. As of url_macher==0.2.0, `url_matcher.URLMatcher._sort_domain` + # has this sorting criteria: # * Priority (descending) # * Sorted list of includes for this domain (descending) # * Rule identifier (descending) # This means that if the priority and domain are the same, the last tie - # breaker woulud be the "Rule identifier", this means we can base it on - # when a given rule was added to the registry, i.e. a counter. + # breaker would be the "Rule identifier", this means we can base it on + # the order of rule addition to the registry, i.e. a counter. self._rule_counter = 0 if rules is not None: @@ -331,7 +331,9 @@ def _rules_for_url( return result - def overrides_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage]]: + def overrides_for( + self, url: Union[_Url, str] + ) -> Mapping[Type[ItemPage], Type[ItemPage]]: """Finds all of the page objects associated with the given URL and returns a Mapping where the 'key' represents the page object that is **overridden** by the page object in 'value'. From a59bd21ab56c3dc16c2987af4cddab3e766d096f Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 9 Jan 2023 17:02:54 +0800 Subject: [PATCH 06/16] create page_object_for_item() registry method --- tests/test_rules.py | 22 ++++++++++++++++++++++ web_poet/rules.py | 17 +++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tests/test_rules.py b/tests/test_rules.py index 5ec76ea5..b2ee181a 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -387,6 +387,28 @@ def test_page_object_for() -> None: assert not default_registry.page_object_for(cls("https://example.org")) +def test_page_object_for_item() -> None: + # This is not associated with any rule. + class FakeItem: + pass + + method = default_registry.page_object_for_item + + for cls in [str, RequestUrl, ResponseUrl]: + url = cls("https://example.com") + assert method(url, ProductSimilar) == CustomProductPageNoReturns + assert method(url, Product) == CustomProductPageDataTypeOnly + assert method(url, ProductSeparate) == SeparateProductPage + assert method(url, ProductFewerFields) == LessProductPage + assert method(url, ProductMoreFields) == MoreProductPage + + # When there's no rule specifying to return this FakeItem + assert method(url, FakeItem) is None + + # When the URL itself doesn't have any ``to_return`` in any of its rules + assert method(cls("https://example.org"), FakeItem) is None + + def test_from_override_rules_deprecation_using_ApplyRule() -> None: rules = [ ApplyRule( diff --git a/web_poet/rules.py b/web_poet/rules.py index b2348538..a0e52370 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -347,6 +347,23 @@ def page_object_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage] """ return self._rules_for_url(url, self._item_matchers) + def page_object_for_item( + self, url: Union[_Url, str], item_cls: Type + ) -> Optional[Type]: + """Return the page object class associated with the given URL that's able + to produce the given ``item_cls``. + """ + + url = str(url) + + matcher = self._item_matchers.get(item_cls) + if matcher: + rule_id = matcher.match(url) + if rule_id is not None: + return self._rules[rule_id].use + + return None + def _walk_module(module: str) -> Iterable: """Return all modules from a module recursively. From a5e8480f9d20bf65f6ecc8b0d6e56e666abb1223 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 9 Jan 2023 17:30:12 +0800 Subject: [PATCH 07/16] drop page_object_for() registry method https://github.com/scrapinghub/web-poet/pull/112#discussion_r1063367500 --- tests/test_rules.py | 13 ------------- web_poet/rules.py | 7 ------- 2 files changed, 20 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index b2ee181a..1081c373 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -374,19 +374,6 @@ def test_overrides_for() -> None: } -def test_page_object_for() -> None: - for cls in [str, RequestUrl, ResponseUrl]: - assert default_registry.page_object_for(cls("https://example.com")) == { - ProductSimilar: CustomProductPageNoReturns, - Product: CustomProductPageDataTypeOnly, - ProductSeparate: SeparateProductPage, - ProductFewerFields: LessProductPage, - ProductMoreFields: MoreProductPage, - } - - assert not default_registry.page_object_for(cls("https://example.org")) - - def test_page_object_for_item() -> None: # This is not associated with any rule. class FakeItem: diff --git a/web_poet/rules.py b/web_poet/rules.py index a0e52370..4373ae8a 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -340,13 +340,6 @@ def overrides_for( """ return self._rules_for_url(url, self._overrides_matchers) - def page_object_for(self, url: Union[_Url, str]) -> Mapping[Type, Type[ItemPage]]: - """Finds all of the page objects associated with the given URL and - returns a Mapping where the 'key' represents the item class that is - **produced** by the page object in 'value'. - """ - return self._rules_for_url(url, self._item_matchers) - def page_object_for_item( self, url: Union[_Url, str], item_cls: Type ) -> Optional[Type]: From ef520fac1bae02b47ee829578c2f66005fc8583b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 9 Jan 2023 18:19:18 +0800 Subject: [PATCH 08/16] create docs for overrides_for() and page_object_for_item() --- CHANGELOG.rst | 9 +++++++ docs/page-objects/rules.rst | 52 +++++++++++++++++++++++++++++++++++++ web_poet/rules.py | 4 +++ 3 files changed, 65 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d39b3ac1..49375b7f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,15 @@ Changelog ========= +TBR +--- + +* Introduce new methods for :class:`web_poet.rules.RulesRegistry`: + + * :meth:`web_poet.rules.RulesRegistry.add_rule` + * :meth:`web_poet.rules.RulesRegistry.overrides_for` + * :meth:`web_poet.rules.RulesRegistry.page_object_for_item` + 0.6.0 (2022-11-08) ------------------ diff --git a/docs/page-objects/rules.rst b/docs/page-objects/rules.rst index d19783ed..c9572143 100644 --- a/docs/page-objects/rules.rst +++ b/docs/page-objects/rules.rst @@ -390,6 +390,58 @@ in the :class:`~.ApplyRule` to be written into ``web_poet.default_registry``. The next section explores this caveat further. +Using URLs against the registered rules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the important aspects of :class:`~.ApplyRule` is dictating which URLs it's +able to work using its ``for_patterns`` attribute. There are a few methods +available in :class:`~.RulesRegistry` which accepts a URL value (:class:`str`, +:class:`~.RequestUrl`, or :class:`~.ResponseUrl`) to find specific information +from the registered rules. + +.. _rules-overrides_for-example: + +Find the page object overrides +"""""""""""""""""""""""""""""" + +Suppose you want to see what are the :ref:`rules-intro-overrides` that are +available from a given webpage, you can use :meth:`~.RulesRegistry.overrides_for` +by passing the webpage URL. For example: + +.. code-block:: python + + from web_poet import default_registry + + overrides = default_registry.overrides_for("http://books.toscrape.com/") + print(overrides) + + # { + # : , + # : , + # } + +It returns a :class:`Mapping` where the *key* represents the page object class +that is overridden or replaced by the page object class in the *value*. + +.. _rules-page_object_for_item-example: + +Identify the page object that could create the item +""""""""""""""""""""""""""""""""""""""""""""""""""" + +Suppose you want to retrieve the page object class that is able to create the +item class that you want from a given webpage, you can use +:meth:`~.RulesRegistry.page_object_for_item`. For example: + +.. code-block:: python + + from web_poet import default_registry + + item_cls = default_registry.page_object_for_item( + "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html", + Book + ) + print(item_cls) # BookPage + Using rules from External Packages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/web_poet/rules.py b/web_poet/rules.py index 4373ae8a..1d167350 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -337,6 +337,8 @@ def overrides_for( """Finds all of the page objects associated with the given URL and returns a Mapping where the 'key' represents the page object that is **overridden** by the page object in 'value'. + + See example: :ref:`rules-overrides_for-example`. """ return self._rules_for_url(url, self._overrides_matchers) @@ -345,6 +347,8 @@ def page_object_for_item( ) -> Optional[Type]: """Return the page object class associated with the given URL that's able to produce the given ``item_cls``. + + See example: :ref:`rules-page_object_for_item-example`. """ url = str(url) From 37fb650147dd2d7e751f8644592a7794055b14ef Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 9 Jan 2023 21:00:36 +0800 Subject: [PATCH 09/16] fix UserWarning for similar rules --- tests/test_rules.py | 13 +++++++------ web_poet/rules.py | 9 ++++----- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 1081c373..fbcae98e 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -163,7 +163,7 @@ def test_apply_rule_kwargs_only() -> None: ApplyRule( "example.com", *[params[r] for r in remove], - **{k: v for k, v in params.items() if k not in remove} # type: ignore[arg-type] + **{k: v for k, v in params.items() if k not in remove}, # type: ignore[arg-type] ) @@ -348,12 +348,13 @@ def test_add_rule() -> None: instead_of=POTopLevelOverriden2, to_return=Product, ) - expected_msg = ( - r"Similar URL patterns .+? were declared earlier that use " - r"to_return=." - ) - with pytest.warns(UserWarning, match=expected_msg): + # Since we're using f-strings to compare the warning emitted, don't use + # ``pytest.warns()`` here since it treats the msg as regex which translates + # the "(" and ")" characters differently from the expected message. + with warnings.catch_warnings(record=True) as warnings_emitted: registry.add_rule(rule_3) + expected_msg = f"Consider updating the priority of these rules: {[rule_1, rule_3]}." + assert any([True for w in warnings_emitted if expected_msg in str(w.message)]) assert registry.get_rules() == [rule_1, rule_2, rule_3] diff --git a/web_poet/rules.py b/web_poet/rules.py index 1d167350..4e62eec0 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -149,17 +149,17 @@ def add_rule(self, rule: ApplyRule) -> None: if pattern == rule.for_patterns ] if pattern_dupes: - rules = [ + rules_to_warn = [ r for p in pattern_dupes for r in self.search(for_patterns=p, to_return=rule.to_return) - ] + ] + [rule] warnings.warn( f"Similar URL patterns {pattern_dupes} were declared earlier " f"that use to_return={rule.to_return}. The first, highest-priority " - f"rule added to SCRAPY_POET_REGISTRY will be used when matching " + f"rule added to the registry will be used when matching " f"against URLs. Consider updating the priority of these rules: " - f"{rules}." + f"{rules_to_warn}." ) self._rule_counter += 1 @@ -174,7 +174,6 @@ def add_rule(self, rule: ApplyRule) -> None: rule_id, rule.for_patterns ) - # TODO: test removing the rule self._rules[rule_id] = rule @classmethod From 299f1652521a8e8da956133250cc2501dc552918 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 10 Jan 2023 16:04:07 +0800 Subject: [PATCH 10/16] refactor overrides_for() and page_object_for_item() --- web_poet/rules.py | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index 4e62eec0..11b84fb0 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -316,19 +316,16 @@ def search_overrides(self, **kwargs) -> List[ApplyRule]: warnings.warn(msg, DeprecationWarning, stacklevel=2) return self.search(**kwargs) - def _rules_for_url( - self, url: Union[_Url, str], matchers: Dict[Any, URLMatcher] - ) -> Mapping[Type, Type[ItemPage]]: - result: Dict[Type, Type[ItemPage]] = {} + def _match_url_for_po( + self, url: Union[_Url, str], matcher: Optional[URLMatcher] = None + ) -> Optional[Type[ItemPage]]: + """Returns the page object to use based on the URL and URLMatcher.""" + if not url or matcher is None: + return None - url = str(url) - - for target, matcher in matchers.items(): - rule_id = matcher.match(url) - if rule_id is not None: - result[target] = self._rules[rule_id].use - - return result + rule_id = matcher.match(str(url)) + if rule_id is not None: + return self._rules[rule_id].use def overrides_for( self, url: Union[_Url, str] @@ -339,7 +336,12 @@ def overrides_for( See example: :ref:`rules-overrides_for-example`. """ - return self._rules_for_url(url, self._overrides_matchers) + result: Dict[Type[ItemPage], Type[ItemPage]] = {} + for target, matcher in self._overrides_matchers.items(): + po = self._match_url_for_po(url, matcher) + if po: + result[target] = po + return result def page_object_for_item( self, url: Union[_Url, str], item_cls: Type @@ -349,16 +351,8 @@ def page_object_for_item( See example: :ref:`rules-page_object_for_item-example`. """ - - url = str(url) - matcher = self._item_matchers.get(item_cls) - if matcher: - rule_id = matcher.match(url) - if rule_id is not None: - return self._rules[rule_id].use - - return None + return self._match_url_for_po(url, matcher) def _walk_module(module: str) -> Iterable: From 05a832b82beae4242a3facc1920992cee91e00d2 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 10 Jan 2023 21:10:43 +0800 Subject: [PATCH 11/16] optimize RulesRegistry.search() --- CHANGELOG.rst | 6 ++++++ tests/test_rules.py | 12 ++++++------ web_poet/rules.py | 38 +++++++++++++++++++++++++++++++------- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 49375b7f..b343f8ae 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,12 @@ TBR * :meth:`web_poet.rules.RulesRegistry.overrides_for` * :meth:`web_poet.rules.RulesRegistry.page_object_for_item` +* Improved the performance of :meth:`web_poet.rules.RulesRegistry.search` where + passing a single parameter of either ``instead_of`` or ``to_return`` results + in *O(1)* look-up time instead of *O(N)*. Additionally, having either + ``instead_of`` or ``to_return`` present in multi-parameter search calls would + filter the initial candidate results resulting in a faster search. + 0.6.0 (2022-11-08) ------------------ diff --git a/tests/test_rules.py b/tests/test_rules.py index fbcae98e..3a9d0ca3 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -257,18 +257,18 @@ def test_registry_search() -> None: # param: to_return rules = default_registry.search(to_return=Product) assert rules == [ - ApplyRule("example.com", use=ProductPage, to_return=Product), ApplyRule( "example.com", - use=ImprovedProductPage, - instead_of=ProductPage, + # mypy complains here since it's expecting a container class when + # declared, i.e, ``ItemPage[SomeItem]`` + use=CustomProductPageDataTypeOnly, # type: ignore[arg-type] to_return=Product, ), + ApplyRule("example.com", use=ProductPage, to_return=Product), ApplyRule( "example.com", - # mypy complains here since it's expecting a container class when - # declared, i.e, ``ItemPage[SomeItem]`` - use=CustomProductPageDataTypeOnly, # type: ignore[arg-type] + use=ImprovedProductPage, + instead_of=ProductPage, to_return=Product, ), ] diff --git a/web_poet/rules.py b/web_poet/rules.py index 11b84fb0..53458324 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -120,7 +120,7 @@ def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): self._overrides_matchers: Dict[Type[ItemPage], URLMatcher] = defaultdict( URLMatcher ) - self._item_matchers: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) + self._item_matchers: Dict[Type, URLMatcher] = defaultdict(URLMatcher) # Ensures that URLMatcher is deterministic in returning a rule when # matching. As of url_macher==0.2.0, `url_matcher.URLMatcher._sort_domain` @@ -139,10 +139,12 @@ def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): def add_rule(self, rule: ApplyRule) -> None: """Registers an :class:`web_poet.rules.ApplyRule` instance.""" - # A common case when a page object subclasses another one with the same - # URL pattern. - matched = self._item_matchers.get(rule.to_return) + + # Ignore the type since ``ApplyRule`` could have an empty ``.to_return`` + matched = self._item_matchers.get(rule.to_return) # type: ignore[arg-type] if matched: + # A common case when a page object subclasses another one with the + # same URL pattern. pattern_dupes = [ pattern for pattern in matched.patterns.values() @@ -293,10 +295,32 @@ def search(self, **kwargs) -> List[ApplyRule]: print(rules[0].instead_of) # GenericPO """ + rule_ids = set() + + # Ignore the types since we're using None value as a quick look-up. + + matcher = self._item_matchers.get(kwargs.get("to_return")) # type: ignore[arg-type] + if matcher: + rule_ids.update(matcher.patterns.keys()) + + matcher = self._overrides_matchers.get(kwargs.get("instead_of")) # type: ignore[arg-type] + if matcher: + if rule_ids: + # If both params are used then narrow down the rules. + rule_ids.union(matcher.patterns.keys()) + else: + rule_ids.update(matcher.patterns.keys()) + + rules = [self._rules[id_] for id_ in rule_ids] + + if len(kwargs) == 1 and rules: + return rules + + # Search other parameters as well getter = attrgetter(*kwargs.keys()) - def matcher(rule: ApplyRule): + def finder(rule: ApplyRule): attribs = getter(rule) if not isinstance(attribs, tuple): attribs = (attribs,) @@ -305,8 +329,8 @@ def matcher(rule: ApplyRule): return False results = [] - for rule in self.get_rules(): - if matcher(rule): + for rule in rules or self.get_rules(): + if finder(rule): results.append(rule) return results From dc6181024758aa0c1038a70b03a6f529f479c327 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 13 Jan 2023 15:09:52 +0800 Subject: [PATCH 12/16] further optimize RulesRegistry.search() --- tests/test_rules.py | 41 +++++++++++++++++++++++----- web_poet/rules.py | 65 ++++++++++++++++++++++++--------------------- 2 files changed, 69 insertions(+), 37 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 3a9d0ca3..0e6ca96f 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -254,31 +254,57 @@ def test_registry_search() -> None: assert len(rules) == 1 assert rules[0].instead_of == POTopLevelOverriden2 + rules = default_registry.search(instead_of=None) + for rule in rules: + assert rule.instead_of is None + # param: to_return rules = default_registry.search(to_return=Product) assert rules == [ + ApplyRule("example.com", use=ProductPage, to_return=Product), ApplyRule( "example.com", - # mypy complains here since it's expecting a container class when - # declared, i.e, ``ItemPage[SomeItem]`` - use=CustomProductPageDataTypeOnly, # type: ignore[arg-type] + use=ImprovedProductPage, + instead_of=ProductPage, to_return=Product, ), - ApplyRule("example.com", use=ProductPage, to_return=Product), ApplyRule( "example.com", - use=ImprovedProductPage, - instead_of=ProductPage, + # mypy complains here since it's expecting a container class when + # declared, i.e, ``ItemPage[SomeItem]`` + use=CustomProductPageDataTypeOnly, # type: ignore[arg-type] to_return=Product, ), ] + rules = default_registry.search(to_return=None) + for rule in rules: + assert rule.to_return is None + # params: to_return and use rules = default_registry.search(to_return=Product, use=ImprovedProductPage) assert len(rules) == 1 assert rules[0].to_return == Product assert rules[0].use == ImprovedProductPage + # params: to_return and instead_of + rules = default_registry.search(to_return=Product, instead_of=None) + assert len(rules) == 2 + assert rules[0].to_return == Product + assert rules[0].instead_of is None + assert rules[1].to_return == Product + assert rules[1].instead_of is None + + rules = default_registry.search(to_return=None, instead_of=ProductPage) + for rule in rules: + assert rule.to_return is None + assert rule.instead_of is None + + rules = default_registry.search(to_return=None, instead_of=None) + assert len(rules) == 1 + assert rules[0].to_return is None + assert rules[0].instead_of is None + # Such rules doesn't exist rules = default_registry.search(use=POModuleOverriden) assert len(rules) == 0 @@ -390,6 +416,9 @@ class FakeItem: assert method(url, ProductFewerFields) == LessProductPage assert method(url, ProductMoreFields) == MoreProductPage + # Type is ignored since item_cls shouldn't be None + assert method(url, None) is None # type: ignore[arg-type] + # When there's no rule specifying to return this FakeItem assert method(url, FakeItem) is None diff --git a/web_poet/rules.py b/web_poet/rules.py index 53458324..caa85062 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -117,10 +117,10 @@ class ExampleComProductPage(WebPage[Product]): def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): self._rules: Dict[int, ApplyRule] = {} - self._overrides_matchers: Dict[Type[ItemPage], URLMatcher] = defaultdict( - URLMatcher - ) - self._item_matchers: Dict[Type, URLMatcher] = defaultdict(URLMatcher) + self._overrides_matchers: Dict[ + Optional[Type[ItemPage]], URLMatcher + ] = defaultdict(URLMatcher) + self._item_matchers: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) # Ensures that URLMatcher is deterministic in returning a rule when # matching. As of url_macher==0.2.0, `url_matcher.URLMatcher._sort_domain` @@ -167,14 +167,10 @@ def add_rule(self, rule: ApplyRule) -> None: self._rule_counter += 1 rule_id = self._rule_counter - if rule.instead_of: - self._overrides_matchers[rule.instead_of].add_or_update( - rule_id, rule.for_patterns - ) - if rule.to_return: - self._item_matchers[rule.to_return].add_or_update( - rule_id, rule.for_patterns - ) + self._overrides_matchers[rule.instead_of].add_or_update( + rule_id, rule.for_patterns + ) + self._item_matchers[rule.to_return].add_or_update(rule_id, rule.for_patterns) self._rules[rule_id] = rule @@ -295,25 +291,28 @@ def search(self, **kwargs) -> List[ApplyRule]: print(rules[0].instead_of) # GenericPO """ - rule_ids = set() - - # Ignore the types since we're using None value as a quick look-up. - - matcher = self._item_matchers.get(kwargs.get("to_return")) # type: ignore[arg-type] - if matcher: - rule_ids.update(matcher.patterns.keys()) - - matcher = self._overrides_matchers.get(kwargs.get("instead_of")) # type: ignore[arg-type] - if matcher: - if rule_ids: - # If both params are used then narrow down the rules. - rule_ids.union(matcher.patterns.keys()) - else: - rule_ids.update(matcher.patterns.keys()) + # Use a dict instead of set() to preserve the order. + rule_ids = {} + + if "to_return" in kwargs: + matcher = self._item_matchers.get(kwargs["to_return"]) + if matcher: + rule_ids.update(matcher.patterns) + + if "instead_of" in kwargs: + matcher = self._overrides_matchers.get(kwargs["instead_of"]) + if matcher: + if rule_ids: + # If both params are used then narrow down the rules. + rule_ids = { + k: v for k, v in matcher.patterns.items() if k in rule_ids + } + else: + rule_ids.update(matcher.patterns) rules = [self._rules[id_] for id_ in rule_ids] - if len(kwargs) == 1 and rules: + if rules and kwargs.keys() <= {"to_return", "instead_of"}: return rules # Search other parameters as well @@ -340,7 +339,7 @@ def search_overrides(self, **kwargs) -> List[ApplyRule]: warnings.warn(msg, DeprecationWarning, stacklevel=2) return self.search(**kwargs) - def _match_url_for_po( + def _match_url_for_page_object( self, url: Union[_Url, str], matcher: Optional[URLMatcher] = None ) -> Optional[Type[ItemPage]]: """Returns the page object to use based on the URL and URLMatcher.""" @@ -362,7 +361,9 @@ def overrides_for( """ result: Dict[Type[ItemPage], Type[ItemPage]] = {} for target, matcher in self._overrides_matchers.items(): - po = self._match_url_for_po(url, matcher) + if target is None: + continue + po = self._match_url_for_page_object(url, matcher) if po: result[target] = po return result @@ -375,8 +376,10 @@ def page_object_for_item( See example: :ref:`rules-page_object_for_item-example`. """ + if item_cls is None: + return None matcher = self._item_matchers.get(item_cls) - return self._match_url_for_po(url, matcher) + return self._match_url_for_page_object(url, matcher) def _walk_module(module: str) -> Iterable: From 1668e5fb1cb8e7aae762ce54f01ffce1c6713ce4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 13 Jan 2023 16:07:49 +0800 Subject: [PATCH 13/16] use DefaultDict and remove type ignores --- web_poet/rules.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index caa85062..746f5f12 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -6,7 +6,18 @@ import warnings from collections import defaultdict, deque from operator import attrgetter -from typing import Any, Dict, Iterable, List, Mapping, Optional, Type, TypeVar, Union +from typing import ( + Any, + DefaultDict, + Dict, + Iterable, + List, + Mapping, + Optional, + Type, + TypeVar, + Union, +) import attrs from url_matcher import Patterns, URLMatcher @@ -117,10 +128,12 @@ class ExampleComProductPage(WebPage[Product]): def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): self._rules: Dict[int, ApplyRule] = {} - self._overrides_matchers: Dict[ + self._overrides_matchers: DefaultDict[ Optional[Type[ItemPage]], URLMatcher ] = defaultdict(URLMatcher) - self._item_matchers: Dict[Optional[Type], URLMatcher] = defaultdict(URLMatcher) + self._item_matchers: DefaultDict[Optional[Type], URLMatcher] = defaultdict( + URLMatcher + ) # Ensures that URLMatcher is deterministic in returning a rule when # matching. As of url_macher==0.2.0, `url_matcher.URLMatcher._sort_domain` @@ -140,8 +153,7 @@ def __init__(self, *, rules: Optional[Iterable[ApplyRule]] = None): def add_rule(self, rule: ApplyRule) -> None: """Registers an :class:`web_poet.rules.ApplyRule` instance.""" - # Ignore the type since ``ApplyRule`` could have an empty ``.to_return`` - matched = self._item_matchers.get(rule.to_return) # type: ignore[arg-type] + matched = self._item_matchers.get(rule.to_return) if matched: # A common case when a page object subclasses another one with the # same URL pattern. From 16f05080bbe45b7e48369b8bd870c24bbecae779 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 13 Jan 2023 16:09:41 +0800 Subject: [PATCH 14/16] use better variable names inside overrides_for() Co-authored-by: Mikhail Korobov --- web_poet/rules.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/web_poet/rules.py b/web_poet/rules.py index 746f5f12..c368eb13 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -372,12 +372,12 @@ def overrides_for( See example: :ref:`rules-overrides_for-example`. """ result: Dict[Type[ItemPage], Type[ItemPage]] = {} - for target, matcher in self._overrides_matchers.items(): - if target is None: + for replaced_page, matcher in self._overrides_matchers.items(): + if replaced_page is None: continue - po = self._match_url_for_page_object(url, matcher) - if po: - result[target] = po + page = self._match_url_for_page_object(url, matcher) + if page: + result[replaced_page] = page return result def page_object_for_item( From 1ab1a43abf2a3dc4c490216ebb6bc301bec6522e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 Jan 2023 19:51:58 +0800 Subject: [PATCH 15/16] fix misleading variable name in docs --- docs/page-objects/rules.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/page-objects/rules.rst b/docs/page-objects/rules.rst index c9572143..d521717d 100644 --- a/docs/page-objects/rules.rst +++ b/docs/page-objects/rules.rst @@ -436,11 +436,11 @@ item class that you want from a given webpage, you can use from web_poet import default_registry - item_cls = default_registry.page_object_for_item( + page_cls = default_registry.page_object_for_item( "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html", Book ) - print(item_cls) # BookPage + print(page_cls) # BookPage Using rules from External Packages From e86bf379f8ca1ddd9c6479499a1988e1b47b794a Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 16 Jan 2023 20:50:42 +0800 Subject: [PATCH 16/16] =?UTF-8?q?rename=20page=5Fobject=5Ffor=5Fitem()=20?= =?UTF-8?q?=E2=86=92=20page=5Fcls=5Ffor=5Fitem()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.rst | 2 +- docs/page-objects/rules.rst | 6 +++--- tests/test_rules.py | 4 ++-- web_poet/rules.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b343f8ae..e4a134e7 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,7 +9,7 @@ TBR * :meth:`web_poet.rules.RulesRegistry.add_rule` * :meth:`web_poet.rules.RulesRegistry.overrides_for` - * :meth:`web_poet.rules.RulesRegistry.page_object_for_item` + * :meth:`web_poet.rules.RulesRegistry.page_cls_for_item` * Improved the performance of :meth:`web_poet.rules.RulesRegistry.search` where passing a single parameter of either ``instead_of`` or ``to_return`` results diff --git a/docs/page-objects/rules.rst b/docs/page-objects/rules.rst index d521717d..c5597675 100644 --- a/docs/page-objects/rules.rst +++ b/docs/page-objects/rules.rst @@ -423,20 +423,20 @@ by passing the webpage URL. For example: It returns a :class:`Mapping` where the *key* represents the page object class that is overridden or replaced by the page object class in the *value*. -.. _rules-page_object_for_item-example: +.. _rules-page_cls_for_item-example: Identify the page object that could create the item """"""""""""""""""""""""""""""""""""""""""""""""""" Suppose you want to retrieve the page object class that is able to create the item class that you want from a given webpage, you can use -:meth:`~.RulesRegistry.page_object_for_item`. For example: +:meth:`~.RulesRegistry.page_cls_for_item`. For example: .. code-block:: python from web_poet import default_registry - page_cls = default_registry.page_object_for_item( + page_cls = default_registry.page_cls_for_item( "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html", Book ) diff --git a/tests/test_rules.py b/tests/test_rules.py index 0e6ca96f..88faa457 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -401,12 +401,12 @@ def test_overrides_for() -> None: } -def test_page_object_for_item() -> None: +def test_page_cls_for_item() -> None: # This is not associated with any rule. class FakeItem: pass - method = default_registry.page_object_for_item + method = default_registry.page_cls_for_item for cls in [str, RequestUrl, ResponseUrl]: url = cls("https://example.com") diff --git a/web_poet/rules.py b/web_poet/rules.py index c368eb13..232f537c 100644 --- a/web_poet/rules.py +++ b/web_poet/rules.py @@ -380,13 +380,13 @@ def overrides_for( result[replaced_page] = page return result - def page_object_for_item( + def page_cls_for_item( self, url: Union[_Url, str], item_cls: Type ) -> Optional[Type]: """Return the page object class associated with the given URL that's able to produce the given ``item_cls``. - See example: :ref:`rules-page_object_for_item-example`. + See example: :ref:`rules-page_cls_for_item-example`. """ if item_cls is None: return None