Skip to content

Commit

Permalink
warn the user when the same URL pattern is present in the rule
Browse files Browse the repository at this point in the history
  • Loading branch information
BurnzZ committed Oct 13, 2022
1 parent fc9a9d9 commit dd565c3
Show file tree
Hide file tree
Showing 5 changed files with 201 additions and 131 deletions.
22 changes: 22 additions & 0 deletions scrapy_poet/overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, Mapping, Optional, Tuple, Type, Union
from warnings import warn

from scrapy import Request
from scrapy.crawler import Crawler
Expand Down Expand Up @@ -110,6 +111,26 @@ def add_rule(self, rule: RuleFromUser) -> None:
for_patterns=Patterns([pattern]), use=use, instead_of=instead_of
)
self.rules.append(rule)

# A common case when a PO subclasses another one with the same URL
# pattern. See the test_item_return_subclass() test case.
matched = self.matcher[rule.to_return]
if [
pattern
for pattern in matched.patterns.values()
if pattern == rule.for_patterns
]:
# TODO: It would be great to also list down the rules having the
# same URL pattern. But this would require some refactoring.
warn(

This comment has been minimized.

Copy link
@kmike

kmike Oct 13, 2022

Member

shouldn't we warn about it in web-poet?

This comment has been minimized.

Copy link
@BurnzZ

BurnzZ Oct 14, 2022

Author Contributor

This was my initial approach when writing it but I encountered the following use case.


Suppose a user tries to use a variety of rules from a few POPs:

from web_poet import default_registry, consume_modules

consume_modules("package_A", "package_B", "package")

rules = default_registry.get_rules()

Let's say the user wants to filter out a few rules, maybe improve on others, replace a few things etc., or better yet, ensure that there are no duplicate URL Patterns for a given to_return and/or instead_of.

rules = remove_some_things(rules)
rules = replace_some_things(rules)
rules = prioritize_some_things(rules)
# etc.

# and finally assign it to scrapy-poet
settings["SCRAPY_POET_OVERRIDES"] = rules

The issue with placing the warnings within web-poet is that the user's additional step of mangling with the rules before assigning it to scrapy-poet doesn't affect the logged warnings at all. In this case, adding it to scrapy-poet ensures that the warning is only logged when it's actually being used.

I also view web-poet as a library of rules (i.e. POPs) which could almost always have duplicate URL patterns for the same to_item and/or to_return. This should generally be okay since they could come from different authors/organizations which uses the default priority=500.

f"A similar URL pattern {list(matched.patterns.values())} has been "
f"declared earlier which uses to_return={rule.to_return}. When "
f"matching URLs against rules, the latest declared rule is used. "
f"Consider explicitly updating the priority of the rules containing "
f"the said URL pattern to easily match the expectations when reading "
f"the code."
)

if rule.instead_of:
self.matcher[rule.instead_of].add_or_update(
len(self.rules) - 1, rule.for_patterns
Expand All @@ -127,6 +148,7 @@ def overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
overrides[instead_of] = self.rules[rule_id].use
return overrides

# TODO: Refactor later
def rules_overrides_for(self, request: Request) -> Mapping[Callable, Callable]:
overrides: Dict[Callable, Callable] = {}
for instead_of, matcher in self.matcher.items():
Expand Down
111 changes: 0 additions & 111 deletions tests/po_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,122 +3,11 @@
"""
import socket

import attrs
from url_matcher.util import get_domain
from web_poet import Injectable, ItemPage, WebPage, field, handle_urls, item_from_fields

from tests.mockserver import get_ephemeral_port

# Need to define it here since it's always changing
DOMAIN = get_domain(socket.gethostbyname(socket.gethostname()))
PORT = get_ephemeral_port()
URL = f"{DOMAIN}:{PORT}"


class POOverridenPage(WebPage):
def to_item(self):
return {"msg": "PO that will be replaced"}


@handle_urls(URL, instead_of=POOverridenPage)
class POIntegrationPage(WebPage):
def to_item(self):
return {"msg": "PO replacement"}


@attrs.define
class Product:
name: str


@attrs.define
class ParentProduct:
name: str


@attrs.define
class ReplacedProduct:
name: str


@attrs.define
class ParentReplacedProduct:
name: str


@attrs.define
class SubclassReplacedProduct:
name: str


@attrs.define
class StandaloneProduct:
name: str


@attrs.define
class ProductFromInjectable:
name: str


@handle_urls(URL)
class ProductPage(ItemPage[Product]):
@field
def name(self) -> str:
return "product's name"


@handle_urls(URL)
class ParentProductPage(ItemPage[ParentProduct]):
@field
def name(self) -> str:
return "parent product's name"


@handle_urls(URL, priority=600)
class SubclassProductPage(ParentProductPage):
@field
def name(self) -> str:
return "subclass product's name"


@handle_urls(URL, to_return=ReplacedProduct)
class ReplacedProductPage(ItemPage[Product]):
@field
def name(self) -> str:
return "replaced product's name"


@handle_urls(URL)
class ParentReplacedProductPage(ItemPage[ParentReplacedProduct]):
@field
def name(self) -> str:
return "parent replaced product's name"


@handle_urls(URL, to_return=SubclassReplacedProduct)
class SubclassReplacedProductPage(ParentReplacedProductPage):
@field
def name(self) -> str:
return "subclass replaced product's name"


@handle_urls(URL, to_return=StandaloneProduct)
class StandaloneProductPage(ItemPage):
@field
def name(self) -> str:
return "standalone product's name"


# TODO: cases where `instead_of` and `to_return` are present, including
# permutations of the cases above


@handle_urls(URL, to_return=ProductFromInjectable)
class ProductFromInjectablePage(Injectable):
@field
def name(self) -> str:
return "product from injectable"

async def to_item(self) -> ProductFromInjectable:
return await item_from_fields(self, item_cls=ProductFromInjectable)
111 changes: 111 additions & 0 deletions tests/po_lib/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import attrs
from web_poet import Injectable, ItemPage, WebPage, field, handle_urls, item_from_fields

from . import URL


class POOverridenPage(WebPage):
def to_item(self):
return {"msg": "PO that will be replaced"}


@handle_urls(URL, instead_of=POOverridenPage)
class POIntegrationPage(WebPage):
def to_item(self):
return {"msg": "PO replacement"}


@attrs.define
class Product:
name: str


@attrs.define
class ParentProduct:
name: str


@attrs.define
class PriorityParentProduct:
name: str


@attrs.define
class ReplacedProduct:
name: str


@attrs.define
class ParentReplacedProduct:
name: str


@attrs.define
class SubclassReplacedProduct:
name: str


@attrs.define
class StandaloneProduct:
name: str


@attrs.define
class ProductFromInjectable:
name: str


@handle_urls(URL)
class ProductPage(ItemPage[Product]):
@field
def name(self) -> str:
return "product's name"


@handle_urls(URL)
class ParentProductPage(ItemPage[ParentProduct]):
@field
def name(self) -> str:
return "parent product's name"


@handle_urls(URL)
class PriorityParentProductPage(ItemPage[PriorityParentProduct]):
@field
def name(self) -> str:
return "priority parent product's name"


@handle_urls(URL, to_return=ReplacedProduct)
class ReplacedProductPage(ItemPage[Product]):
@field
def name(self) -> str:
return "replaced product's name"


@handle_urls(URL)
class ParentReplacedProductPage(ItemPage[ParentReplacedProduct]):
@field
def name(self) -> str:
return "parent replaced product's name"


@handle_urls(URL, to_return=StandaloneProduct)
class StandaloneProductPage(ItemPage):
@field
def name(self) -> str:
return "standalone product's name"


# TODO: cases where `instead_of` and `to_return` are present, including
# permutations of the cases above


@handle_urls(URL, to_return=ProductFromInjectable)
class ProductFromInjectablePage(Injectable):
@field
def name(self) -> str:
return "product from injectable"

async def to_item(self) -> ProductFromInjectable:
return await item_from_fields(self, item_cls=ProductFromInjectable)
30 changes: 30 additions & 0 deletions tests/po_lib/subclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from web_poet import field, handle_urls

from . import URL
from .main import (
ParentProductPage,
ParentReplacedProductPage,
PriorityParentProductPage,
SubclassReplacedProduct,
)


@handle_urls(URL)
class SubclassProductPage(ParentProductPage):
@field
def name(self) -> str:
return "subclass product's name"


@handle_urls(URL, to_return=SubclassReplacedProduct)
class SubclassReplacedProductPage(ParentReplacedProductPage):
@field
def name(self) -> str:
return "subclass replaced product's name"


@handle_urls(URL, priority=600)
class PrioritySubclassProductPage(PriorityParentProductPage):
@field
def name(self) -> str:
return "priority subclass product's name"
Loading

0 comments on commit dd565c3

Please sign in to comment.