Skip to content

Commit

Permalink
Merge branch 'master' of ssh://github.com/scrapinghub/web-poet into h…
Browse files Browse the repository at this point in the history
…andle_urls-with-item
  • Loading branch information
BurnzZ committed Oct 17, 2022
2 parents 0626e57 + 551aa3b commit 5fdf4a1
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 7 deletions.
31 changes: 28 additions & 3 deletions docs/advanced/fields.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,34 @@ function when accessing the fields:
Now any field can be converted from sync to async, or the other way around,
and the code would keep working.

Field processors
----------------

It's often needed to clean or process field values using reusable functions.
``@field`` takes an optional ``out`` argument with a list of such functions.
They will be applied to the field value before returning it:

.. code-block:: python

    from web_poet import ItemPage, HttpResponse, field

    def clean_tabs(s):
        return s.replace('\t', ' ')

    class MyPage(ItemPage):
        response: HttpResponse

        @field(out=[clean_tabs, str.strip])
        def name(self):
            return self.response.css(".name ::text").get()

Note that while processors can be applied to async fields, they need to be
sync functions themselves.

It's also possible to implement field cleaning and processing in ``to_item``
but in that case accessing a field directly will return the value without
processing, so it's preferable to use field processors instead.

.. _item-classes:

Item classes
Expand Down Expand Up @@ -200,9 +228,6 @@ but item classes are of a great help when
* you need to extract data in the same format from multiple websites, or
* if you want to define the schema upfront.

Item classes can also be used to hold common attribute
pre-processing and validation logic.

Error prevention
~~~~~~~~~~~~~~~~

Expand Down
29 changes: 29 additions & 0 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,3 +422,32 @@ async def test_field_with_handle_urls() -> None:
assert page.name == "name"
assert page.price == 12.99
assert await page.to_item() == Product(name="name", price=12.99)


def test_field_processors_sync() -> None:
    """Processors passed via ``out`` are applied, in order, to a sync field."""

    def append_x(text):
        return text + "x"

    @attrs.define
    class Page(ItemPage):
        @field(out=[str.strip, append_x])
        def name(self):  # noqa: D102
            return " name\t "

    # Stripping runs first, then the suffix is appended.
    assert Page().name == "namex"


@pytest.mark.asyncio
async def test_field_processors_async() -> None:
    """Sync processors passed via ``out`` are applied to an async field."""

    def append_x(text):
        return text + "x"

    @attrs.define
    class Page(ItemPage):
        @field(out=[str.strip, append_x])
        async def name(self):  # noqa: D102
            return " name\t "

    # The field is awaited; the awaited value is already processed.
    assert await Page().name == "namex"
39 changes: 39 additions & 0 deletions tests_typing/test_fields.mypy-testing
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,50 @@ class Page(ItemPage):
return "hello"


def process_price(value: float) -> float:
    """Return ``value``, replaced by ``0`` when it is not greater than zero."""
    return max(0, value)


class TypedPage(ItemPage):
    """Page object with annotated fields, used by the typing tests."""

    @field
    def description(self) -> str:
        """Field declared with the bare ``@field`` decorator."""
        return "hello"

    @field(out=[str.strip])
    def name(self) -> str:
        """Field with a single ``str.strip`` processor."""
        return "hello"

    @field(out=[process_price, str])
    def price(self) -> float:
        """Field annotated as ``float`` whose last processor is ``str``."""
        return 123.0


@attrs.define
class Item:
    """Minimal attrs item class with a single ``name`` attribute."""

    # the only attribute of the item
    name: str


@pytest.mark.mypy_testing
@pytest.mark.xfail
async def test_field_type() -> None:
    """Check the type revealed for a field declared with bare ``@field``."""
    # NOTE(review): marked xfail, so the ``R:`` expectation below is
    # presumably not met yet -- confirm the reason before removing the marker.
    page = TypedPage()
    reveal_type(page.description)  # R: builtins.str


@pytest.mark.mypy_testing
@pytest.mark.xfail
async def test_field_type_out() -> None:
    """Check the type revealed for a field that uses ``out=[str.strip]``."""
    # NOTE(review): marked xfail, so the ``R:`` expectation below is
    # presumably not met yet -- confirm the reason before removing the marker.
    page = TypedPage()
    reveal_type(page.name)  # R: builtins.str


@pytest.mark.mypy_testing
@pytest.mark.xfail
async def test_field_type_changed_type() -> None:
    """Check the revealed type when the last processor changes the type.

    ``TypedPage.price`` is annotated as ``float`` but its processor list ends
    with ``str``, so the expected revealed type is ``builtins.str``.
    """
    # NOTE(review): marked xfail, so the ``R:`` expectation below is
    # presumably not met yet -- confirm the reason before removing the marker.
    page = TypedPage()
    reveal_type(page.price)  # R: builtins.str


@pytest.mark.mypy_testing
async def test_item_from_fields() -> None:
page = Page()
Expand Down
44 changes: 40 additions & 4 deletions web_poet/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
``web_poet.fields`` is a module with helpers for putting extraction logic
into separate Page Object methods / properties.
"""
from functools import update_wrapper
from typing import Dict, List, Optional, Type, TypeVar
import inspect
from functools import update_wrapper, wraps
from typing import Callable, Dict, List, Optional, Type, TypeVar

import attrs
from itemadapter import ItemAdapter
Expand All @@ -24,6 +25,9 @@ class FieldInfo:
#: field metadata
meta: Optional[dict] = None

#: field processors
out: Optional[List[Callable]] = None


class FieldsMixin:
"""A mixin which is required for a class to support fields"""
Expand All @@ -43,7 +47,13 @@ def __init_subclass__(cls, **kwargs):
delattr(cls, _FIELDS_INFO_ATTRIBUTE_WRITE)


def field(method=None, *, cached: bool = False, meta: Optional[dict] = None):
def field(
method=None,
*,
cached: bool = False,
meta: Optional[dict] = None,
out: Optional[List[Callable]] = None,
):
"""
Page Object method decorated with ``@field`` decorator becomes a property,
which is then used by :class:`~.ItemPage`'s to_item() method to populate
Expand All @@ -55,6 +65,9 @@ def field(method=None, *, cached: bool = False, meta: Optional[dict] = None):
The ``meta`` parameter allows to store arbitrary information for the field,
e.g. ``@field(meta={"expensive": True})``. This information can be later
retrieved for all fields using the :func:`get_fields_dict` function.
The ``out`` parameter is an optional list of field processors, which are
functions applied to the value of the field before returning it.
"""

class _field:
Expand All @@ -63,6 +76,7 @@ def __init__(self, method):
raise TypeError(
f"@field decorator must be used on methods, {method!r} is decorated instead"
)
method = self._processed(method)
if cached:
self.unbound_method = cached_method(method)
else:
Expand All @@ -72,12 +86,34 @@ def __set_name__(self, owner, name):
if not hasattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE):
setattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE, {})

field_info = FieldInfo(name=name, meta=meta)
field_info = FieldInfo(name=name, meta=meta, out=out)
getattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE)[name] = field_info

def __get__(self, instance, owner=None):
return self.unbound_method(instance)

@staticmethod
def _process(value):
    """Pass ``value`` through each processor in ``out``, in list order."""
    result = value
    for apply in out:
        result = apply(result)
    return result

def _processed(self, method):
    """Wrap ``method`` so that its result goes through the processors.

    ``method`` is returned unchanged when ``out`` is empty or ``None``.
    A coroutine function gets an ``async`` wrapper that awaits the result
    before processing it; any other callable gets a sync wrapper.
    """
    if out:
        if inspect.iscoroutinefunction(method):

            @wraps(method)
            async def processed(*args, **kwargs):
                raw = await method(*args, **kwargs)
                return self._process(raw)

        else:

            @wraps(method)
            def processed(*args, **kwargs):
                raw = method(*args, **kwargs)
                return self._process(raw)

        return processed
    return method

if method is not None:
# @field syntax
res = _field(method)
Expand Down

0 comments on commit 5fdf4a1

Please sign in to comment.