Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for field processors. #85

Merged
merged 8 commits into from
Oct 15, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions docs/advanced/fields.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,34 @@ function when accessing the fields:
Now any field can be converted from sync to async, or the other way around,
and the code would keep working.

Field processors
----------------

Field values often need to be cleaned or processed with reusable functions.
``@field`` takes an optional ``out`` argument with a list of such functions.
They are applied to the field value, in the order listed, before it is returned:

.. code-block:: python

from web_poet import ItemPage, HttpResponse, field

def clean_tabs(s):
return s.replace('\t', ' ')

class MyPage(ItemPage):
response: HttpResponse

@field(out=[clean_tabs, str.strip])
def name(self):
return self.response.css(".name ::text").get()

Note that while processors can be applied to async fields, the processors
themselves must be regular (synchronous) functions.
kmike marked this conversation as resolved.
Show resolved Hide resolved

It's also possible to implement field cleaning and processing in ``to_item``,
but in that case accessing a field directly returns the unprocessed value,
so it's preferable to use field processors instead.

Item classes
------------

Expand Down Expand Up @@ -198,9 +226,6 @@ but item classes are of a great help when
* you need to extract data in the same format from multiple websites, or
* if you want to define the schema upfront.

Item classes can also be used to hold common attribute
pre-processing and validation logic.

Error prevention
~~~~~~~~~~~~~~~~

Expand Down
29 changes: 29 additions & 0 deletions tests/test_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,32 @@ def field_foo_cached(self):
assert page.field_foo == "foo"
assert page.field_foo_meta == "foo"
assert page.field_foo_cached == "foo"


def test_field_processors_sync() -> None:
    """Processors passed via ``out`` run, in order, on a sync field's value."""

    def append_x(text):
        # Second processor: runs after ``str.strip`` has trimmed the value.
        return text + "x"

    @attrs.define
    class Page(ItemPage):
        @field(out=[str.strip, append_x])
        def name(self):  # noqa: D102
            return " name\t "

    # The raw value is stripped first, then suffixed with "x".
    assert Page().name == "namex"


@pytest.mark.asyncio
async def test_field_processors_async() -> None:
    """Processors passed via ``out`` run, in order, on an async field's value."""

    def append_x(text):
        # Second processor: runs after ``str.strip`` has trimmed the value.
        return text + "x"

    @attrs.define
    class Page(ItemPage):
        @field(out=[str.strip, append_x])
        async def name(self):  # noqa: D102
            return " name\t "

    # Accessing the field yields an awaitable; the awaited result is
    # already processed.
    result = await Page().name
    assert result == "namex"
44 changes: 40 additions & 4 deletions web_poet/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
``web_poet.fields`` is a module with helpers for putting extraction logic
into separate Page Object methods / properties.
"""
from functools import update_wrapper
from typing import Dict, List, Optional, Type, TypeVar
import inspect
from functools import update_wrapper, wraps
from typing import Callable, Dict, List, Optional, Type, TypeVar

import attrs
from itemadapter import ItemAdapter
Expand All @@ -24,6 +25,9 @@ class FieldInfo:
#: field metadata
meta: Optional[dict] = None

#: field processors
out: Optional[List[Callable]] = None


class FieldsMixin:
"""A mixin which is required for a class to support fields"""
Expand All @@ -43,7 +47,13 @@ def __init_subclass__(cls, **kwargs):
delattr(cls, _FIELDS_INFO_ATTRIBUTE_WRITE)


def field(method=None, *, cached: bool = False, meta: Optional[dict] = None):
def field(
method=None,
*,
cached: bool = False,
meta: Optional[dict] = None,
out: Optional[List[Callable]] = None,
):
"""
Page Object method decorated with ``@field`` decorator becomes a property,
which is then used by :class:`~.ItemPage`'s to_item() method to populate
Expand All @@ -55,6 +65,9 @@ def field(method=None, *, cached: bool = False, meta: Optional[dict] = None):
The ``meta`` parameter allows to store arbitrary information for the field,
e.g. ``@field(meta={"expensive": True})``. This information can be later
retrieved for all fields using the :func:`get_fields_dict` function.

The ``out`` parameter is an optional list of field processors, which are
functions applied to the value of the field before returning it.
"""

class _field:
Expand All @@ -63,6 +76,9 @@ def __init__(self, method):
raise TypeError(
f"@field decorator must be used on methods, {method!r} is decorated instead"
)
if out:
method = self._processed(method)
kmike marked this conversation as resolved.
Show resolved Hide resolved

if cached:
self.unbound_method = cached_method(method)
else:
Expand All @@ -72,12 +88,32 @@ def __set_name__(self, owner, name):
if not hasattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE):
setattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE, {})

field_info = FieldInfo(name=name, meta=meta)
field_info = FieldInfo(name=name, meta=meta, out=out)
getattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE)[name] = field_info

def __get__(self, instance, owner=None):
return self.unbound_method(instance)

@staticmethod
def _process(value):
    """Pass ``value`` through every processor in ``out`` and return the result.

    ``out`` is the processor list given to the enclosing ``field``
    decorator; processors run in list order, each one receiving the
    previous processor's return value.
    """
    result = value
    for apply in out:
        result = apply(result)
    return result

def _processed(self, method):
    """Return a wrapper for ``method`` that calls processors on its result.

    A coroutine function gets an ``async`` wrapper which awaits the
    original before processing its result; any other callable gets a
    plain wrapper. In both cases the original's metadata is copied onto
    the wrapper with ``functools.wraps``.
    """
    if not inspect.iscoroutinefunction(method):

        @wraps(method)
        def sync_wrapper(*args, **kwargs):
            return self._process(method(*args, **kwargs))

        return sync_wrapper

    @wraps(method)
    async def async_wrapper(*args, **kwargs):
        # Await first: processors operate on the resolved value, not on
        # the coroutine object.
        return self._process(await method(*args, **kwargs))

    return async_wrapper

if method is not None:
# @field syntax
res = _field(method)
Expand Down