Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Annotated[Item, PickFields("x", "y")] to decide which fields to populate in callback #111

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/api_reference.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ API
:members:
:no-special-members:

.. autoclass:: scrapy_poet.PickFields
:members:
:no-special-members:

.. autoclass:: scrapy_poet.NotPickFields
:members:
:no-special-members:

Injection Middleware
====================

Expand Down
2 changes: 1 addition & 1 deletion scrapy_poet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .api import DummyResponse, callback_for
from .api import DummyResponse, NotPickFields, PickFields, callback_for
from .downloadermiddlewares import InjectionMiddleware
from .page_input_providers import (
CacheDataProviderMixin,
Expand Down
35 changes: 34 additions & 1 deletion scrapy_poet/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from inspect import iscoroutinefunction
from typing import Callable, Optional, Type
from typing import Callable, Iterable, Optional, Type

from scrapy.http import Request, Response
from web_poet.pages import ItemPage
Expand Down Expand Up @@ -130,3 +130,36 @@ def parse(*args, item: page_or_item_cls, **kwargs): # type:ignore

setattr(parse, _CALLBACK_FOR_MARKER, True)
return parse


class _FieldController:
def __init__(self, *args: Iterable[str]):
self.fields = tuple(args)


class PickFields(_FieldController):
    """Marker for :class:`typing.Annotated` declaring an **inclusion** list:
    the fields which would be populated in an item.

    Any number of field names can be passed, each one as a string.

    .. code-block:: python

        Annotated[BigItem, PickFields("x", "y")]
    """


class NotPickFields(_FieldController):
    """Marker for :class:`typing.Annotated` declaring an **exclusion** list:
    the fields to leave out when an item is populated.

    Any number of field names can be passed, each one as a string.

    .. code-block:: python

        Annotated[BigItem, NotPickFields("x", "y")]
    """
17 changes: 15 additions & 2 deletions scrapy_poet/injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
UndeclaredProvidedTypeError,
)
from scrapy_poet.page_input_providers import PageObjectInputProvider
from scrapy_poet.utils import _normalize_annotated_cls

from .utils import create_registry_instance, get_scrapy_data_path

Expand Down Expand Up @@ -155,10 +156,18 @@ def build_instances(self, request: Request, response: Response, plan: andi.Plan)
)
# All the remaining dependencies are internal so they can be built just
# following the andi plan.
for cls, kwargs_spec in plan.dependencies:
for raw_cls, kwargs_spec in plan.dependencies:
# Need to handle both typing.Annotated[cls, PickFields(...)] and cls.
cls = _normalize_annotated_cls(raw_cls)

if cls not in instances.keys():
instances[cls] = cls(**kwargs_spec.kwargs(instances))

# andi may still look the instance up under the original annotated type,
# i.e. typing.Annotated[cls, PickFields(...)], so register it under that key too.
if raw_cls not in instances.keys():
instances[raw_cls] = instances[cls]

return instances

@inlineCallbacks
Expand Down Expand Up @@ -226,7 +235,11 @@ def build_instances_from_providers(
raise

objs_by_type: Dict[Callable, Any] = {type(obj): obj for obj in objs}
extra_classes = objs_by_type.keys() - provided_classes
extra_classes = objs_by_type.keys() - (
# ensure that cls from typing.Annotated[cls, PickFields(...)]
# is used when comparing.
{_normalize_annotated_cls(p) for p in provided_classes}
)
if extra_classes:
raise UndeclaredProvidedTypeError(
f"{provider} has returned instances of types {extra_classes} "
Expand Down
29 changes: 27 additions & 2 deletions scrapy_poet/page_input_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,21 @@
HttpClient,
HttpResponse,
HttpResponseHeaders,
ItemPage,
PageParams,
RequestUrl,
ResponseUrl,
)
from web_poet.fields import item_from_fields_sync
from web_poet.pages import is_injectable
from web_poet.utils import ensure_awaitable

from scrapy_poet.downloader import create_scrapy_downloader
from scrapy_poet.injection_errors import (
MalformedProvidedClassesError,
ProviderDependencyDeadlockError,
)
from scrapy_poet.utils import _derive_fields, _normalize_annotated_cls


class PageObjectInputProvider:
Expand Down Expand Up @@ -301,6 +305,7 @@ def provided_classes(self, cls):
"""If the item is in any of the ``to_return`` in the rules, then it can
definitely provide by using the corresponding page object in ``use``.
"""
cls = _normalize_annotated_cls(cls)
return isclass(cls) and self.registry.search(to_return=cls)

def update_cache(self, request: Request, mapping: Dict[Type, Any]) -> None:
Expand All @@ -318,7 +323,9 @@ async def __call__(
response: Response,
) -> List[Any]:
results = []
for cls in to_provide:
for raw_item_cls in to_provide:
cls = _normalize_annotated_cls(raw_item_cls)

item = self.get_from_cache(request, cls)
if item:
results.append(item)
Expand Down Expand Up @@ -349,10 +356,28 @@ async def __call__(
)

page_object = po_instances[page_object_cls]
item = await page_object.to_item()
item = await self._produce_item(raw_item_cls, page_object)

self.update_cache(request, po_instances)
self.update_cache(request, {type(item): item})

results.append(item)
return results

async def _produce_item(self, cls_or_annotated: Any, page_object: ItemPage) -> Any:
    """Build the item requested by ``cls_or_annotated`` from ``page_object``.

    When ``_derive_fields`` yields a non-empty selection of field names, only
    those fields are read from the page object (awaiting them when needed)
    and passed as keyword arguments to the item class. Otherwise the item is
    whatever ``page_object.to_item()`` returns.
    """
    field_names = _derive_fields(cls_or_annotated, page_object)
    if not field_names:
        return await page_object.to_item()

    # Function-scope import so this method does not depend on a new
    # module-level import.
    from web_poet.fields import get_fields_dict

    item_cls = _normalize_annotated_cls(cls_or_annotated)
    item_kwargs = {}
    # Read only the selected fields. Reading every field first and filtering
    # afterwards would still compute the unwanted ones and would leave the
    # coroutines of unselected async fields un-awaited.
    # Iterating the page object's own fields keeps requested names that are
    # not fields silently ignored, as before.
    # NOTE(review): assumes ``item_from_fields_sync`` iterated the same
    # ``get_fields_dict(page_object)`` names in the same order; confirm
    # against web-poet.
    for name in get_fields_dict(page_object):
        if name in field_names:
            item_kwargs[name] = await ensure_awaitable(getattr(page_object, name))
    return item_cls(**item_kwargs)
63 changes: 61 additions & 2 deletions scrapy_poet/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import os
from typing import Type
from typing import Any, Collection, Optional, Type
from warnings import warn

try:
from typing import Annotated # Python 3.9+
except ImportError:
from typing_extensions import _AnnotatedAlias as Annotated

from scrapy.crawler import Crawler
from scrapy.http import Request, Response
from scrapy.utils.project import inside_project, project_data_dir
from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders
from web_poet import HttpRequest, HttpResponse, HttpResponseHeaders, ItemPage
from web_poet.fields import get_fields_dict

from scrapy_poet.api import NotPickFields, PickFields


def get_scrapy_data_path(createdir: bool = True, default_dir: str = ".scrapy") -> str:
Expand Down Expand Up @@ -48,6 +56,57 @@ def scrapy_response_to_http_response(response: Response) -> HttpResponse:
)


def get_origin(cls: Any) -> Any:
    """Backward-compatible stand-in for ``typing.get_origin(tp)``, which is
    only available starting on Python 3.8, so that Python 3.7 keeps working.

    ``typing_extensions.get_origin`` is not used either, since it doesn't
    work well with ``typing_extensions.Annotated``.

    Returns ``cls.__origin__`` when the attribute exists, otherwise an empty
    tuple.
    """
    try:
        return cls.__origin__
    except AttributeError:
        return ()


def _normalize_annotated_cls(cls: Any) -> Any:
"""Returns the type ``T`` in ``typing.Annotated[T, x]``, if applicable.

See: https://peps.python.org/pep-0593/
"""
if isinstance(cls, Annotated):
cls = get_origin(cls)
return cls


def _derive_fields(annotation: Any, page_obj: ItemPage) -> Optional[Collection[str]]:
    """Return the field names to extract from the page object, based on the
    markers found in the annotation of its item:

    - ``typing.Annotated[T, PickFields(x, ...)]``: the listed names.
    - ``typing.Annotated[T, NotPickFields(x, ...)]``: the names of the page
      object's fields minus the listed ones.

    Returns ``None`` when ``annotation`` is not an annotated alias, and an
    empty collection when it carries no marker with field names.

    Raises ``ValueError`` when a marker is found after a previous one has
    already produced a non-empty selection.
    """
    # Detect the annotated alias structurally rather than with
    # ``isinstance(annotation, Annotated)``: on Python 3.9+
    # ``typing.Annotated`` is not the class of the objects produced by
    # ``Annotated[T, x]``, so that check never matches there.
    annotation_metadata = getattr(annotation, "__metadata__", None)
    if annotation_metadata is None:
        return None

    def _use_fields(_fields, values):
        # A non-empty previous selection means another marker was already
        # applied.
        if _fields:
            raise ValueError("PickFields and NotPickFields should not be used together")
        return values

    fields = []

    for metadata in annotation_metadata:
        if isinstance(metadata, PickFields):
            fields = _use_fields(fields, metadata.fields)
        elif isinstance(metadata, NotPickFields) and metadata.fields:
            # An empty exclusion list is ignored.
            fields = _use_fields(
                fields, get_fields_dict(page_obj).keys() - set(metadata.fields)
            )

    return fields


def create_registry_instance(cls: Type, crawler: Crawler):
if "SCRAPY_POET_OVERRIDES" in crawler.settings:
msg = (
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"twisted >= 18.9.0",
"url-matcher >= 0.2.0",
"web-poet >= 0.7.0",
"typing_extensions >= 4.4.0; python_version<'3.9'",
],
classifiers=[
"Development Status :: 3 - Alpha",
Expand Down
Loading