Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

create a weak_cache in Injector #184

Merged
merged 1 commit into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/providers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,14 @@ which Scrapy has. Although they are quite similar in its intended purpose,
could be anything that could stretch beyond Scrapy's ``Responses`` `(e.g. Network
Database queries, API Calls, AWS S3 files, etc)`.

.. note::

The :class:`scrapy_poet.injection.Injector` maintains a ``.weak_cache`` which
stores the instances created by the providers as long as the corresponding
:class:`scrapy.Request <scrapy.http.Request>` instance exists. This means that
the instances created by earlier providers can be accessed and reused by latter
providers. This is turned on by default and the instances are stored in memory.


Configuring providers
=====================
Expand Down
11 changes: 11 additions & 0 deletions scrapy_poet/injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pprint
import warnings
from typing import Any, Callable, Dict, List, Mapping, Optional, Set, Type, cast
from weakref import WeakKeyDictionary

import andi
from andi.typeutils import issubclass_safe
Expand Down Expand Up @@ -95,6 +96,11 @@ def init_cache(self): # noqa: D102
f"Cache enabled. Folder: {cache_path!r}. Caching errors: {self.caching_errors}"
)

# This is different from the cache above as it only stores instances as long
# as the request exists. This is useful for latter providers to re-use the
# already built instances by earlier providers.
self.weak_cache: WeakKeyDictionary[Request, Dict] = WeakKeyDictionary()

def available_dependencies_for_providers(
self, request: Request, response: Response
): # noqa: D102
Expand Down Expand Up @@ -294,6 +300,11 @@ def build_instances_from_providers(
)
instances.update(objs_by_type)

if self.weak_cache.get(request):
self.weak_cache[request].update(objs_by_type)
else:
self.weak_cache[request] = objs_by_type
Gallaecio marked this conversation as resolved.
Show resolved Hide resolved

if self.cache and not cache_hit:
# Save the results in the cache
self.cache[fingerprint] = serialize(objs)
Expand Down
10 changes: 10 additions & 0 deletions tests/test_injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ def callback(
ClsReqResponse: ClsReqResponse(),
ClsNoProviderRequired: ClsNoProviderRequired(),
}
assert injector.weak_cache.get(request).keys() == {ClsReqResponse, Cls1, Cls2}

instances = yield from injector.build_instances_from_providers(
request, response, plan
Expand All @@ -212,6 +213,7 @@ def callback(
Cls2: Cls2(),
ClsReqResponse: ClsReqResponse(),
}
assert injector.weak_cache.get(request).keys() == {ClsReqResponse, Cls1, Cls2}

@inlineCallbacks
def test_build_instances_from_providers_unexpected_return(self):
Expand All @@ -230,6 +232,7 @@ def callback(response: DummyResponse, a: Cls1):
yield from injector.build_instances_from_providers(
response.request, response, plan
)
assert injector.weak_cache.get(response.request) is None

assert "Provider" in str(exinf.value)
assert "Cls2" in str(exinf.value)
Expand All @@ -256,6 +259,7 @@ def callback(response: DummyResponse, arg: str):
instances = yield from injector.build_instances_from_providers(
response.request, response, plan
)
assert injector.weak_cache.get(response.request).keys() == {str}

assert instances[str] == min(str_list)

Expand Down Expand Up @@ -628,6 +632,7 @@ def callback_factory():
if name.startswith(prefix)
}
assert set(poet_stats) == expected
assert injector.weak_cache.get(response.request) is None

@inlineCallbacks
def test_po_provided_via_item(self):
Expand All @@ -642,6 +647,7 @@ def callback(response: DummyResponse, item: TestItem):
_ = yield from injector.build_callback_dependencies(response.request, response)
key = "poet/injector/tests.test_injection.TestItemPage"
assert key in set(injector.crawler.stats.get_stats())
assert injector.weak_cache.get(response.request) is None


class TestInjectorOverrides:
Expand Down Expand Up @@ -787,6 +793,7 @@ def callback(response: DummyResponse, arg_price: Price, arg_name: Name):
response.request, response, plan
)
assert cache.exists()
assert injector.weak_cache.get(response.request).keys() == {Price, Name}

validate_instances(instances)

Expand All @@ -799,6 +806,7 @@ def callback(response: DummyResponse, arg_price: Price, arg_name: Name):
instances = yield from injector.build_instances_from_providers(
response.request, response, plan
)
assert injector.weak_cache.get(response.request) is None

# Different providers. They return a different result, but the cache data should prevail.
providers = {
Expand All @@ -812,6 +820,7 @@ def callback(response: DummyResponse, arg_price: Price, arg_name: Name):
instances = yield from injector.build_instances_from_providers(
response.request, response, plan
)
assert injector.weak_cache.get(response.request).keys() == {Price, Name}

validate_instances(instances)

Expand All @@ -823,3 +832,4 @@ def callback(response: DummyResponse, arg_price: Price, arg_name: Name):
instances = yield from injector.build_instances_from_providers(
response.request, response, plan
)
assert injector.weak_cache.get(response.request) is None
Loading