Constrained spill dask#5543
crusaderky committed Feb 14, 2022
1 parent d3f3012 commit 561cb53
Showing 7 changed files with 705 additions and 122 deletions.
3 changes: 2 additions & 1 deletion continuous_integration/environment-3.9.yaml
@@ -53,7 +53,8 @@ dependencies:
- pip:
- git+https://github.com/dask/dask
- git+https://github.com/dask/s3fs
- git+https://github.com/dask/zict
- git+https://github.com/ncclementi/zict@slow_raises #remove this after zict merged
#- git+https://github.com/dask/zict
# FIXME https://github.com/dask/distributed/issues/5345
# - git+https://github.com/intake/filesystem_spec
- git+https://github.com/joblib/joblib
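For context, the temporary pin above amounts to installing the fork directly; this is just a command-line restatement of the requirement line, to be reverted once the upstream zict pull request is merged:

pip install "git+https://github.com/ncclementi/zict@slow_raises"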
12 changes: 10 additions & 2 deletions distributed/distributed-schema.yaml
@@ -226,7 +226,7 @@ properties:
http:
type: object
decription: Settings for Dask's embedded HTTP Server
description: Settings for Dask's embedded HTTP Server
properties:
routes:
type: array
@@ -504,9 +504,17 @@ properties:
When the process memory reaches this level the nanny process will kill
the worker (if a nanny is present)
max-spill:
oneOf:
- type: string
- {type: number, minimum: 0}
- enum: [false]
description: >-
Maximum number of bytes that may be spilled to disk.
http:
type: object
decription: Settings for Dask's embedded HTTP Server
description: Settings for Dask's embedded HTTP Server
properties:
routes:
type: array
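For illustration only (not part of the diff), the oneOf above means a user config may spell the new key in any of these ways:

distributed:
  worker:
    memory:
      max-spill: "100 GB"        # a human-readable byte size
      # max-spill: 100000000000  # a plain non-negative byte count
      # max-spill: false         # no limit (the default)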
4 changes: 4 additions & 0 deletions distributed/distributed.yaml
@@ -145,6 +145,10 @@ distributed:
pause: 0.80 # fraction at which we pause worker threads
terminate: 0.95 # fraction at which we terminate the worker

# Max size of the spill file on disk (e.g. "10 GB")
# Set to false for no maximum.
max-spill: false

http:
routes:
- distributed.http.worker.prometheus
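As a small usage sketch (not part of this diff; it assumes the standard dask.config API), the same setting can be applied programmatically before a worker starts:

import dask

# Equivalent to editing distributed.yaml above; "10 GB" is an arbitrary example.
# The default, False, leaves the spill file unbounded.
dask.config.set({"distributed.worker.memory.max-spill": "10 GB"})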
265 changes: 218 additions & 47 deletions distributed/spill.py
@@ -1,76 +1,247 @@
from __future__ import annotations

from collections.abc import Hashable, Mapping
import logging
import time
from collections.abc import Mapping
from contextlib import contextmanager
from functools import partial
from typing import Any
from typing import TYPE_CHECKING, Any

from zict import Buffer, File, Func
import zict
from packaging.version import parse as parse_version

if TYPE_CHECKING:
from typing_extensions import Literal

from .protocol import deserialize_bytes, serialize_bytelist
from .sizeof import safe_sizeof

logger = logging.getLogger(__name__)


class SpillBuffer(Buffer):
class SpillBuffer(zict.Buffer):
"""MutableMapping that automatically spills out dask key/value pairs to disk when
the total size of the stored data exceeds the target
the total size of the stored data exceeds the target. If max_spill is set, no
more data is spilled to disk once the total spilled size reaches that threshold.

Parameters
----------
spill_directory: str
Location on disk to write the spill files to
target: int
Managed memory, in bytes, to start spilling at
max_spill: int | False, optional
Maximum number of bytes that may be spilled to disk. Set to False to disable.
min_log_interval: float, optional
Minimum interval, in seconds, between log warnings about a full disk
"""

spilled_by_key: dict[Hashable, int]
spilled_total: int
last_logged: float
min_log_interval: float
logged_pickle_errors: set[str]

def __init__(
self,
spill_directory: str,
target: int,
max_spill: int | Literal[False] = False,
min_log_interval: float = 2,
):

if max_spill is not False and parse_version(zict.__version__) <= parse_version(
"2.0.0"
):
raise ValueError("zict > 2.0.0 required to set max_weight")

def __init__(self, spill_directory: str, target: int):
self.spilled_by_key = {}
self.spilled_total = 0
storage = Func(
partial(serialize_bytelist, on_error="raise"),
deserialize_bytes,
File(spill_directory),
)
super().__init__(
{},
storage,
target,
weight=self._weight,
fast_to_slow_callbacks=[self._on_evict],
slow_to_fast_callbacks=[self._on_retrieve],
fast={},
slow=Slow(spill_directory, max_spill),
n=target,
weight=_in_memory_weight,
)
self.last_logged = 0
self.min_log_interval = min_log_interval
self.logged_pickle_errors = set() # keys logged with pickle error

@contextmanager
def handle_errors(self, key: str | None):
try:
yield
except MaxSpillExceeded as e:
# key is in self.fast; no keys have been lost on eviction
# Note: requires zict > 2.0
(key_e,) = e.args
assert key_e in self.fast
assert key_e not in self.slow
now = time.time()
if now - self.last_logged >= self.min_log_interval:
logger.warning(
"Spill file on disk reached capacity; keeping data in memory"
)
self.last_logged = now
raise HandledError()
except OSError:
# Typically, this is a disk full error
now = time.time()
if now - self.last_logged >= self.min_log_interval:
logger.error(
"Spill to disk failed; keeping data in memory", exc_info=True
)
self.last_logged = now
raise HandledError()
except PickleError as e:
key_e, orig_e = e.args
assert key_e in self.fast
assert key_e not in self.slow
if key_e == key:
assert key is not None
# The key we just inserted failed to serialize.
# This happens only when the key is individually larger than target.
# The exception will be caught by Worker and logged; the status of
# the task will be set to error.
del self[key]
raise orig_e
else:
# The key we just inserted is smaller than target, but it caused
# another, unrelated key to be spilled out of the LRU, and that key
# failed to serialize. There's nothing wrong with the new key. The older
# key is still in memory.
if key_e not in self.logged_pickle_errors:
logger.error(f"Failed to pickle {key_e!r}", exc_info=True)
self.logged_pickle_errors.add(key_e)
raise HandledError()

def __setitem__(self, key: str, value: Any) -> None:
"""If sizeof(value) < target, write key/value pair to self.fast; this may in
turn cause older keys to be spilled from fast to slow.
If sizeof(value) >= target, write key/value pair directly to self.slow instead.

Raises
------
Exception
    sizeof(value) >= target, and value failed to pickle.
    The key/value pair has been forgotten.

In all other cases:

- an older value was evicted and failed to pickle,
- this value or an older one caused the disk to fill and raise OSError,
- this value or an older one caused the max_spill threshold to be exceeded,

this method does not raise and guarantees that the key/value that caused the
issue remained in fast.
"""
try:
with self.handle_errors(key):
super().__setitem__(key, value)
self.logged_pickle_errors.discard(key)
except HandledError:
assert key in self.fast
assert key not in self.slow

def evict(self) -> int:
"""Manually evict the oldest key/value pair, even if target has not been reached.
Returns sizeof(value).

If the eviction failed (value failed to pickle, disk full, or max_spill
exceeded), return -1; the key/value pair that caused the issue will remain in
fast. This method never raises.
"""
try:
with self.handle_errors(None):
_, _, weight = self.fast.evict()
return weight
except HandledError:
return -1

def __delitem__(self, key: str) -> None:
super().__delitem__(key)
self.logged_pickle_errors.discard(key)

@property
def memory(self) -> Mapping[Hashable, Any]:
def memory(self) -> Mapping[str, Any]:
"""Key/value pairs stored in RAM. Alias of zict.Buffer.fast.
For inspection only - do not modify directly!
"""
return self.fast

@property
def disk(self) -> Mapping[Hashable, Any]:
def disk(self) -> Mapping[str, Any]:
"""Key/value pairs spilled out to disk. Alias of zict.Buffer.slow.
For inspection only - do not modify directly!
"""
return self.slow

@staticmethod
def _weight(key: Hashable, value: Any) -> int:
return safe_sizeof(value)

def _on_evict(self, key: Hashable, value: Any) -> None:
b = safe_sizeof(value)
self.spilled_by_key[key] = b
self.spilled_total += b

def _on_retrieve(self, key: Hashable, value: Any) -> None:
self.spilled_total -= self.spilled_by_key.pop(key)

def __setitem__(self, key: Hashable, value: Any) -> None:
self.spilled_total -= self.spilled_by_key.pop(key, 0)
super().__setitem__(key, value)
if key in self.slow:
# value is individually larger than target so it went directly to slow.
# _on_evict was not called.
b = safe_sizeof(value)
self.spilled_by_key[key] = b
self.spilled_total += b

def __delitem__(self, key: Hashable) -> None:
self.spilled_total -= self.spilled_by_key.pop(key, 0)
@property
def spilled_total(self) -> int:
"""Number of bytes spilled to disk.
Note that this is the pickled size, which may differ from the output of sizeof().
"""
return self.slow.total_weight


def _in_memory_weight(key: str, value: Any) -> int:
return safe_sizeof(value)


# Internal exceptions. These are never raised by SpillBuffer.
class MaxSpillExceeded(Exception):
pass


class PickleError(Exception):
pass


class HandledError(Exception):
pass


class Slow(zict.Func):
max_weight: int | Literal[False]
weight_by_key: dict[str, int]
total_weight: int

def __init__(self, spill_directory: str, max_weight: int | Literal[False] = False):
super().__init__(
partial(serialize_bytelist, on_error="raise"),
deserialize_bytes,
zict.File(spill_directory),
)
self.max_weight = max_weight
self.weight_by_key = {}
self.total_weight = 0

def __setitem__(self, key: str, value: Any) -> None:
try:
pickled = self.dump(value)
except Exception as e:
# zict.LRU ensures that the key remains in fast if we raise.
# Wrap the exception so that it's recognizable by SpillBuffer,
# which will then unwrap it.
raise PickleError(key, e)

pickled_size = sum(len(frame) for frame in pickled)

# Thanks to Buffer.__setitem__, we never update existing keys in slow,
# but always delete them and reinsert them.
assert key not in self.d
assert key not in self.weight_by_key

if (
self.max_weight is not False
and self.total_weight + pickled_size > self.max_weight
):
# Stop callbacks and ensure that the key ends up in SpillBuffer.fast
# To be caught by SpillBuffer.__setitem__
raise MaxSpillExceeded(key)

# Store to disk through File.
# This may raise OSError, which is caught by SpillBuffer above.
self.d[key] = pickled

self.weight_by_key[key] = pickled_size
self.total_weight += pickled_size

def __delitem__(self, key: str) -> None:
super().__delitem__(key)
self.total_weight -= self.weight_by_key.pop(key)
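
To make the new API concrete, here is a minimal usage sketch (not part of this commit; it assumes this patch plus zict newer than 2.0.0, and the byte sizes are illustrative):

import tempfile

from distributed.spill import SpillBuffer

buf = SpillBuffer(tempfile.mkdtemp(), target=1000, max_spill=10_000_000)

buf["small"] = 1              # sizeof < target: kept in memory (buf.fast)
buf["big"] = "x" * 100_000    # sizeof >= target: written straight to disk

assert "small" in buf.memory  # alias of zict.Buffer.fast
assert "big" in buf.disk      # alias of zict.Buffer.slow
print(buf.spilled_total)      # pickled bytes on disk; may differ from sizeof()

assert buf.evict() > 0        # manually spill the oldest in-memory key
assert "small" in buf.disk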
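
And a companion sketch of the constrained-spill behavior itself (again with hypothetical numbers): when writing to disk would push the spill file past max_spill, Slow.__setitem__ raises MaxSpillExceeded, SpillBuffer catches it, logs a warning at most once every min_log_interval seconds, and keeps the key in memory; the caller never sees an exception.

import tempfile

from distributed.spill import SpillBuffer

buf = SpillBuffer(tempfile.mkdtemp(), target=100, max_spill=1_000)

buf["big"] = "x" * 10_000     # pickled size exceeds max_spill
assert "big" in buf.memory    # kept in fast; nothing was lost
assert buf.spilled_total == 0

assert buf.evict() == -1      # manual eviction also refuses to overflow the cap
assert "big" in buf.memory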
