clean up chunked backend
Johann Bahl committed Mar 7, 2024
1 parent dc323d3 commit 7d03b30
Showing 8 changed files with 91 additions and 213 deletions.
4 changes: 0 additions & 4 deletions src/backy/backends/__init__.py
```diff
@@ -4,7 +4,6 @@
 from structlog.stdlib import BoundLogger
 
 if TYPE_CHECKING:
-    from backy.backup import Backup
     from backy.revision import Revision
 
 
@@ -26,6 +25,3 @@ def purge(self) -> None:
 
     def verify(self) -> None:
         pass
-
-    def scrub(self, backup: "Backup", type: str) -> int:
-        return 0
```
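With `scrub` gone, the backend contract shrinks to opening, purging, and verifying revisions. For orientation, a minimal sketch of the remaining interface, assembled from the hunks visible in this commit; the `open()` signature is copied from the chunked backend below, and anything not shown in the diff is an assumption:

```python
# Hypothetical sketch of the slimmed-down BackyBackend interface.
# Only the purge()/verify() bodies appear in this diff; the constructor
# signature is inferred from the removed scrub_light() code, which
# called `type(self)(revision, self.log).open()`.
from typing import IO, TYPE_CHECKING

from structlog.stdlib import BoundLogger

if TYPE_CHECKING:
    from backy.revision import Revision


class BackyBackend:
    def __init__(self, revision: "Revision", log: BoundLogger) -> None: ...

    def open(self, mode: str = "rb") -> IO: ...

    def purge(self) -> None:
        pass

    def verify(self) -> None:
        pass
```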
37 changes: 3 additions & 34 deletions src/backy/backends/chunked/__init__.py
```diff
@@ -7,7 +7,7 @@
 from backy.utils import END, report_status
 
 from .. import BackyBackend
-from .chunk import Chunk
+from .chunk import Chunk, Hash
 from .file import File
 from .store import Store
 
@@ -54,7 +54,7 @@ def open(self, mode: str = "rb") -> File: # type: ignore[override]
 
     def purge(self) -> None:
         self.log.debug("purge")
-        used_chunks: Set[str] = set()
+        used_chunks: Set[Hash] = set()
         for revision in self.backup.history:
             try:
                 used_chunks |= set(
@@ -90,7 +90,7 @@ def verify(self):
            if candidate in verified_chunks:
                continue
            try:
-                c = Chunk(f, 0, self.store, candidate)
+                c = Chunk(self.store, candidate)
                c._read_existing()
            except Exception:
                log.exception("verify-error", chunk=candidate)
@@ -124,34 +124,3 @@ def verify(self):
 
         yield END
         yield None
-
-    def scrub(self, backup, type: str) -> int:
-        if type == "light":
-            return self.scrub_light(backup)
-        elif type == "deep":
-            return self.scrub_deep(backup)
-        else:
-            raise RuntimeError("Invalid scrubbing type {}".format(type))
-
-    def scrub_light(self, backup) -> int:
-        errors = 0
-        self.log.info("scrub-light")
-        for revision in backup.history:
-            self.log.info("scrub-light-rev", revision_uuid=revision.uuid)
-            backend = type(self)(revision, self.log).open()
-            for hash in backend._mapping.values():
-                if backend.store.chunk_path(hash).exists():
-                    continue
-                self.log.error(
-                    "scrub-light-missing-chunk",
-                    hash=hash,
-                    revision_uuid=revision.uuid,
-                )
-                errors += 1
-        return errors
-
-    def scrub_deep(self, backup) -> int:
-        errors = self.scrub_light(backup)
-        self.log.info("scrub-deep")
-        errors += self.store.validate_chunks()
-        return errors
```
66 changes: 23 additions & 43 deletions src/backy/backends/chunked/chunk.py
```diff
@@ -2,8 +2,7 @@
 import io
 import os
 import tempfile
-import time
-from typing import Optional
+from typing import Optional, Tuple, TypeAlias
 
 import lzo
 import mmh3
@@ -12,6 +11,8 @@
 from backy.backends import BackendException
 from backy.utils import posix_fadvise
 
+Hash: TypeAlias = str
+
 chunk_stats = {
     "write_full": 0,
     "write_partial": 0,
@@ -34,38 +35,22 @@ class Chunk(object):
 
     CHUNK_SIZE = 4 * 1024**2  # 4 MiB chunks
 
-    _read_existing_called = False  # Test support
-
-    id: int
-    hash: str
-    file: "backy.backends.chunked.File"
+    hash: Optional[Hash]
     store: "backy.backends.chunked.Store"
     clean: bool
-    loaded: bool
     data: Optional[io.BytesIO]
 
-    def __init__(self, file, id, store, hash):
-        self.id = id
+    def __init__(
+        self,
+        store: "backy.backends.chunked.Store",
+        hash: Optional[Hash],
+    ):
         self.hash = hash
-        self.file = file
         self.store = store
         self.clean = True
-        self.loaded = False
-
-        if self.id not in file._access_stats:
-            self.file._access_stats[id] = (0, 0)
-
         self.data = None
 
-    def _cache_prio(self):
-        return self.file._access_stats[self.id]
-
-    def _touch(self):
-        count = self.file._access_stats[self.id][0]
-        self.file._access_stats[self.id] = (count + 1, time.time())
-
-    def _read_existing(self):
-        self._read_existing_called = True  # Test support
+    def _read_existing(self) -> None:
         if self.data is not None:
             return
         # Prepare working with the chunk. We keep the data in RAM for
@@ -88,17 +73,16 @@ def _read_existing(self):
                 raise InconsistentHash(self.hash, disk_hash)
             self._init_data(data)
 
-    def _init_data(self, data):
+    def _init_data(self, data: bytes) -> None:
         self.data = io.BytesIO(data)
 
-    def read(self, offset, size=-1):
+    def read(self, offset: int, size: int = -1) -> Tuple[bytes, int]:
         """Read data from the chunk.
 
         Return the data and the remaining size that should be read.
         """
         self._read_existing()
         assert self.data is not None
-        self._touch()
 
         self.data.seek(offset)
         data = self.data.read(size)
@@ -107,15 +91,13 @@ def read(self, offset, size=-1):
         remaining = max([0, size - len(data)])
         return data, remaining
 
-    def write(self, offset, data):
+    def write(self, offset: int, data: bytes) -> Tuple[int, bytes]:
         """Write data to the chunk, returns
         - the amount of data we used
         - the _data_ remaining
         """
-        self._touch()
-
         remaining_data = data[self.CHUNK_SIZE - offset :]
         data = data[: self.CHUNK_SIZE - offset]
@@ -133,11 +115,16 @@ def write(self, offset, data):
 
         return len(data), remaining_data
 
-    def flush(self):
+    def flush(self) -> Optional[Hash]:
+        """Writes data to disk if necessary
+        Returns the new Hash on updates
+        """
         if self.clean:
-            return
+            return None
         assert self.data is not None
-        self._update_hash()
+        # I'm not using read() here to a) avoid cache accounting and b)
+        # use a faster path to get the data.
+        self.hash = hash(self.data.getvalue())
         target = self.store.chunk_path(self.hash)
         needs_forced_write = (
             self.store.force_writes and self.hash not in self.store.seen_forced
@@ -158,15 +145,8 @@ def flush(self):
             self.store.seen_forced.add(self.hash)
         self.store.known.add(self.hash)
         self.clean = True
-
-    def _update_hash(self):
-        # I'm not using read() here to a) avoid cache accounting and b)
-        # use a faster path to get the data.
-        assert self.data is not None
-        data = self.data.getvalue()
-        self.hash = hash(data)
-        self.file._mapping[self.id] = self.hash
+        return self.hash
 
 
-def hash(data):
+def hash(data: bytes) -> Hash:
     return binascii.hexlify(mmh3.hash_bytes(data)).decode("ascii")
```
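Since `Hash` is a plain `str` alias, the module-level `hash()` function fully defines it: a hex-encoded 128-bit MurmurHash3 digest. A self-contained demonstration, renamed to `chunk_hash` here to avoid shadowing the builtin, with an arbitrary sample payload:

```python
import binascii

import mmh3


Hash = str  # mirrors the TypeAlias added in chunk.py


def chunk_hash(data: bytes) -> Hash:
    # Same construction as chunk.py's hash(): 128-bit MurmurHash3, hex-encoded.
    return binascii.hexlify(mmh3.hash_bytes(data)).decode("ascii")


digest = chunk_hash(b"example chunk payload")
assert len(digest) == 32  # 16 digest bytes -> 32 hex characters
print(digest)
```

Note that `flush()` now returns this digest (or `None` when the chunk was already clean) instead of writing it into `File._mapping` as a side effect, so callers are expected to record the returned hash themselves.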