Skip to content

Commit

Permalink
Make cache management more dynamic (#3578)
Browse files Browse the repository at this point in the history
* Make cache management more dynamic

PBENCH-1301

First, this adds code to track the unpacked size of tarballs to help with
managing cache goals. This is based on a `du -s -B1` of the unpacked directory
tree, and stored as metadata.

When asked to unpack, it will check whether there's sufficient space on the
cache device, and if not it will reclaim cache with a goal sufficient to
accommodate the target dataset.

The `pbench-tree-manage` command can now target either % free or bytes free,
and the background timer job will attempt to free 20% of the drive every 4
hours instead of targeting "old" tarballs. (The `last_ref` timestamp is now
used only to sort the list of datasets with live cache on input to reclaim
so that we free the oldest first.)
  • Loading branch information
dbutenhof authored Nov 29, 2023
1 parent fc77d24 commit e90872a
Show file tree
Hide file tree
Showing 7 changed files with 302 additions and 114 deletions.
30 changes: 26 additions & 4 deletions contrib/server/operations/pbench-upload-results.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,10 @@
from http import HTTPStatus
from io import TextIOWrapper
from pathlib import Path
import re
import socket
import sys
import time
from typing import Optional

import dateutil.parser
Expand Down Expand Up @@ -192,10 +194,17 @@ def main() -> int:
skipped = 0
early = 0
late = 0
timer = time.time()
if parsed.tarball.is_dir():
pool = parsed.tarball.glob("**/*.tar.xz")
which = []
for t in pool:
if time.time() >= timer + 5.0:
sel = len(which)
print(
f"[{early + late + skipped + sel} examined, {sel} selected, {skipped} skipped]"
)
timer = time.time()
if not t.is_file():
continue
date = t.stat().st_mtime
Expand Down Expand Up @@ -260,10 +269,23 @@ def main() -> int:
else:
failure += 1
failures.add(response.status_code)
try:
message = response.json()
except Exception:
message = response.text

# TODO: can we handle NGINX's ugly 500 "storage error"
# gracefully somehow?
if response.headers["content-type"] in (
"text/html",
"text/xml",
"application/xml",
"text/plain",
):
message = re.sub(r"[\n\s]+", " ", response.text)
elif response.headers["content-type"] == "application/json":
try:
message = response.json()
except Exception:
message = response.text
else:
message = f"{response.headers['content-type']}({response.text})"
print(
f"Upload of {t} failed: {response.status_code} ({message})",
file=sys.stderr,
Expand Down
138 changes: 34 additions & 104 deletions lib/pbench/cli/server/tree_manage.py
Original file line number Diff line number Diff line change
@@ -1,103 +1,15 @@
from datetime import datetime, timedelta, timezone
import errno
from logging import Logger
import datetime

import click
import humanfriendly
import humanize

from pbench.cli import pass_cli_context
from pbench.cli.server import config_setup
from pbench.cli.server.options import common_options
from pbench.common.logger import get_pbench_logger
from pbench.server import BadConfig, OperationCode
from pbench.server.cache_manager import CacheManager, LockManager
from pbench.server.database.models.audit import Audit, AuditStatus, AuditType

# Length of time in hours to retain unreferenced cached results data.
# TODO: this could become a configurable setting?
CACHE_LIFETIME = 4.0


def reclaim_cache(tree: CacheManager, logger: Logger, lifetime: float = CACHE_LIFETIME):
    """Reclaim cache space by deleting unpacked tarball trees that haven't
    been referenced recently.

    Iterates over every dataset known to the cache manager; for each one
    with an unpacked cache whose `last_ref` mtime is older than `lifetime`
    hours, acquires the tarball's exclusive lock (non-blocking) and deletes
    the cached tree. Each deletion attempt is recorded in the audit log.
    Datasets whose lock is currently held are skipped for this pass, since
    a held lock implies the cache is in use and `last_ref` is about to be
    refreshed. A per-dataset summary is logged at the end.

    Args:
        tree: the cache manager instance
        logger: a Logger object (project logger using `{}`-style formatting)
        lifetime: number of hours to retain unused cache data
    """
    # Any last_ref timestamp earlier than this cutoff is eligible for reclaim.
    window = datetime.now(timezone.utc) - timedelta(hours=lifetime)
    total_count = 0  # all datasets examined
    has_cache = 0  # datasets with an unpacked cache tree
    reclaimed = 0  # caches successfully deleted
    reclaim_failed = 0  # deletions that raised an unexpected error
    for tarball in tree.datasets.values():
        total_count += 1
        if tarball.unpacked:
            has_cache += 1
            # last_ref is a touch-file; its mtime records the most recent
            # access to the cached tree.
            date = datetime.fromtimestamp(
                tarball.last_ref.stat().st_mtime, timezone.utc
            )
            if date >= window:
                continue
            error = None
            audit = None
            logger.info(
                "RECLAIM {}: last_ref {:%Y-%m-%d %H:%M:%S} is older than {:%Y-%m-%d %H:%M:%S}",
                tarball.name,
                date,
                window,
            )
            try:
                # wait=False: don't block on a busy cache; a held lock raises
                # OSError (EAGAIN/EACCES), handled below as "skip".
                with LockManager(tarball.lock, exclusive=True, wait=False):
                    try:
                        audit = Audit.create(
                            name="reclaim",
                            operation=OperationCode.DELETE,
                            status=AuditStatus.BEGIN,
                            user_name=Audit.BACKGROUND_USER,
                            object_type=AuditType.DATASET,
                            object_id=tarball.resource_id,
                            object_name=tarball.name,
                        )
                    except Exception as e:
                        # Audit failure is tolerated: we still reclaim, but
                        # warn so the missing record can be investigated.
                        logger.warn(
                            "Unable to audit cache reclaim for {}: '{}'",
                            tarball.name,
                            e,
                        )
                    tarball.cache_delete()
                    reclaimed += 1
            except OSError as e:
                if e.errno in (errno.EAGAIN, errno.EACCES):
                    logger.info(
                        "RECLAIM {}: skipping because cache is locked",
                        tarball.name,
                    )
                    # If the cache is locked, regardless of age, then
                    # the last_ref timestamp is about to be updated,
                    # and we skip the dataset this time around.
                    continue
                error = e
            except Exception as e:
                error = e
            # Close out the audit record (if one was opened) with the
            # outcome; attach the error text on failure.
            attributes = {"last_ref": f"{date:%Y-%m-%d %H:%M:%S}"}
            if error:
                reclaim_failed += 1
                logger.error("RECLAIM {} failed with '{}'", tarball.name, error)
                attributes["error"] = str(error)
            if audit:
                Audit.create(
                    root=audit,
                    status=AuditStatus.FAILURE if error else AuditStatus.SUCCESS,
                    attributes=attributes,
                )
    logger.info(
        "RECLAIM summary: {} datasets, {} had cache: {} reclaimed and {} errors",
        total_count,
        has_cache,
        reclaimed,
        reclaim_failed,
    )
from pbench.server import BadConfig
from pbench.server.cache_manager import CacheManager


def print_tree(tree: CacheManager):
Expand All @@ -116,8 +28,8 @@ def print_tree(tree: CacheManager):
for tarball in tree.datasets.values():
print(f" {tarball.name}")
if tarball.unpacked:
date = datetime.fromtimestamp(
tarball.last_ref.stat().st_mtime, timezone.utc
date = datetime.datetime.fromtimestamp(
tarball.last_ref.stat().st_mtime, datetime.timezone.utc
)
print(f" Inventory is cached, last referenced {date:%Y-%m-%d %H:%M:%S}")

Expand All @@ -134,15 +46,22 @@ def print_tree(tree: CacheManager):
"--display", default=False, is_flag=True, help="Display the full tree on completion"
)
@click.option(
"--reclaim",
"--reclaim-percent",
show_default=True,
is_flag=False,
flag_value=CACHE_LIFETIME,
flag_value=20.0,
type=click.FLOAT,
help="Reclaim cached data older than <n> hours",
help="Reclaim cached data to maintain a target % free space",
)
@click.option(
"--reclaim-size",
is_flag=False,
help="Reclaim cached data to maintain specified free space",
)
@common_options
def tree_manage(context: object, display: bool, reclaim: float):
def tree_manage(
context: object, display: bool, reclaim_percent: float, reclaim_size: str
):
"""
Discover, display, and manipulate the on-disk representation of controllers
and datasets.
Expand All @@ -156,20 +75,31 @@ def tree_manage(context: object, display: bool, reclaim: float):
context: Click context (contains shared `--config` value)
display: Print a simplified representation of the hierarchy
lifetime: Number of hours to retain unused cache before reclaim
reclaim: Reclaim stale cached data
reclaim-percent: Reclaim cached data to free specified % on drive
reclaim-size: Reclaim cached data to free specified size on drive
"""
logger = None
try:
config = config_setup(context)
logger = get_pbench_logger("cachemanager", config)
cache_m = CacheManager(config, logger)
cache_m.full_discovery()
if display:
print_tree(cache_m)
if reclaim:
reclaim_cache(cache_m, logger, reclaim)
rv = 0
rv = 0
if reclaim_percent or reclaim_size:
target_size = humanfriendly.parse_size(reclaim_size) if reclaim_size else 0
target_pct = reclaim_percent if reclaim_percent else 20.0
click.echo(
f"Reclaiming {target_pct}% or {humanize.naturalsize(target_size)}"
)
outcome = cache_m.reclaim_cache(goal_pct=target_pct, goal_bytes=target_size)
un = "" if outcome else "un"
click.echo(f"The cache manager was {un}able to free the requested space")
rv = 0 if outcome else 1
except Exception as exc:
logger.exception("An error occurred discovering the file tree: {}", exc)
if logger:
logger.exception("An error occurred discovering the file tree: {}", exc)
click.echo(exc, err=True)
rv = 2 if isinstance(exc, BadConfig) else 1

Expand Down
Loading

0 comments on commit e90872a

Please sign in to comment.