This will report on the state of the ARCHIVE, BACKUP, and CACHE on-disk trees in addition to the state of the SQL database. (I'm going to leave analyzing and reporting on the Opensearch database for another time, since this is "off books" weekend upstream work!) This packages the ad hoc SQL queries I've been doing to monitor the server as a CLI utility, plus some more.

Here's the output of `pbench-report-generator --all` on the production server:

```
Archive report:
  117286 tarballs: 21.5 TB
  The smallest tarball, pbench-user-benchmark__2020.04.03T11.05.44, is 1.0 kB
  The biggest tarball, uperf_osp16_1_ml2ovs_25g_ew_2020.11.16T08.05.28, is 26.1 GB
Backup report:
  117286 tarballs are backed up, consuming 21.5 TB
Cache report:
  103904 datasets are cached, consuming 44.9 TB
  8 datasets have never been unpacked, 3 are missing reference timestamps, 0 have bad size metadata
  The smallest cache, pbench-user-benchmark__2020.04.03T11.05.44, is 24.6 kB
  The biggest cache, trafficgen_RHOSP16.2-RHEL8.3-nrt-OVS-OFFLOAD-PVP-LossTests_tg:trex_r:none_fs:64,128,256,512,1024,1500_nf:1024_fm:si_td:bi_ml:0.002,0.0005,0.0001_tt:bs__2020-12-26T03:16:38, is 110.5 GB
  The least recently used cache, uperf__2023.12.02T00.33.06, was referenced Dec 07
  The most recently used cache, uperf_tuned_virtual-guest_sys_file_none_2020.06.11T10.37.30, was referenced today
Operational states:
  UPLOAD states:
          OK   117285
  TOOLINDEX states:
       READY   103561
  INDEX states:
          OK   103561
      FAILED      376
       READY    13324
SQL storage report:
  Table                      Rows    Storage
  -------------------- ---------- ----------
  alembic_version               1    57.3 kB
  audit                    672249   221.8 MB
  datasets                 117285    34.3 MB
  templates                    12   221.2 kB
  server_settings               0    24.6 kB
  users                        10    81.9 kB
  dataset_metadata         351852   217.6 MB
  dataset_operations       338107    28.9 MB
  api_keys                      5    81.9 kB
  indexmaps                283670    74.5 GB
```
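The flags can also be combined for a narrower report. As a plausible example (all flags come from the CLI definition below; `--progress` takes an interval in seconds and only produces output while the archive, backup, or cache trees are being scanned):

```
pbench-report-generator --cache --detail --progress 60
```

`--detail` adds per-dataset diagnostic lines, prefixed with `|| `, for problems such as missing unpacked-size metadata.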
Showing 4 changed files with 387 additions and 1 deletion.
@@ -0,0 +1,381 @@

```python
from collections import defaultdict
import datetime
from typing import Optional

import click
import humanize
from sqlalchemy import inspect, select, text

from pbench.cli import pass_cli_context
from pbench.cli.server import config_setup
from pbench.cli.server.options import common_options
from pbench.client.types import Dataset
from pbench.common.logger import get_pbench_logger
from pbench.server import BadConfig
from pbench.server.cache_manager import CacheManager
from pbench.server.database.database import Database
from pbench.server.database.models.datasets import Metadata
from pbench.server.database.models.index_map import IndexMap


class Detail:
    """Encapsulate generation of additional diagnostics"""

    def __init__(self, detail: bool):
        self.detail = detail

    def __bool__(self) -> bool:
        return self.detail

    def write(self, message: str):
        if self.detail:
            click.echo(f"|| {message}")


class Verify:
    """Encapsulate -v status messages."""

    def __init__(self, verify: bool):
        self.verify = verify

    def __bool__(self) -> bool:
        return self.verify

    def status(self, message: str):
        if self.verify:
            ts = datetime.datetime.now()
            click.echo(f"({ts:%H:%M:%S}) {message}")


class Watch:
    """Encapsulate a periodic status check.

    Discovery (especially for cache and backup) can take a long time, so we
    centralize a periodic update notice mechanism.
    """

    def __init__(self, interval: float):
        self.interval = datetime.timedelta(seconds=interval) if interval else None
        self.start = datetime.datetime.now()
        self.last = self.start

    def update(self, status: str):
        now = datetime.datetime.now()
        if self.interval and now > self.last + self.interval:
            self.last = now
            delta = now - self.start
            hours, remainder = divmod(delta.seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            click.echo(f"[{hours:02d}:{minutes:02d}:{seconds:02d}] {status}")


detailer: Optional[Detail] = None
watcher: Optional[Watch] = None
verifier: Optional[Verify] = None
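# These three module-level singletons are created by the `report` command
# below (from the --detail, --verify, and --progress options) and are shared
# by all of the report_* helpers.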


def report_archive(tree: CacheManager):
    """Report archive statistics.

    Args:
        tree: a cache instance
    """

    tarball_count = len(tree.datasets)
    tarball_size = 0
    smallest_tarball = 0
    smallest_tarball_name = None
    biggest_tarball = 0
    biggest_tarball_name = None

    for tarball in tree.datasets.values():
        watcher.update(f"({tarball_count}) archive {tarball.name}")
        size = tarball.tarball_path.stat().st_size
        tarball_size += size
        if not smallest_tarball or size < smallest_tarball:
            smallest_tarball = size
            smallest_tarball_name = tarball.name
        if not biggest_tarball or size > biggest_tarball:
            biggest_tarball = size
            biggest_tarball_name = tarball.name
    click.echo("Archive report:")
    click.echo(
        f"  {tarball_count:d} tarballs consuming {humanize.naturalsize(tarball_size)}"
    )
    click.echo(
        f"  The smallest tarball, {smallest_tarball_name}, is "
        f"{humanize.naturalsize(smallest_tarball)}"
    )
    click.echo(
        f"  The biggest tarball, {biggest_tarball_name}, is "
        f"{humanize.naturalsize(biggest_tarball)}"
    )


def report_backup(tree: CacheManager):
    """Report tarball backup statistics.

    Args:
        tree: a cache instance
    """

    backup_count = 0
    backup_size = 0
    for tarball in tree.backup_root.glob("**/*.tar.xz"):
        watcher.update(f"({backup_count}) backup {Dataset.stem(tarball)}")
        backup_count += 1
        backup_size += tarball.stat().st_size

    click.echo("Backup report:")
    click.echo(
        f"  {backup_count} tarballs are backed up, consuming "
        f"{humanize.naturalsize(backup_size)}"
    )


def report_cache(tree: CacheManager):
    """Report cache statistics.

    Args:
        tree: a cache instance
    """

    cached_count = 0
    cached_size = 0
    lacks_size = 0
    bad_size = 0
    oldest_cache = None
    oldest_cache_name = None
    newest_cache = None
    newest_cache_name = None
    smallest_cache = 0
    smallest_cache_name = None
    biggest_cache = 0
    biggest_cache_name = None
    last_ref_errors = 0

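    # The mtime of each dataset's "last_ref" file records the most recent
    # access to its unpacked cache; it drives the LRU statistics below.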
    for tarball in tree.datasets.values():
        watcher.update(f"({cached_count}) cache {tarball.name}")
        if tarball.unpacked:
            try:
                referenced = tarball.last_ref.stat().st_mtime
            except Exception as e:
                detailer.write(f"{tarball.name} last ref access: {str(e)!r}")
                last_ref_errors += 1
            else:
                if not oldest_cache or referenced < oldest_cache:
                    oldest_cache = referenced
                    oldest_cache_name = tarball.name
                if not newest_cache or referenced > newest_cache:
                    newest_cache = referenced
                    newest_cache_name = tarball.name
            cached_count += 1
            size = Metadata.getvalue(tarball.dataset, Metadata.SERVER_UNPACKED)
            if not size:
                detailer.write(f"{tarball.name} has no unpacked size")
                lacks_size += 1
            elif not isinstance(size, int):
                detailer.write(
                    f"{tarball.name} has non-integer unpacked size "
                    f"{size!r} ({type(size)})"
                )
                bad_size += 1
            else:
                if not smallest_cache or size < smallest_cache:
                    smallest_cache = size
                    smallest_cache_name = tarball.name
                if not biggest_cache or size > biggest_cache:
                    biggest_cache = size
                    biggest_cache_name = tarball.name
                cached_size += size
    oldest = datetime.datetime.fromtimestamp(oldest_cache, datetime.timezone.utc)
    newest = datetime.datetime.fromtimestamp(newest_cache, datetime.timezone.utc)
    click.echo("Cache report:")
    click.echo(
        f"  {cached_count} datasets are cached, consuming "
        f"{humanize.naturalsize(cached_size)}"
    )
    click.echo(
        f"  {lacks_size} datasets have never been unpacked, "
        f"{last_ref_errors} are missing reference timestamps, "
        f"{bad_size} have bad size metadata"
    )
    click.echo(
        f"  The smallest cache, {smallest_cache_name}, is "
        f"{humanize.naturalsize(smallest_cache)}"
    )
    click.echo(
        f"  The biggest cache, {biggest_cache_name}, is "
        f"{humanize.naturalsize(biggest_cache)}"
    )
    click.echo(
        f"  The least recently used cache, {oldest_cache_name}, was "
        f"referenced {humanize.naturaldate(oldest)}"
    )
    click.echo(
        f"  The most recently used cache, {newest_cache_name}, was "
        f"referenced {humanize.naturaldate(newest)}"
    )


def report_sql():
    """Report the SQL table storage statistics"""
    click.echo("SQL storage report:")
    click.echo("  Table                      Rows    Storage")
    click.echo("  -------------------- ---------- ----------")
    for t in inspect(Database.db_session.get_bind()).get_table_names():
        rows = list(
            Database.db_session.execute(statement=text(f"SELECT COUNT(*) FROM {t}"))
        )[0][0]
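        # pg_total_relation_size() is PostgreSQL-specific: it reports the
        # table's total on-disk footprint, including indexes and TOAST data.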
        size = list(
            Database.db_session.execute(
                statement=text("SELECT pg_total_relation_size(:table)"),
                params={"table": t},
            )
        )[0][0]
        click.echo(f"  {t:20} {rows:>10} {humanize.naturalsize(size):>10}")

    if not detailer:
        return

    query = select(IndexMap.root, IndexMap.index)
    idxes = Database.db_session.execute(query).all()
    record_count = 0
    roots = set()
    indices = set()
    root_size = 0
    index_size = 0
    for idx in idxes:
        record_count += 1
        roots.add(idx[0])
        indices.add(idx[1])
        root_size += len(idx[0])
        index_size += len(idx[1])
    unique_root_size = 0
    unique_index_size = 0
    for r in roots:
        unique_root_size += len(r)
    for i in indices:
        unique_index_size += len(i)

    detailer.write(
        f"{record_count} indexmap records found with {len(indices)} indices "
        f"and {len(roots)} roots:"
    )
    detailer.write(
        f"  {humanize.naturalsize(index_size)} for index names, "
        f"{humanize.naturalsize(root_size)} for root names"
    )
    detailer.write(
        f"  deduped: {humanize.naturalsize(unique_index_size)} for index "
        f"names, {humanize.naturalsize(unique_root_size)} for root names"
    )


def report_states():
    """Report tarball operational states."""

    operations = defaultdict(lambda: defaultdict(int))
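    # LEFT OUTER JOIN so that datasets with no operation rows still show up
    # (with NULL operation columns) rather than being dropped from the counts.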
    rows = Database.db_session.execute(
        statement=text(
            "SELECT d.name, o.name, o.state, o.message FROM datasets AS d LEFT OUTER JOIN "
            "dataset_operations AS o ON o.dataset_ref = d.id"
        )
    )
    for row in rows:
        operations[row[1]][row[2]] += 1
        if row[2] == "FAILED":
            detailer.write(f"{row[1]} {row[2]} {row[0]} {row[3]!r}")
    click.echo("Operational states:")
    for name, states in operations.items():
        click.echo(f"  {name} states:")
        for state, count in states.items():
            click.echo(f"    {state:>8s} {count:>8d}")


@click.command(name="pbench-report-generator")
@pass_cli_context
@click.option("--all", default=False, is_flag=True, help="Display full report")
@click.option(
    "--archive", default=False, is_flag=True, help="Display archive statistics"
)
@click.option("--backup", default=False, is_flag=True, help="Display backup statistics")
@click.option("--cache", default=False, is_flag=True, help="Display cache statistics")
@click.option(
    "--detail", default=False, is_flag=True, help="Provide extra diagnostic information"
)
@click.option(
    "--progress", type=float, default=0.0, help="Show periodic progress messages"
)
@click.option("--sql", default=False, is_flag=True, help="Display SQL statistics")
@click.option(
    "--states", default=False, is_flag=True, help="Display operational states"
)
@click.option(
    "--verify", default=False, is_flag=True, help="Display intermediate messages"
)
@common_options
def report(
    context: object,
    all: bool,
    archive: bool,
    backup: bool,
    cache: bool,
    detail: bool,
    progress: float,
    sql: bool,
    states: bool,
    verify: bool,
):
    """
    Report statistics and problems in the SQL and on-disk representation of
    Pbench datasets.
    \f
    Args:
        context: click context
        all: report all statistics
        archive: report archive statistics
        backup: report backup statistics
        cache: report cache statistics
        detail: provide additional per-file diagnostics
        progress: show periodic progress messages
        sql: report SQL statistics
        states: report operational states
        verify: report internal status
    """
    logger = None

    global detailer, verifier, watcher
    detailer = Detail(detail)
    verifier = Verify(verify)
    watcher = Watch(progress)

    try:
        config = config_setup(context)
        logger = get_pbench_logger("report-generator", config)
        if any((all, archive, backup, cache)):
            cache_m = CacheManager(config, logger)
            verifier.status("starting discovery")
            cache_m.full_discovery()
            verifier.status("finished discovery")
            if all or archive:
                report_archive(cache_m)
            if all or backup:
                report_backup(cache_m)
            if all or cache:
                report_cache(cache_m)
        if all or sql:
            report_sql()
        if all or states:
            report_states()

        rv = 0
    except Exception as exc:
        if logger:
            logger.exception("An error occurred discovering the file tree: {}", exc)
        if verify:
            raise
        click.echo(exc, err=True)
        rv = 2 if isinstance(exc, BadConfig) else 1

    click.get_current_context().exit(rv)
```
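Note the exit status: 0 on success, 2 when the failure is a configuration (`BadConfig`) error, and 1 for anything else, so wrapper scripts can distinguish a misconfigured server from a runtime failure. With `--verify`, exceptions are re-raised instead, exposing the full traceback.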