diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f9501e6..a6b1fd7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,7 +5,7 @@ default_install_hook_types:
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       # - id: check-yaml
       - id: end-of-file-fixer
@@ -25,14 +25,14 @@ repos:
 
   ## Black
   - repo: https://github.com/psf/black
-    rev: 24.2.0
+    rev: 24.4.2
     hooks:
       - id: black
         stages: [pre-commit]
 
   ## Ruff
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.3.0
+    rev: v0.4.10
     hooks:
       - id: ruff
         stages: [pre-commit]
diff --git a/ckanext/files/base.py b/ckanext/files/base.py
index 9c4b199..6925e05 100644
--- a/ckanext/files/base.py
+++ b/ckanext/files/base.py
@@ -27,7 +27,7 @@
 import ckan.plugins.toolkit as tk
 from ckan.config.declaration import Declaration, Key
 
-from . import config, exceptions, model, utils, types
+from . import config, exceptions, model, types, utils
 
 
 class PFileModel(Protocol):
diff --git a/ckanext/files/cli/__init__.py b/ckanext/files/cli/__init__.py
index 176b6a4..d48e822 100644
--- a/ckanext/files/cli/__init__.py
+++ b/ckanext/files/cli/__init__.py
@@ -13,7 +13,7 @@
 from ckanext.files import base, config, exceptions, shared
 from ckanext.files.model import File, Owner
 
-from . import dev, migrate
+from . import dev, maintain, migrate, stats
 
 __all__ = [
     "files",
@@ -27,6 +27,8 @@ def files():
 
 files.add_command(dev.group, "dev")
 files.add_command(migrate.group, "migrate")
+files.add_command(stats.group, "stats")
+files.add_command(maintain.group, "maintain")
 
 
 @files.command()
diff --git a/ckanext/files/cli/maintain.py b/ckanext/files/cli/maintain.py
new file mode 100644
index 0000000..0c5063e
--- /dev/null
+++ b/ckanext/files/cli/maintain.py
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+import click
+import sqlalchemy as sa
+
+import ckan.plugins.toolkit as tk
+from ckan import model
+
+from ckanext.files import shared, utils
+
+
+def _now():
+    """Return the current time as a timezone-aware UTC datetime."""
+    return datetime.now(timezone.utc)
+
+
+@click.group()
+def group():
+    """Storage maintenance."""
+
+
+storage_option = click.option(
+    "-s",
+    "--storage-name",
+    help="Name of the configured storage",
+)
+
+
+@group.command()
+@storage_option
+@click.option("--remove", is_flag=True, help="Remove files")
+def empty_owner(storage_name: str | None, remove: bool):
+    """Manage files that have no owner."""
+    storage_name = storage_name or shared.config.default_storage()
+    try:
+        storage = shared.get_storage(storage_name)
+    except shared.exc.UnknownStorageError as err:
+        tk.error_shout(err)
+        raise click.Abort from err
+
+    if remove and not storage.supports(shared.Capability.REMOVE):
+        tk.error_shout(f"Storage {storage_name} does not support file removal")
+        raise click.Abort
+
+    stmt = (
+        sa.select(shared.File)
+        .outerjoin(shared.File.owner_info)
+        .where(shared.File.storage == storage_name, shared.Owner.owner_id.is_(None))
+    )
+
+    # Load the matches once, like the sibling commands do. The same list feeds
+    # the report and the removal loop, so the query is not re-run and no live
+    # result is iterated while files_file_delete is being called.
+    files = list(model.Session.scalars(stmt))
+    if not files:
+        click.echo(f"Every file in storage {storage_name} has owner reference")
+        return
+    click.echo("Following files do not have owner reference")
+
+    for file in files:
+        size = utils.humanize_filesize(file.size)
+        click.echo(f"\t{file.id}: {file.name} [{file.content_type}, {size}]")
+
+    if remove and click.confirm("Do you want to delete these files?"):
+        action = tk.get_action("files_file_delete")
+
+        with click.progressbar(files) as bar:
+            for file in bar:
+                action({"ignore_auth": True}, {"id": file.id})
+
+
+@group.command()
+@storage_option
+@click.option("--remove", is_flag=True, help="Remove files")
+def invalid_owner(storage_name: str | None, remove: bool):
+    """Manage files that have suspicious owner reference."""
+    storage_name = storage_name or shared.config.default_storage()
+    try:
+        storage = shared.get_storage(storage_name)
+    except shared.exc.UnknownStorageError as err:
+        tk.error_shout(err)
+        raise click.Abort from err
+
+    if remove and not storage.supports(shared.Capability.REMOVE):
+        tk.error_shout(f"Storage {storage_name} does not support file removal")
+        raise click.Abort
+
+    stmt = (
+        sa.select(shared.File)
+        .join(shared.File.owner_info)
+        .where(shared.File.storage == storage_name)
+    )
+
+    # The inner join keeps only files with an owner record; of those, report
+    # the ones whose `owner` attribute is None.
+    files = [f for f in model.Session.scalars(stmt) if f.owner is None]
+
+    if not files:
+        click.echo(
+            f"Every owned file in storage {storage_name} has valid owner reference",
+        )
+        return
+
+    click.echo("Following files have dangling owner reference")
+    for file in files:
+        size = utils.humanize_filesize(file.size)
+        click.echo(
+            "\t{}: {} [{}, {}]. Owner: {} {}".format(
+                file.id,
+                file.name,
+                file.content_type,
+                size,
+                file.owner_info.owner_type,
+                file.owner_info.owner_id,
+            ),
+        )
+
+    if remove and click.confirm("Do you want to delete these files?"):
+        action = tk.get_action("files_file_delete")
+
+        with click.progressbar(files) as bar:
+            for file in bar:
+                action({"ignore_auth": True}, {"id": file.id})
+
+
+@group.command()
+@storage_option
+@click.option("--remove", is_flag=True, help="Remove files")
+def missing_files(storage_name: str | None, remove: bool):
+    """Manage files that do not exist in storage."""
+    storage_name = storage_name or shared.config.default_storage()
+    try:
+        storage = shared.get_storage(storage_name)
+    except shared.exc.UnknownStorageError as err:
+        tk.error_shout(err)
+        raise click.Abort from err
+
+    if not storage.supports(shared.Capability.EXISTS):
+        tk.error_shout(
+            f"Storage {storage_name} does not support file availability checks",
+        )
+        raise click.Abort
+
+    if remove and not storage.supports(shared.Capability.REMOVE):
+        tk.error_shout(f"Storage {storage_name} does not support file removal")
+        raise click.Abort
+
+    stmt = sa.select(shared.File).where(shared.File.storage == storage_name)
+    total = model.Session.scalar(sa.select(sa.func.count()).select_from(stmt))
+    missing: list[shared.File] = []
+    with click.progressbar(model.Session.scalars(stmt), length=total) as bar:
+        for file in bar:
+            data = shared.FileData.from_model(file)
+            if not storage.exists(data):
+                missing.append(file)
+
+    if not missing:
+        click.echo(
+            f"No missing files located in storage {storage_name}",
+        )
+        return
+
+    click.echo("Following files are not found in storage")
+    for file in missing:
+        size = utils.humanize_filesize(file.size)
+        click.echo(
+            f"\t{file.id}: {file.name} [{file.content_type}, {size}]",
+        )
+
+    if remove and click.confirm("Do you want to delete these files?"):
+        action = tk.get_action("files_file_delete")
+
+        with click.progressbar(missing) as bar:
+            for file in bar:
+                action({"ignore_auth": True}, {"id": file.id})
diff --git a/ckanext/files/cli/stats.py b/ckanext/files/cli/stats.py
new file mode 100644
index 0000000..939b14e
--- /dev/null
+++ b/ckanext/files/cli/stats.py
@@ -0,0 +1,148 @@
+from __future__ import annotations
+
+from datetime import datetime, timezone
+
+import click
+import sqlalchemy as sa
+from babel.dates import format_datetime, format_timedelta
+
+import ckan.plugins.toolkit as tk
+from ckan import model
+
+from ckanext.files import shared, utils
+
+
+def _now():
+    """Return the current time as a timezone-aware UTC datetime."""
+    return datetime.now(timezone.utc)
+
+
+@click.group()
+def group():
+    """Storage statistics."""
+
+
+storage_option = click.option(
+    "-s",
+    "--storage-name",
+    help="Name of the configured storage",
+)
+
+
+@group.command()
+@storage_option
+def overview(storage_name: str | None):
+    """General information about storage usage."""
+
+    storage_name = storage_name or shared.config.default_storage()
+    stmt = sa.select(
+        sa.func.sum(shared.File.size),
+        sa.func.count(shared.File.id),
+        sa.func.max(shared.File.ctime),
+        sa.func.min(shared.File.ctime),
+    ).where(shared.File.storage == storage_name)
+    row = model.Session.execute(stmt).fetchone()
+    size, count, newest, oldest = row if row else (0, 0, _now(), _now())
+
+    if not count:
+        tk.error_shout("Storage is not configured or empty")
+        raise click.Abort
+
+    click.secho(f"Number of files: {click.style(count, bold=True)}")
+    click.secho(
+        f"Used space: {click.style(utils.humanize_filesize(size), bold=True)}",
+    )
+    # NOTE(review): the subtractions below assume File.ctime is timezone-aware;
+    # a naive value minus the aware _now() raises TypeError - confirm.
+    click.secho(
+        "Newest file created at: {} ({})".format(
+            click.style(format_datetime(newest), bold=True),
+            format_timedelta(newest - _now(), add_direction=True),
+        ),
+    )
+    click.secho(
+        "Oldest file created at: {} ({})".format(
+            click.style(format_datetime(oldest), bold=True),
+            format_timedelta(oldest - _now(), add_direction=True),
+        ),
+    )
+
+
+@group.command()
+@storage_option
+def types(storage_name: str | None):
+    """Files distribution by MIMEtype."""
+
+    storage_name = storage_name or shared.config.default_storage()
+    stmt = (
+        sa.select(
+            shared.File.content_type,
+            sa.func.count(shared.File.content_type).label("count"),
+        )
+        .where(shared.File.storage == storage_name)
+        .group_by(shared.File.content_type)
+        .order_by(shared.File.content_type)
+    )
+
+    # SUM over an empty result is NULL, so fall back to 0 for an empty storage.
+    total = model.Session.scalar(sa.select(sa.func.sum(stmt.c.count))) or 0
+    click.secho(
+        "Storage {} contains {} files".format(
+            click.style(storage_name, bold=True),
+            click.style(total, bold=True),
+        ),
+    )
+    for content_type, count in model.Session.execute(stmt):
+        click.secho(f"\t{content_type}: {click.style(count, bold=True)}")
+
+
+@group.command()
+@storage_option
+@click.option(
+    "-v",
+    "--verbose",
+    is_flag=True,
+    help="Show distribution for every owner ID",
+)
+def owner(storage_name: str | None, verbose: bool):
+    """Files distribution by owner."""
+
+    storage_name = storage_name or shared.config.default_storage()
+    owner_col = (
+        sa.func.concat(shared.Owner.owner_type, " ", shared.Owner.owner_id)
+        if verbose
+        else sa.func.concat(shared.Owner.owner_type, "")
+    )
+
+    stmt = (
+        sa.select(
+            owner_col.label("owner"),
+            # Labelled so that `stmt.c.count` below resolves by name.
+            sa.func.count(shared.File.id).label("count"),
+        )
+        .where(shared.File.storage == storage_name)
+        .outerjoin(
+            shared.Owner,
+            sa.and_(
+                shared.Owner.item_id == shared.File.id,
+                shared.Owner.item_type == "file",
+            ),
+        )
+        .group_by(owner_col)
+    ).order_by(owner_col)
+
+    # SUM over an empty result is NULL, so fall back to 0 for an empty storage.
+    total = model.Session.scalar(sa.select(sa.func.sum(stmt.c.count))) or 0
+    click.secho(
+        "Storage {} contains {} files".format(
+            click.style(storage_name, bold=True),
+            click.style(total, bold=True),
+        ),
+    )
+    for owner, count in model.Session.execute(stmt):
+        click.secho(
+            "\t{}: {}".format(
+                owner.strip() or click.style("has no owner", underline=True, bold=True),
+                click.style(count, bold=True),
+            ),
+        )
diff --git a/ckanext/files/shared.py b/ckanext/files/shared.py
index 3c7edfd..17c31c7 100644
--- a/ckanext/files/shared.py
+++ b/ckanext/files/shared.py
@@ -1,6 +1,6 @@
-from . import types
 from . import config
 from . import exceptions as exc
+from . import types
 from .base import (
     FileData,
     Manager,
diff --git a/ckanext/files/types.py b/ckanext/files/types.py
index 88feb51..1285403 100644
--- a/ckanext/files/types.py
+++ b/ckanext/files/types.py
@@ -22,8 +22,7 @@ class UploadStream(Protocol):
 
     def read(self, size: Any = ..., /) -> bytes: ...
 
-    def __iter__(self) -> Iterator[bytes]:
-        ...
+    def __iter__(self) -> Iterator[bytes]: ...
 
 
 __all__ = [
diff --git a/ckanext/files/utils.py b/ckanext/files/utils.py
index 6929f0c..43daee1 100644
--- a/ckanext/files/utils.py
+++ b/ckanext/files/utils.py
@@ -28,6 +28,7 @@
 import ckan.plugins.toolkit as tk
 from ckan import model
 from ckan.lib.api_token import _get_algorithm, _get_secret  # type: ignore
+
 from ckanext.files import types
 
 log = logging.getLogger(__name__)