Skip to content

Commit

Permalink
improve delete dataset
Browse files Browse the repository at this point in the history
use a single statement to delete all catalogue entries instead of a loop. Depente the dataset instead of leaving it empty if all entries are removed.
  • Loading branch information
garciampred committed Dec 20, 2024
1 parent bdfc9f5 commit 0beb4ca
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 12 deletions.
42 changes: 30 additions & 12 deletions cdsobs/cli/_delete_dataset.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
from pathlib import Path

import dask
import sqlalchemy.orm
import typer
from click import prompt
from fastapi.encoders import jsonable_encoder
from rich.console import Console
from sqlalchemy import delete, func, select

from cdsobs.cli._utils import (
config_yml_typer,
list_parser,
)
from cdsobs.config import CDSObsConfig
from cdsobs.observation_catalogue.database import get_session
from cdsobs.observation_catalogue.models import Catalogue
from cdsobs.observation_catalogue.repositories.cads_dataset import CadsDatasetRepository
from cdsobs.observation_catalogue.repositories.catalogue import CatalogueRepository
from cdsobs.observation_catalogue.schemas.catalogue import (
CatalogueSchema,
Expand All @@ -26,7 +30,9 @@
def delete_dataset(
cdsobs_config_yml: Path = config_yml_typer,
dataset: str = typer.Option(..., help="dataset to delete", prompt=True),
dataset_source: str = typer.Option("", help="dataset source to delete"),
dataset_source: str = typer.Option(
None, help="dataset source to delete. By default it will delete all."
),
time: str = typer.Option(
"",
help="Filter by an exact date or by an interval of two dates. For example: "
Expand All @@ -35,7 +41,7 @@ def delete_dataset(
):
"""Permanently delete the given dataset from the catalogue and the storage."""
confirm = prompt(
"This will delete the dataset permanently."
"This will delete the data permanently."
" This action cannot be undone. "
"Please type again the name of the dataset to confirm"
)
Expand All @@ -55,8 +61,15 @@ def delete_dataset(
except (Exception, KeyboardInterrupt):
catalogue_rollback(catalogue_session, deleted_entries)
raise
if len(deleted_entries):
console.print(f"[bold green] Dataset {dataset} deleted. [/bold green]")
nd = len(deleted_entries)
console.print(f"[bold green] {nd} entries deleted from {dataset}. [/bold green]")
nremaining = catalogue_session.scalar(select(func.count()).select_from(Catalogue))
if nremaining == 0:
CadsDatasetRepository(catalogue_session).delete_dataset(dataset)
console.print(
f"[bold green] Deleted {dataset} from datasets table as it was left empty. "
f"[/bold green]"
)


def delete_from_catalogue(
Expand All @@ -78,22 +91,27 @@ def delete_from_catalogue(
entries = catalogue_repo.get_by_filters(filters)
if not len(entries):
console.print(f"[red] No entries for dataset {dataset} found")
deleted_entries = []
try:
for entry in entries:
catalogue_repo.remove(record_id=entry.id)
deleted_entries.append(entry)
catalogue_session.execute(delete(Catalogue).where(*filters))
catalogue_session.commit()
except (Exception, KeyboardInterrupt):
catalogue_rollback(catalogue_session, deleted_entries)
return deleted_entries
catalogue_rollback(catalogue_session, entries)
return entries


def delete_from_s3(deleted_entries, s3_client):
assets = [e.asset for e in deleted_entries]
for asset in assets:
bucket, name = asset.split("/")

def delete_asset(asset_to_delete):
bucket, name = asset_to_delete.split("/")
s3_client.delete_file(bucket, name)

delayed_deletes = []
for asset in assets:
delayed_deletes.append(dask.delayed(delete_asset)(asset))

dask.compute(*delayed_deletes)


def catalogue_rollback(catalogue_session, deleted_entries):
schemas = [CatalogueSchema(**jsonable_encoder(e)) for e in deleted_entries]
Expand Down
4 changes: 4 additions & 0 deletions cdsobs/observation_catalogue/repositories/cads_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,7 @@ def get_dataset(self, dataset_name: str) -> CadsDataset | None:
return self.session.scalar(
sa.select(CadsDataset).filter(CadsDataset.name == dataset_name)
)

def delete_dataset(self, dataset_name: str):
dataset = self.get_dataset(dataset_name)
self.session.delete(dataset)

0 comments on commit 0beb4ca

Please sign in to comment.