Skip to content

Commit

Permalink
Add CLI options to locate-data-objects to allow skipping searches
Browse files Browse the repository at this point in the history
Runs may appear in the ML warehouse before they appear in iRODS. This
change allows queries against iRODS to be skipped if there is evidence
that this is the case.
  • Loading branch information
kjsanger committed Jul 4, 2023
1 parent 94a8b70 commit 18ea0db
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 2 deletions.
52 changes: 50 additions & 2 deletions scripts/locate-data-objects
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# @author Keith James <kdj@sanger.ac.uk>

import argparse
from collections import defaultdict
from datetime import datetime, timedelta, timezone

import sqlalchemy
Expand All @@ -27,7 +28,12 @@ from partisan.irods import AVU, query_metadata
from sqlalchemy.orm import Session

from npg_irods import illumina, ont
from npg_irods.cli import add_logging_arguments, configure_logging, parse_iso_date
from npg_irods.cli import (
add_logging_arguments,
configure_logging,
integer_in_range,
parse_iso_date,
)
from npg_irods.db import DBConfig
from npg_irods.db.mlwh import find_consent_withdrawn_samples
from npg_irods.metadata.common import SeqConcept
Expand Down Expand Up @@ -122,14 +128,34 @@ ilup_parser.add_argument(
type=parse_iso_date,
default=datetime.now(timezone.utc) - timedelta(days=14),
)
# Boolean opt-in flag (store_true, so False unless given on the command line).
# Both hyphenated and underscored spellings are accepted. illumina_updates
# reads it as cli_args.skip_absent_runs.
ilup_parser.add_argument(
"--skip-absent-runs",
"--skip_absent_runs",
help="Skip runs that cannot be found in iRODS after multiple attempts.",
action="store_true",
)
# Integer option validated by integer_in_range(1, 10); a value outside that
# range, or a non-integer, is rejected at argument-parsing time. Defaults to 10.
# illumina_updates reads it as cli_args.skip_threshold and compares it with its
# per-run attempt counter.
ilup_parser.add_argument(
"--skip-threshold",
"--skip_threshold",
help="Skip runs after this many attempts. Defaults to 10, values must be in the "
"range 1-10.",
type=integer_in_range(1, 10),
default=10,
)


def illumina_updates(cli_args):
dbconfig = DBConfig.from_file(cli_args.database_config.name, "mlwh_ro")
engine = sqlalchemy.create_engine(dbconfig.url)
with Session(engine) as session:
num_processed = num_errors = 0

iso_date = cli_args.begin_date.strftime("%Y-%m-%dT%H:%M:%SZ")
skip_absent_runs = cli_args.skip_absent_runs
skip_threshold = cli_args.skip_threshold

attempts_per_run = defaultdict(int)
success_per_run = defaultdict(int)

for i, c in enumerate(
illumina.find_components_changed(session, since=cli_args.begin_date)
Expand All @@ -143,7 +169,29 @@ def illumina_updates(cli_args):
AVU(Instrument.LANE, c.position),
AVU(SeqConcept.TAG_INDEX, c.tag_index),
]
for obj in query_metadata(*avus, collection=False, zone=cli_args.zone):

if (
skip_absent_runs
and success_per_run[c.id_run] == 0
and attempts_per_run[c.id_run] > skip_threshold
):
log.info(
"Skipping run after unsuccessful attempts to find it",
item=i,
component=c,
since=iso_date,
attempts=skip_threshold,
)
continue

result = query_metadata(*avus, collection=False, zone=cli_args.zone)
if not result:
success_per_run[c.id_run] += 1
continue

attempts_per_run[c.id_run] += 1

for obj in result:
print(obj)

except Exception as e:
Expand Down
18 changes: 18 additions & 0 deletions src/npg_irods/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,3 +194,21 @@ def parse_iso_date(date: str) -> datetime:
raise argparse.ArgumentTypeError(
f"Incorrect format {date}. Please use ISO8601 UTC e.g. 2022-01-30T11:11:03Z"
)


def integer_in_range(minimum: int, maximum: int):
    """Return an argparse ``type`` callable for integers in a closed range.

    Args:
        minimum: Smallest acceptable value (inclusive).
        maximum: Largest acceptable value (inclusive).

    Returns:
        A function that converts a command line value to an int. It raises
        argparse.ArgumentTypeError if the value cannot be converted to an
        integer, or if the integer is less than minimum or greater than
        maximum.
    """

    def check_range(value: str) -> int:
        """Convert value to an int and check that it lies within the range."""
        try:
            val = int(value)
        except (TypeError, ValueError) as e:
            # TypeError covers non-string values such as None. Chaining keeps
            # the underlying conversion failure attached for debugging.
            raise argparse.ArgumentTypeError(
                f"Value {value} is not an integer"
            ) from e

        if not minimum <= val <= maximum:
            raise argparse.ArgumentTypeError(
                f"Value {val} is not in range {minimum} to {maximum}"
            )
        return val

    return check_range

0 comments on commit 18ea0db

Please sign in to comment.