Skip to content

Commit

Permalink
Merge pull request #499 from kids-first/master-validation
Browse files Browse the repository at this point in the history
✨ Data Validation
  • Loading branch information
znatty22 authored Oct 9, 2020
2 parents 263d1bf + be25568 commit 20ac772
Show file tree
Hide file tree
Showing 95 changed files with 3,153 additions and 646 deletions.
12 changes: 0 additions & 12 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,6 @@ jobs:
steps:
- checkout

# Download and cache dependencies
- restore_cache:
keys:
- v1-dependencies-{{ checksum "requirements.txt" }}
# fallback to using the latest cache if no exact match is found
- v1-dependencies-

- run:
name: test package installation
command: |
Expand All @@ -40,11 +33,6 @@ jobs:
name: build lib and docs
command: ./scripts/build.sh

- save_cache:
paths:
- ./venv
key: v1-dependencies-{{ checksum "requirements.txt" }}

- run:
name: run tests
command: ./scripts/test.sh
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,5 @@ test_graph.gml
# dot files and folders
.*

# validation results
validation_results/
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ moto==1.3.14
requests-mock==1.8.0
flake8
black
deepdiff
-r requirements.txt
107 changes: 94 additions & 13 deletions kf_lib_data_ingest/app/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@

from kf_lib_data_ingest.app import settings
from kf_lib_data_ingest.config import DEFAULT_LOG_LEVEL, DEFAULT_TARGET_URL
from kf_lib_data_ingest.common.stage import (
BASIC_VALIDATION,
ADVANCED_VALIDATION,
)
from kf_lib_data_ingest.etl.ingest_pipeline import (
DEFAULT_STAGES_TO_RUN_STR,
VALID_STAGES_TO_RUN_STRS,
Expand All @@ -18,6 +22,29 @@

CONTEXT_SETTINGS = {"help_option_names": ["-h", "--help"]}
DEFAULT_LOG_LEVEL_NAME = logging._levelToName.get(DEFAULT_LOG_LEVEL)
DEFAULT_VALIDATION_MODE = ADVANCED_VALIDATION
# Shared definition of the --validation_mode CLI option. It is kept as
# args/kwargs so the exact same option can be attached to more than one
# click command via click.option(*args, **kwargs).
VALIDATION_MODE_OPT = {
    "args": ("--validation_mode",),
    "kwargs": {
        "default": DEFAULT_VALIDATION_MODE,
        "type": click.Choice([BASIC_VALIDATION, ADVANCED_VALIDATION]),
        # NOTE: adjacent literals are concatenated verbatim, so every piece
        # must carry its own separating space (leading or trailing).
        "help": (
            "Does not apply if --no_validate CLI flag is present. "
            f"The `{BASIC_VALIDATION}` mode runs validation faster but is not "
            f"as thorough. The {ADVANCED_VALIDATION} mode takes into account "
            "implied relationships in the data and is able to resolve "
            "ambiguities or report the ambiguities if they cannot be resolved."
            "\nFor example, you have a file that relates participants and "
            "specimens, and a file that relates participants and genomic files."
            " This means your specimens have implied connections to their "
            f"genomic files through the participants. In {ADVANCED_VALIDATION}"
            " mode, the validator may be able to resolve these implied "
            "connections and report that all specimens are validly linked to "
            f"genomic files. In {BASIC_VALIDATION} mode, the validator will "
            "report that all specimens are missing links to genomic files."
        ),
    },
}


def common_args_options(func):
Expand Down Expand Up @@ -114,6 +141,19 @@ def common_args_options(func):
help=log_help_txt,
)(func)

# Disable data validation
func = click.option(
"--no_validate",
default=False,
is_flag=True,
help="A flag to skip data validation during ingestion",
)(func)

# Validation mode
func = click.option(
*VALIDATION_MODE_OPT["args"], **VALIDATION_MODE_OPT["kwargs"]
)(func)

return func


Expand Down Expand Up @@ -147,6 +187,8 @@ def ingest(
dry_run,
resume_from,
no_warehouse,
no_validate,
validation_mode,
):
"""
Run the Kids First data ingest pipeline.
Expand Down Expand Up @@ -175,9 +217,12 @@ def ingest(
else:
app_settings = settings.load()

if kwargs.pop("no_warehouse", None):
if kwargs.pop("no_warehouse"):
os.environ[app_settings.SECRETS.WAREHOUSE_DB_URL] = ""

if kwargs.pop("no_validate"):
kwargs["validation_mode"] = None

kwargs.pop("app_settings_filepath", None)
kwargs["auth_configs"] = app_settings.AUTH_CONFIGS
kwargs["db_url_env_key"] = app_settings.SECRETS.WAREHOUSE_DB_URL
Expand All @@ -192,17 +237,7 @@ def ingest(
f'starting in "{app_settings.APP_MODE}" mode'
)

perfection = pipeline.run()

logger = logging.getLogger(__name__)
if perfection:
logger.info("✅ Ingest pipeline passed validation!")
else:
logger.error(
"❌ Ingest pipeline failed validation! "
f"See {pipeline.log_file_path} for details"
)
sys.exit(1)
pipeline.run()


@cli.command()
Expand All @@ -218,6 +253,8 @@ def test(
use_async,
resume_from,
no_warehouse,
no_validate,
validation_mode,
):
"""
Run the Kids First data ingest pipeline in dry_run mode (--dry_run=True)
Expand All @@ -234,7 +271,6 @@ def test(
file or a path to a directory which contains a file called
`ingest_package_config_path.py`
"""

# Make kwargs from options
frame = inspect.currentframe()
args, _, _, values = inspect.getargvalues(frame)
Expand Down Expand Up @@ -262,6 +298,51 @@ def create_new_ingest(dest_dir=None):
new_ingest_pkg(dest_dir)


@click.command()
@click.argument(
    "file_or_dir",
    type=click.Path(exists=True, file_okay=True, dir_okay=True),
)
@click.option(*VALIDATION_MODE_OPT["args"], **VALIDATION_MODE_OPT["kwargs"])
def validate(file_or_dir, validation_mode=DEFAULT_VALIDATION_MODE):
    """
    Validate files and write validation reports to a subdirectory,
    `validation_results`, located in the directory portion of FILE_OR_DIR
    (the current working directory when FILE_OR_DIR is a bare name)

    \b
    Arguments:
        file_or_dir - the path to the file or directory of files to validate
    """
    from kf_lib_data_ingest.common.io import path_to_file_list
    from kf_lib_data_ingest.validation.validation import Validator

    # The report directory is derived from os.path.dirname of the path as
    # typed: "dir/file.tsv" -> "dir/validation_results", "file.tsv" ->
    # "<cwd>/validation_results". For a directory argument without a trailing
    # slash, dirname is the directory's parent.
    output_dir = os.path.abspath(
        os.path.join(os.path.dirname(file_or_dir), "validation_results")
    )
    v = Validator(output_dir=output_dir)

    success = False
    try:
        # Every mode other than basic validation includes implicit
        # relationships.
        success = v.validate(
            path_to_file_list(file_or_dir),
            include_implicit=(validation_mode != BASIC_VALIDATION),
        )
    except Exception as e:
        # CLI boundary: log the traceback and fall through to the failure
        # exit below instead of surfacing a raw stack trace to the user.
        v.logger.exception(str(e))

    if success:
        v.logger.info("✅ Data validation passed!")
    else:
        v.logger.error("❌ Data validation failed!")
        sys.exit(1)


# Register each sub-command on the top-level `cli` object so that it is
# reachable from the command line entry point.
cli.add_command(ingest)
cli.add_command(test)
cli.add_command(create_new_ingest)
cli.add_command(validate)
10 changes: 10 additions & 0 deletions kf_lib_data_ingest/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@ class FORMAT:
VCF = "vcf"


# Namespace of string constants relating to read groups.
class READ_GROUP:
    # Named string constants for read group quality scale values.
    class QUALITY_SCALE:
        ILLUMINA13 = "Illumina13"
        ILLUMINA15 = "Illumina15"
        ILLUMINA18 = "Illumina18"
        SANGER = "Sanger"
        SOLEXA = "Solexa"


class SEQUENCING:
class REFERENCE_GENOME:
GRCH38 = "GRCh38"
Expand Down Expand Up @@ -230,6 +239,7 @@ class STRATEGY:
RNA = "RNA-Seq"
WGS = "WGS"
WXS = "WXS"
TARGETED = "Targeted Sequencing"

class ANALYTE:
DNA = "DNA"
Expand Down
22 changes: 22 additions & 0 deletions kf_lib_data_ingest/common/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,25 @@ def write_json(data, filepath, use_jsonpickle=True, **kwargs):
if use_jsonpickle:
data = json.loads(jsonpickle.encode(data, keys=True))
json.dump(data, json_file, **kwargs)


def path_to_file_list(file_or_dir, recursive=True):
    """
    Expand a path into the list of absolute file paths it refers to.

    :param file_or_dir: path to a file or directory; `~` is expanded and the
    path is made absolute before use
    :param recursive: when the path is a directory, whether to also collect
    files from all nested subdirectories (True) or only the files directly
    inside it (False)
    :return: a list of absolute filepaths; a single-element list containing
    the path itself when it is not a directory
    """
    target = os.path.abspath(os.path.expanduser(file_or_dir))

    # Anything that is not a directory is returned as-is, wrapped in a list
    if not os.path.isdir(target):
        return [target]

    walker = os.walk(target)

    if not recursive:
        # Only the first entry yielded by os.walk: the top directory itself
        top, _, names = next(walker)
        return [os.path.join(top, name) for name in names]

    return [
        os.path.join(dirpath, name)
        for dirpath, _, names in walker
        for name in names
    ]
Loading

0 comments on commit 20ac772

Please sign in to comment.