Moving and testing ingest scripts from variantstore (#6881)
* Copying files over from variantstore

* some changes

* forgot to turn the test back on

* fixing test

* addressing comments
meganshand authored and Marianie-Simeon committed Feb 16, 2021
1 parent 7824192 commit 19dd991
Showing 7 changed files with 405 additions and 5 deletions.
2 changes: 1 addition & 1 deletion scripts/variantstore_cromwell_tests/import_array_manifest_test.json
@@ -3,6 +3,6 @@
"ImportArrayManifest.manifest_schema_json":"/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/manifest_schema.json",
"ImportArrayManifest.project_id":"broad-dsde-dev",
"ImportArrayManifest.dataset_name":"temp_tables",
"ImportArrayManifest.table_name": "__TABLE_NAME__",
"ImportArrayManifest.table_name": "__TABLE_NAME___probe_id",
"ImportArrayManifest.LoadManifest.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS"
}
16 changes: 16 additions & 0 deletions scripts/variantstore_cromwell_tests/import_arrays_test.json
@@ -0,0 +1,16 @@
{
"ImportArrays.output_directory":"gs://variantstore-test/__UUID__",
"ImportArrays.input_vcfs":["/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/array.vcf"],
"ImportArrays.probe_info_file":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/expected_probe_info.csv",
"ImportArrays.sample_map":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/sampleMap.csv",
"ImportArrays.sample_list_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/arrays_sample_list_schema.json",
"ImportArrays.raw_schema": "/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/raw_array_schema.json",
"ImportArrays.table_id": 1,
"ImportArrays.project_id": "broad-dsde-dev",
"ImportArrays.dataset_name": "temp_tables",
"ImportArrays.docker": "__GATK_DOCKER__",
"ImportArrays.CreateImportTsvs.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
"ImportArrays.LoadArrays.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS",
"ImportArrays.LoadArrays.load": "true",
"ImportArrays.LoadArrays.uuid": "__UUID__"
}
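
For context, the two for_testing_only inputs above are spliced verbatim into the corresponding task command blocks (see ~{for_testing_only} in ImportArrays.wdl below), so the test can authenticate inside the container. A minimal sketch of the rendered start of the LoadArrays command block, assuming the value above:

# Rendered LoadArrays command prologue (sketch; substitution done by Cromwell):
set -e
gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS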
16 changes: 12 additions & 4 deletions scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
@@ -25,15 +25,23 @@ else
fi
echo "Docker build done =========="
echo "Putting the newly built docker image into the json parameters"
-cd $WORKING_DIR/gatk/scripts/
-sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" variantstore_cromwell_tests/import_array_manifest_test.json >$WORKING_DIR/import_array_manifest_test_tmp.json
+CROMWELL_TEST_DIR="${WORKING_DIR}/gatk/scripts/variantstore_cromwell_tests"
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/import_array_manifest_test.json >$WORKING_DIR/import_array_manifest_test_tmp.json
+sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/import_arrays_test.json >$WORKING_DIR/import_arrays_test_tmp.json
sed -r "s/__TABLE_NAME__/$UUID/g" $WORKING_DIR/import_array_manifest_test_tmp.json > $WORKING_DIR/import_array_manifest_test_mod.json
echo "JSON FILE (modified) ======="
sed -r "s/__UUID__/$UUID/g" $WORKING_DIR/import_arrays_test_tmp.json > $WORKING_DIR/import_arrays_test_mod.json
echo "MANIFEST JSON FILE (modified) ======="
cat $WORKING_DIR/import_array_manifest_test_mod.json
echo "INGEST JSON FILE (modified) ======="
cat $WORKING_DIR/import_arrays_test_mod.json

sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" variantstore_cromwell_tests/local-with-gcs.conf >$WORKING_DIR/set_up.conf
sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" $CROMWELL_TEST_DIR/local-with-gcs.conf >$WORKING_DIR/set_up.conf
echo "Updated local_backend.conf with service account"

echo "Running ImportArrayManifest WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl
sudo java -Dconfig.file=$WORKING_DIR/set_up.conf -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl -i $WORKING_DIR/import_array_manifest_test_mod.json -m $WORKING_DIR/test_import_manifest_wdl.metadata

echo "Running ImportArrays WDL through cromwell"
ln -fs $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrays.wdl
sudo java -Dconfig.file=$WORKING_DIR/set_up.conf -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrays.wdl -i $WORKING_DIR/import_arrays_test_mod.json
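
For reference, a minimal sketch of what the placeholder substitutions above produce in the ingest test JSON (the hash and UUID values here are hypothetical; the real script derives them from the build):

# Hypothetical values; HASH_TO_USE and UUID come from earlier in the script.
HASH_TO_USE=deadbeef
UUID=ci_run_1
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" import_arrays_test.json \
  | sed -r "s/__UUID__/$UUID/g" > import_arrays_test_mod.json
# Resulting entries in import_arrays_test_mod.json:
#   "ImportArrays.output_directory": "gs://variantstore-test/ci_run_1",
#   "ImportArrays.docker": "broadinstitute/gatk:deadbeef",
#   "ImportArrays.LoadArrays.uuid": "ci_run_1"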
224 changes: 224 additions & 0 deletions scripts/variantstore_wdl/ImportArrays.wdl
@@ -0,0 +1,224 @@
version 1.0

workflow ImportArrays {

input {
Array[File] input_vcfs
Array[File]? input_metrics
String? probe_info_table
File? probe_info_file
String output_directory
File sample_map
String project_id
String dataset_name
File raw_schema
File sample_list_schema
#TODO: determine table_id from input sample_map (including looping over multiple table_ids)
Int table_id

Int? preemptible_tries
File? gatk_override
String? docker
}

String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

scatter (i in range(length(input_vcfs))) {
if (defined(input_metrics)) {
File input_metric = select_first([input_metrics])[i]
}

call CreateImportTsvs {
input:
input_vcf = input_vcfs[i],
input_metrics = input_metric,
probe_info_table = probe_info_table,
probe_info_file = probe_info_file,
sample_map = sample_map,
output_directory = output_directory,
gatk_override = gatk_override,
docker = docker_final,
preemptible_tries = preemptible_tries
}
}

call LoadArrays {
input:
sample_tsvs = CreateImportTsvs.sample_tsv,
project_id = project_id,
dataset_name = dataset_name,
storage_location = output_directory,
table_id = table_id,
raw_schema = raw_schema,
sample_list_schema = sample_list_schema,
preemptible_tries = preemptible_tries,
docker = docker_final
}
}


task CreateImportTsvs {
input {
File input_vcf
File? input_metrics
String? probe_info_table
File? probe_info_file
String output_directory
File sample_map

# runtime
Int? preemptible_tries
File? gatk_override
String docker

String? for_testing_only
}

Int disk_size = ceil(size(input_vcf, "GB") * 2.5) + 20

meta {
description: "Creates a tsv file for imort into BigQuery"
}
parameter_meta {
input_vcf: {
localization_optional: true
}
}
command <<<
set -e

#workaround for https://github.com/broadinstitute/cromwell/issues/3647
export TMPDIR=/tmp

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
~{for_testing_only}

gatk --java-options "-Xmx2500m" CreateArrayIngestFiles \
-V ~{input_vcf} \
~{"-QCF " + input_metrics} \
~{"--probe-info-file " + probe_info_file} \
~{"--probe-info-table " + probe_info_table} \
-SNM ~{sample_map} \
--ref-version 37

gsutil cp sample_*.tsv ~{output_directory}/sample_tsvs/
gsutil cp raw_*.tsv ~{output_directory}/raw_tsvs/
>>>
runtime {
docker: docker
memory: "4 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
output {
File sample_tsv = glob("sample_*.tsv")[0]
File arraydata_tsv = glob("raw_*.tsv")[0]
}
}

task LoadArrays {
input {
String project_id
String dataset_name
String storage_location
Int table_id
File raw_schema
File sample_list_schema
String load = "true"
String uuid = ""

#input from the previous task, used only to delay this task until all CreateImportTsvs shards are complete
Array[String] sample_tsvs

# runtime
Int? preemptible_tries
String docker

String? for_testing_only
}

command <<<
set -e
~{for_testing_only}

SAMPLE_DIR=~{storage_location}/sample_tsvs/
RAW_DIR=~{storage_location}/raw_tsvs/

let "PARTITION_START=(~{table_id}-1)*4000+1"
let "PARTITION_END=$PARTITION_START+3999"
let "PARTITION_STEP=1"
PARTITION_FIELD="sample_id"
printf -v PADDED_TABLE_ID "%03d" ~{table_id}

RAW_FILES="raw_${PADDED_TABLE_ID}_*"
METADATA_FILES="sample_${PADDED_TABLE_ID}_*"

NUM_RAW_FILES=$(gsutil ls $RAW_DIR${RAW_FILES} | wc -l)
NUM_METADATA_FILES=$(gsutil ls $SAMPLE_DIR${METADATA_FILES} | wc -l)

if [ $NUM_RAW_FILES -eq 0 -a $NUM_METADATA_FILES -eq 0 ]; then
"no files for table ${PADDED_TABLE_ID} to process in ~{storage_location}; exiting"
exit
fi

# create a metadata table and load
SAMPLE_LIST_TABLE="~{dataset_name}.~{uuid + "_"}sample_list"
if [ $NUM_METADATA_FILES -gt 0 ]; then
set +e
bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null
BQ_STATUS=$?  # capture the exit code before set -e resets $?
set -e
if [ $BQ_STATUS -ne 0 ]; then
echo "making dataset ~{dataset_name}"
bq mk --project_id=~{project_id} ~{dataset_name}
fi
set +e
bq show --project_id ~{project_id} $SAMPLE_LIST_TABLE > /dev/null
BQ_STATUS=$?  # capture the exit code before set -e resets $?
set -e
if [ $BQ_STATUS -ne 0 ]; then
echo "making table $SAMPLE_LIST_TABLE"
bq --location=US mk --project_id=~{project_id} $SAMPLE_LIST_TABLE ~{sample_list_schema}
#TODO: add a Google Storage Transfer for the table when we make it.
fi
#load should be "false" when using Google Storage Transfer: this script still creates the tables, but uploads no data.
if [ ~{load} = true ]; then
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $SAMPLE_LIST_TABLE $SAMPLE_DIR$METADATA_FILES ~{sample_list_schema}
echo "ingested ${METADATA_FILES} file from $SAMPLE_DIR into table $SAMPLE_LIST_TABLE"
else
echo "${METADATA_FILES} will be ingested from $SAMPLE_DIR by Google Storage Transfer"
fi
else
echo "no metadata files to process"
fi

# create array table
TABLE="~{dataset_name}.~{uuid + "_"}arrays_${PADDED_TABLE_ID}"
if [ $NUM_RAW_FILES -gt 0 ]; then
set +e
bq show --project_id ~{project_id} $TABLE > /dev/null
BQ_STATUS=$?  # capture the exit code before set -e resets $?
set -e
if [ $BQ_STATUS -ne 0 ]; then
echo "making table $TABLE"
bq --location=US mk --range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP \
--project_id=~{project_id} $TABLE ~{raw_schema}
#TODO: add a Google Storage Transfer for the table when we make it.
fi
if [ ~{load} = true ]; then
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --null_marker="null" --source_format=CSV -F "\t" $TABLE $RAW_DIR$RAW_FILES ~{raw_schema}
echo "ingested ${RAW_FILES} files from $RAW_DIR into table $TABLE"
else
echo "${RAW_FILES} will be ingested from $RAW_DIR
by Google Storage Transfer"
fi
else
echo "no raw data files to process"
fi
>>>
runtime {
docker: docker
memory: "4 GB"
disks: "local-disk 10 HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}
}
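
For reference, a minimal sketch of the range-partitioning arithmetic used by LoadArrays above: each table_id owns a contiguous block of 4000 sample_id values, and the zero-padded id names the target table.

# Illustrative loop only; the values follow directly from the let expressions above.
for table_id in 1 2 3; do
  let "PARTITION_START=(table_id-1)*4000+1"
  let "PARTITION_END=PARTITION_START+3999"
  printf -v PADDED_TABLE_ID "%03d" "$table_id"
  echo "arrays_${PADDED_TABLE_ID}: sample_id ${PARTITION_START}-${PARTITION_END}"
done
# arrays_001: sample_id 1-4000
# arrays_002: sample_id 4001-8000
# arrays_003: sample_id 8001-12000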
105 changes: 105 additions & 0 deletions scripts/variantstore_wdl/schemas/arrays_sample_list_schema.json
@@ -0,0 +1,105 @@
[
{
"description": "[DESCRIPTION]",
"name": "sample_id",
"type": "Integer",
"mode": "Required"
},
{
"description": "[DESCRIPTION]",
"name": "sample_name",
"type": "String",
"mode": "Required"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_ASSAYS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_NON_FILTERED_ASSAYS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_FILTERED_ASSAYS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_ZEROED_OUT_ASSAYS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_SNPS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_INDELS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_CALLS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_AUTOCALL_CALLS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_NO_CALLS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_IN_DB_SNP",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NOVEL_SNPS",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "PCT_DBSNP",
"type": "Float",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "CALL_RATE",
"type": "Float",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "AUTOCALL_CALL_RATE",
"type": "Float",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "NUM_SINGLETONS",
"type": "Integer",
"mode": "Nullable"
}
]
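
A hedged usage sketch of how LoadArrays consumes this schema file with the bq CLI (project and dataset names are the test values above; the myuuid prefix and bucket path are hypothetical):

# Mirrors the bq mk / bq load calls in LoadArrays; names are illustrative only.
bq --location=US mk --project_id=broad-dsde-dev \
  temp_tables.myuuid_sample_list arrays_sample_list_schema.json
bq load --location=US --project_id=broad-dsde-dev --skip_leading_rows=1 \
  --null_marker="null" --source_format=CSV -F "\t" \
  temp_tables.myuuid_sample_list "gs://variantstore-test/myuuid/sample_tsvs/sample_001_*" arrays_sample_list_schema.json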
