Skip to content

Commit

Permalink
derisk/simplify Fetch_SRA_to_BAM task to make it 1) more fault tolera…
Browse files Browse the repository at this point in the history
…nt for optional metadata fields and 2) rely on shell script/bam header for important metaata fields
  • Loading branch information
dpark01 committed Jul 6, 2020
1 parent 2c4d5bd commit a00be65
Showing 1 changed file with 28 additions and 36 deletions.
64 changes: 28 additions & 36 deletions pipes/WDL/tasks/tasks_ncbi_tools.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,37 @@ task Fetch_SRA_to_BAM {
}

command {
set -euxo pipefail
# pull reads from SRA and make a fully annotated BAM -- must succeed
set -ex
/opt/docker/scripts/sra_to_ubam.sh "${SRA_ID}" "${SRA_ID}.bam"

# pull reads from SRA and make a fully annotated BAM
/opt/docker/scripts/sra_to_ubam.sh ${SRA_ID} ${SRA_ID}.bam
# pull most metadata from BAM header
set +e
samtools view -H "${SRA_ID}.bam" | grep ^@RG | head -1 | tr '\t' '\n' > header.txt
grep CN header.txt | cut -f 2- -d : | tee OUT_CENTER
grep PL header.txt | cut -f 2- -d : | tee OUT_PLATFORM
grep SM header.txt | cut -f 2- -d : | tee OUT_BIOSAMPLE
grep LB header.txt | cut -f 2- -d : | tee OUT_LIBRARY
grep DT header.txt | cut -f 2 -d : | cut -f 1 -d T | tee OUT_RUNDATE

# pull other metadata from SRA
esearch -db sra -q "${SRA_ID}" | efetch -mode json -json > ${SRA_ID}.json

cat ${SRA_ID}.json | jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SUBMISSION.center_name' \
| tee OUT_CENTER
cat ${SRA_ID}.json | jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM | keys[] as $k | "\($k)"' \
| tee OUT_PLATFORM
cat ${SRA_ID}.json | jq -r \
.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM.$(<OUT_PLATFORM).INSTRUMENT_MODEL \
| tee OUT_MODEL
cat ${SRA_ID}.json | jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.IDENTIFIERS.EXTERNAL_ID|select(.namespace == "BioSample")|.content' \
| tee OUT_BIOSAMPLE
cat ${SRA_ID}.json | jq -r \
.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.DESIGN.LIBRARY_DESCRIPTOR.LIBRARY_NAME \
| tee OUT_LIBRARY
cat ${SRA_ID}.json | jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.RUN_SET.RUN.SRAFiles.SRAFile[]|select(.supertype == "Original")|.date' \
| cut -f 1 -d ' ' \
| tee OUT_RUNDATE
cat ${SRA_ID}.json | jq -r \
# pull other metadata from SRA -- allow for silent failures here!
touch OUT_MODEL OUT_COLLECTION_DATE OUT_STRAIN OUT_COLLECTED_BY OUT_GEO_LOC
esearch -db sra -q "${SRA_ID}" | efetch -mode json -json > SRA.json
jq -r \
.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM."$(<OUT_PLATFORM)".INSTRUMENT_MODEL \
SRA.json | tee OUT_MODEL
jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.SAMPLE_ATTRIBUTES.SAMPLE_ATTRIBUTE[]|select(.TAG == "collection_date")|.VALUE' \
| tee OUT_COLLECTION_DATE
cat ${SRA_ID}.json | jq -r \
SRA.json | tee OUT_COLLECTION_DATE
jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.SAMPLE_ATTRIBUTES.SAMPLE_ATTRIBUTE[]|select(.TAG == "strain")|.VALUE' \
| tee OUT_STRAIN
cat ${SRA_ID}.json | jq -r \
SRA.json | tee OUT_STRAIN
jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.SAMPLE_ATTRIBUTES.SAMPLE_ATTRIBUTE[]|select(.TAG == "collected_by")|.VALUE' \
| tee OUT_COLLECTED_BY
cat ${SRA_ID}.json | jq -r \
SRA.json | tee OUT_COLLECTED_BY
jq -r \
'.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SAMPLE.SAMPLE_ATTRIBUTES.SAMPLE_ATTRIBUTE[]|select(.TAG == "geo_loc_name")|.VALUE' \
| tee OUT_GEO_LOC
SRA.json | tee OUT_GEO_LOC
}

output {
Expand All @@ -67,10 +59,10 @@ task Fetch_SRA_to_BAM {
}

runtime {
cpu: 4
memory: select_first([machine_mem_gb, 15]) + " GB"
cpu: 2
memory: select_first([machine_mem_gb, 6]) + " GB"
disks: "local-disk 750 LOCAL"
dx_instance_type: "mem2_ssd1_v2_x4"
dx_instance_type: "mem2_ssd1_v2_x2"
docker: "${docker}"
}
}

0 comments on commit a00be65

Please sign in to comment.