From a00be65050845c09fae55a6fc981dbf5a266fbb6 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Mon, 6 Jul 2020 10:38:06 -0400 Subject: [PATCH] derisk/simplify Fetch_SRA_to_BAM task to make it 1) more fault tolerant for optional metadata fields and 2) rely on shell script/bam header for important metadata fields --- pipes/WDL/tasks/tasks_ncbi_tools.wdl | 64 ++++++++++++---------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/pipes/WDL/tasks/tasks_ncbi_tools.wdl b/pipes/WDL/tasks/tasks_ncbi_tools.wdl index 935f470d3..a294450db 100644 --- a/pipes/WDL/tasks/tasks_ncbi_tools.wdl +++ b/pipes/WDL/tasks/tasks_ncbi_tools.wdl @@ -10,45 +10,37 @@ task Fetch_SRA_to_BAM { } command { - set -euxo pipefail + # pull reads from SRA and make a fully annotated BAM -- must succeed + set -ex + /opt/docker/scripts/sra_to_ubam.sh "${SRA_ID}" "${SRA_ID}.bam" - # pull reads from SRA and make a fully annotated BAM - /opt/docker/scripts/sra_to_ubam.sh ${SRA_ID} ${SRA_ID}.bam + # pull most metadata from BAM header + set +e + samtools view -H "${SRA_ID}.bam" | grep ^@RG | head -1 | tr '\t' '\n' > header.txt + grep CN header.txt | cut -f 2- -d : | tee OUT_CENTER + grep PL header.txt | cut -f 2- -d : | tee OUT_PLATFORM + grep SM header.txt | cut -f 2- -d : | tee OUT_BIOSAMPLE + grep LB header.txt | cut -f 2- -d : | tee OUT_LIBRARY + grep DT header.txt | cut -f 2 -d : | cut -f 1 -d T | tee OUT_RUNDATE - # pull other metadata from SRA - esearch -db sra -q "${SRA_ID}" | efetch -mode json -json > ${SRA_ID}.json - - cat ${SRA_ID}.json | jq -r \ - '.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.SUBMISSION.center_name' \ - | tee OUT_CENTER - cat ${SRA_ID}.json | jq -r \ - '.EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM | keys[] as $k | "\($k)"' \ - | tee OUT_PLATFORM - cat ${SRA_ID}.json | jq -r \ - .EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM.$( SRA.json + jq -r \ + .EXPERIMENT_PACKAGE_SET.EXPERIMENT_PACKAGE.EXPERIMENT.PLATFORM."$(