From aadfacb9ae85dc2463ba727a963a6cb3eba74f88 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 19 Jul 2017 15:02:26 +0100 Subject: [PATCH] Add Spark eval scripts for sanity checking on small input. --- scripts/spark_eval/prep_data_small_gcs.sh | 33 +++++++++++++++++++ scripts/spark_eval/small_pipeline.sh | 9 +++++ scripts/spark_eval/small_pipeline_gcs_hdfs.sh | 9 +++++ 3 files changed, 51 insertions(+) create mode 100755 scripts/spark_eval/prep_data_small_gcs.sh create mode 100755 scripts/spark_eval/small_pipeline.sh create mode 100755 scripts/spark_eval/small_pipeline_gcs_hdfs.sh diff --git a/scripts/spark_eval/prep_data_small_gcs.sh b/scripts/spark_eval/prep_data_small_gcs.sh new file mode 100755 index 00000000000..540fdadd7d7 --- /dev/null +++ b/scripts/spark_eval/prep_data_small_gcs.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# Download all required data for the small BAM from GCS and store it in HDFS. + +TARGET_DIR=${1:-small_spark_eval} + +hadoop fs -stat $TARGET_DIR > /dev/null 2>&1 +if [ $? -eq 0 ]; then + echo "$TARGET_DIR already exists. Delete it and try again." 
+ exit 1 +fi + +set -e +set -x + +# Create data directory in HDFS +hadoop fs -mkdir -p $TARGET_DIR + +# Download WGS BAM +gsutil cp gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam - | hadoop fs -put - $TARGET_DIR/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam +gsutil cp gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam.bai - | hadoop fs -put - $TARGET_DIR/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam.bai + +# Download reference +gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.2bit - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.2bit +gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.dict - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.dict +gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.fasta.fai - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.fasta.fai +gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.fasta - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.fasta + +# Download known sites VCF +gsutil cp gs://hellbender/test/resources/large/dbsnp_138.b37.20.21.vcf - | hadoop fs -put - $TARGET_DIR/dbsnp_138.b37.20.21.vcf + +# List data +hadoop fs -ls -h $TARGET_DIR diff --git a/scripts/spark_eval/small_pipeline.sh b/scripts/spark_eval/small_pipeline.sh new file mode 100755 index 00000000000..e023ece8bbd --- /dev/null +++ b/scripts/spark_eval/small_pipeline.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +# Run the pipeline (Mark Duplicates, BQSR, Haplotype Caller) on small data on a Spark cluster. + +. 
utils.sh + +time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/small_spark_eval/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam -O hdfs:///user/$USER/small_spark_eval/out/markdups-sharded --shardedOutput true" 8 1 4g 4g +time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/small_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded --shardedOutput true -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit --knownSites hdfs:///user/$USER/small_spark_eval/dbsnp_138.b37.20.21.vcf -L 20:10000000-10100000" 1 8 32g 4g +time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit -O hdfs:///user/$USER/small_spark_eval/out/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf -pairHMM AVX_LOGLESS_CACHING" 8 1 4g 4g diff --git a/scripts/spark_eval/small_pipeline_gcs_hdfs.sh b/scripts/spark_eval/small_pipeline_gcs_hdfs.sh new file mode 100755 index 00000000000..08731cfe1bd --- /dev/null +++ b/scripts/spark_eval/small_pipeline_gcs_hdfs.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +# Run the pipeline (Mark Duplicates, BQSR, Haplotype Caller) on small data on a GCS Dataproc cluster. Data is in HDFS. + +. 
utils.sh + +time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/small_spark_eval/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam -O hdfs:///user/$USER/small_spark_eval/out/markdups-sharded --shardedOutput true" 8 1 4g 4g +time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/small_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded --shardedOutput true -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit --knownSites hdfs://${GCS_CLUSTER}-m:8020/user/$USER/small_spark_eval/dbsnp_138.b37.20.21.vcf -L 20:10000000-10100000" 1 8 32g 4g +time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit -O hdfs://${GCS_CLUSTER}-m:8020/user/$USER/small_spark_eval/out/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf -pairHMM AVX_LOGLESS_CACHING" 8 1 4g 4g