Skip to content

Commit

Permalink
Add Spark eval scripts for sanity checking on small input.
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwhite committed Jul 24, 2017
1 parent 242d13e commit aadfacb
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 0 deletions.
33 changes: 33 additions & 0 deletions scripts/spark_eval/prep_data_small_gcs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env bash

# Download all required data for small BAM and store in HDFS.

TARGET_DIR=${1:-small_spark_eval}

hadoop fs -stat $TARGET_DIR > /dev/null 2>&1
if [ $? -eq 0 ]; then
echo "$TARGET_DIR already exists. Delete it and try again."
exit 1
fi

set -e
set -x

# Create data directory in HDFS
hadoop fs -mkdir -p $TARGET_DIR

# Download exome BAM
gsutil cp gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam - | hadoop fs -put - $TARGET_DIR/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam
gsutil cp gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam.bai - | hadoop fs -put - $TARGET_DIR/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam.bai

# Download reference
gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.2bit - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.2bit
gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.dict - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.dict
gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.fasta.fai - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.fasta.fai
gsutil cp gs://hellbender/test/resources/large/human_g1k_v37.20.21.fasta - | hadoop fs -put - $TARGET_DIR/human_g1k_v37.20.21.fasta

# Download known sites VCF
gsutil cp gs://hellbender/test/resources/large/dbsnp_138.b37.20.21.vcf - | hadoop fs -put - $TARGET_DIR/dbsnp_138.b37.20.21.vcf

# List data
hadoop fs -ls -h $TARGET_DIR
9 changes: 9 additions & 0 deletions scripts/spark_eval/small_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

# Run the pipeline (Mark Duplicates, BQSR, Haplotype Caller) on small data on a Spark cluster.

. utils.sh

time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/small_spark_eval/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam -O hdfs:///user/$USER/small_spark_eval/out/markdups-sharded --shardedOutput true" 8 1 4g 4g
time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/small_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded --shardedOutput true -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit --knownSites hdfs:///user/$USER/small_spark_eval/dbsnp_138.b37.20.21.vcf -L 20:10000000-10100000" 1 8 32g 4g
time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit -O hdfs:///user/$USER/small_spark_eval/out/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf -pairHMM AVX_LOGLESS_CACHING" 8 1 4g 4g
9 changes: 9 additions & 0 deletions scripts/spark_eval/small_pipeline_gcs_hdfs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env bash

# Run the pipeline (Mark Duplicates, BQSR, Haplotype Caller) on small data on a GCS Dataproc cluster. Data is in HDFS.

. utils.sh

time_gatk "MarkDuplicatesSpark -I hdfs:///user/$USER/small_spark_eval/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam -O hdfs:///user/$USER/small_spark_eval/out/markdups-sharded --shardedOutput true" 8 1 4g 4g
time_gatk "BQSRPipelineSpark -I hdfs:///user/$USER/small_spark_eval/out/markdups-sharded -O hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded --shardedOutput true -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit --knownSites hdfs://${GCS_CLUSTER}-m:8020/user/$USER/small_spark_eval/dbsnp_138.b37.20.21.vcf -L 20:10000000-10100000" 1 8 32g 4g
time_gatk "HaplotypeCallerSpark -I hdfs:///user/$USER/small_spark_eval/out/bqsr-sharded -R hdfs:///user/$USER/small_spark_eval/human_g1k_v37.20.21.2bit -O hdfs://${GCS_CLUSTER}-m:8020/user/$USER/small_spark_eval/out/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.vcf -pairHMM AVX_LOGLESS_CACHING" 8 1 4g 4g

0 comments on commit aadfacb

Please sign in to comment.