Skip to content

Commit

Permalink
Update (#7)
Browse files Browse the repository at this point in the history
* Update JointGenotyping to use gatk4.0.1.0

* distinguish gatk4 version requirements for PairedEndSingleSampleWf and JointGenotypingWf
  • Loading branch information
bshifaw authored May 29, 2018
1 parent 063e55f commit 904564a
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 82 deletions.
2 changes: 1 addition & 1 deletion JointGenotypingWf.hg38.inputs.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"##_COMMENT1": "SAMPLE-SPECIFIC - MUST BE ADAPTED TO YOUR DATA !!!",
"JointGenotyping.callset_name": "hg38_trio",
"JointGenotyping.sample_name_map" : "",
"JointGenotyping.sample_name_map" : "gs://gatk-test-data/joint_discovery/NA12878.sample_map",

"##_COMMENT2": "INTERVALS",
"JointGenotyping.call_interval_list": "gs://broad-references/hg38/v0/wgs_calling_regions.v1.interval_list",
Expand Down
3 changes: 2 additions & 1 deletion JointGenotypingWf.options.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
"read_from_cache":true,
"write_to_cache":true,
"workflow_failure_mode" : "ContinueWhilePossible",
"##Comment":"Users may use the following docker image temporarily until broadinstitute/genomes-in-the-cloud is updated ",
"default_runtime_attributes": {
"zones": "us-central1-a us-central1-b us-central1-c us-central1-f",
"docker": "broadinstitute/genomes-in-the-cloud:2.3.1-1504795437"
"docker": "bshifaw/gatk4.0.1.0-joint-genotyping:1.0.0.0"
}
}
181 changes: 104 additions & 77 deletions JointGenotypingWf.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ workflow JointGenotyping {

String callset_name
File sample_name_map

File ref_fasta
File ref_fasta_index
File ref_dict
Expand Down Expand Up @@ -121,27 +121,6 @@ workflow JointGenotyping {
disk_size = medium_disk
}

call SNPsVariantRecalibratorCreateModel {
input:
sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
recalibration_filename = callset_name + ".snps.recal",
tranches_filename = callset_name + ".snps.tranches",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotation_values,
downsampleFactor = SNP_VQSR_downsampleFactor,
model_report_filename = callset_name + ".snps.model.report",
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_resource_vcf,
dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
disk_size = small_disk
}

call IndelsVariantRecalibrator {
input:
sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
Expand All @@ -159,8 +138,30 @@ workflow JointGenotyping {
disk_size = small_disk
}

if (num_gvcfs > 10000) {
call SNPsVariantRecalibratorCreateModel {
input:
sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
recalibration_filename = callset_name + ".snps.recal",
tranches_filename = callset_name + ".snps.tranches",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotation_values,
downsampleFactor = SNP_VQSR_downsampleFactor,
model_report_filename = callset_name + ".snps.model.report",
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_resource_vcf,
dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
disk_size = small_disk
}

scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) {
call SNPsVariantRecalibratorScattered {
call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered {
input:
sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx],
sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx],
Expand All @@ -179,13 +180,35 @@ workflow JointGenotyping {
dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
disk_size = small_disk
}
}
call GatherTranches as SNPGatherTranches {
input:
input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches),
output_filename = callset_name + ".snps.gathered.tranches",
disk_size = small_disk
}
}

call GatherTranches as SNPGatherTranches {
input:
input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches),
output_filename = callset_name + ".snps.gathered.tranches",
disk_size = small_disk

if (num_gvcfs <= 10000){
call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic {
input:
sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf,
sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index,
recalibration_filename = callset_name + ".snps.recal",
tranches_filename = callset_name + ".snps.tranches",
recalibration_tranche_values = snp_recalibration_tranche_values,
recalibration_annotation_values = snp_recalibration_annotation_values,
hapmap_resource_vcf = hapmap_resource_vcf,
hapmap_resource_vcf_index = hapmap_resource_vcf_index,
omni_resource_vcf = omni_resource_vcf,
omni_resource_vcf_index = omni_resource_vcf_index,
one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
dbsnp_resource_vcf = dbsnp_resource_vcf,
dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
disk_size = small_disk
}
}

# For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly.
Expand All @@ -201,9 +224,9 @@ workflow JointGenotyping {
indels_recalibration = IndelsVariantRecalibrator.recalibration,
indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index,
indels_tranches = IndelsVariantRecalibrator.tranches,
snps_recalibration = SNPsVariantRecalibratorScattered.recalibration[idx],
snps_recalibration_index = SNPsVariantRecalibratorScattered.recalibration_index[idx],
snps_tranches = SNPGatherTranches.tranches,
snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]),
snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]),
snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]),
indel_filter_level = indel_filter_level,
snp_filter_level = snp_filter_level,
disk_size = medium_disk
Expand Down Expand Up @@ -309,13 +332,13 @@ task ImportGVCFs {
# a significant amount of non-heap memory for native libraries.
# Also, testing has shown that the multithreaded reader initialization
# does not scale well beyond 5 threads, so don't increase beyond that.
/usr/gitc/gatk-launch --javaOptions "-Xmx4g -Xms4g" \
/gatk/gatk --java-options "-Xmx4g -Xms4g" \
GenomicsDBImport \
--genomicsDBWorkspace ${workspace_dir_name} \
--batchSize ${batch_size} \
--genomicsdb-workspace-path ${workspace_dir_name} \
--batch-size ${batch_size} \
-L ${interval} \
--sampleNameMap ${sample_name_map} \
--readerThreads 5 \
--sample-name-map ${sample_name_map} \
--reader-threads 5 \
-ip 500

tar -cf ${workspace_dir_name}.tar ${workspace_dir_name}
Expand All @@ -326,6 +349,7 @@ task ImportGVCFs {
cpu: "2"
disks: "local-disk " + disk_size + " HDD"
preemptible: 5
docker: "broadinstitute/gatk:4.0.1.1"
}
output {
File output_genomicsdb = "${workspace_dir_name}.tar"
Expand All @@ -352,14 +376,14 @@ task GenotypeGVCFs {
tar -xf ${workspace_tar}
WORKSPACE=$( basename ${workspace_tar} .tar)

/usr/gitc/gatk-launch --javaOptions "-Xmx5g -Xms5g" \
/usr/gitc/gatk --java-options "-Xmx5g -Xms5g" \
GenotypeGVCFs \
-R ${ref_fasta} \
-O ${output_vcf_filename} \
-D ${dbsnp_vcf} \
-G StandardAnnotation \
--onlyOutputCallsStartingInIntervals \
-newQual \
--only-output-calls-starting-in-intervals \
--use-new-qual-calculator \
-V gendb://$WORKSPACE \
-L ${interval}
>>>
Expand Down Expand Up @@ -388,10 +412,10 @@ task HardFilterAndMakeSitesOnlyVcf {
command {
set -e

/usr/gitc/gatk-launch --javaOptions "-Xmx3g -Xms3g" \
/usr/gitc/gatk --java-options "-Xmx3g -Xms3g" \
VariantFiltration \
--filterExpression "ExcessHet > ${excess_het_threshold}" \
--filterName ExcessHet \
--filter-expression "ExcessHet > ${excess_het_threshold}" \
--filter-name ExcessHet \
-O ${variant_filtered_vcf_filename} \
-V ${vcf}

Expand Down Expand Up @@ -435,16 +459,16 @@ task IndelsVariantRecalibrator {
Int disk_size

command {
/usr/gitc/gatk-launch --javaOptions "-Xmx24g -Xms24g" \
/usr/gitc/gatk --java-options "-Xmx24g -Xms24g" \
VariantRecalibrator \
-V ${sites_only_variant_filtered_vcf} \
-O ${recalibration_filename} \
--tranchesFile ${tranches_filename} \
-allPoly \
--tranches-file ${tranches_filename} \
--trust-all-polymorphic \
-tranche ${sep=' -tranche ' recalibration_tranche_values} \
-an ${sep=' -an ' recalibration_annotation_values} \
-mode INDEL \
--maxGaussians 4 \
--max-gaussians 4 \
-resource mills,known=false,training=true,truth=true,prior=12:${mills_resource_vcf} \
-resource axiomPoly,known=false,training=true,truth=false,prior=10:${axiomPoly_resource_vcf} \
-resource dbsnp,known=true,training=false,truth=false,prior=2:${dbsnp_resource_vcf}
Expand Down Expand Up @@ -486,18 +510,18 @@ task SNPsVariantRecalibratorCreateModel {
Int disk_size

command {
/usr/gitc/gatk-launch --javaOptions "-Xmx100g -Xms100g" \
/usr/gitc/gatk --java-options "-Xmx100g -Xms100g" \
VariantRecalibrator \
-V ${sites_only_variant_filtered_vcf} \
-O ${recalibration_filename} \
--tranchesFile ${tranches_filename} \
-allPoly \
--tranches-file ${tranches_filename} \
--trust-all-polymorphic \
-tranche ${sep=' -tranche ' recalibration_tranche_values} \
-an ${sep=' -an ' recalibration_annotation_values} \
-mode SNP \
-sampleEvery ${downsampleFactor} \
--output_model ${model_report_filename} \
--maxGaussians 6 \
--sample-every-Nth-variant ${downsampleFactor} \
--output-model ${model_report_filename} \
--max-gaussians 6 \
-resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
-resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
-resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
Expand All @@ -514,10 +538,10 @@ task SNPsVariantRecalibratorCreateModel {
}
}

task SNPsVariantRecalibratorScattered {
task SNPsVariantRecalibrator {
String recalibration_filename
String tranches_filename
File model_report
File? model_report

Array[String] recalibration_tranche_values
Array[String] recalibration_annotation_values
Expand All @@ -537,18 +561,17 @@ task SNPsVariantRecalibratorScattered {
Int disk_size

command {
/usr/gitc/gatk-launch --javaOptions "-Xmx3g -Xms3g" \
/usr/gitc/gatk --java-options "-Xmx3g -Xms3g" \
VariantRecalibrator \
-V ${sites_only_variant_filtered_vcf} \
-O ${recalibration_filename} \
--tranchesFile ${tranches_filename} \
-allPoly \
--tranches-file ${tranches_filename} \
--trust-all-polymorphic \
-tranche ${sep=' -tranche ' recalibration_tranche_values} \
-an ${sep=' -an ' recalibration_annotation_values} \
-mode SNP \
--input_model ${model_report} \
-scatterTranches \
--maxGaussians 6 \
${"--input-model " + model_report + " --output-tranches-for-scatter "} \
--max-gaussians 6 \
-resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
-resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
-resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
Expand Down Expand Up @@ -594,7 +617,7 @@ task GatherTranches {

cat ${input_fofn} | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list

/usr/gitc/gatk-launch --javaOptions "-Xmx6g -Xms6g" \
/gatk/gatk --java-options "-Xmx6g -Xms6g" \
GatherTranches \
--input inputs.list \
--output ${output_filename}
Expand All @@ -604,6 +627,7 @@ task GatherTranches {
cpu: "2"
disks: "local-disk " + disk_size + " HDD"
preemptible: 5
docker: "broadinstitute/gatk:4.0.1.1"
}
output {
File tranches = "${output_filename}"
Expand All @@ -629,24 +653,24 @@ task ApplyRecalibration {
command {
set -e

/usr/gitc/gatk-launch --javaOptions "-Xmx5g -Xms5g" \
/usr/gitc/gatk --java-options "-Xmx5g -Xms5g" \
ApplyVQSR \
-O tmp.indel.recalibrated.vcf \
-V ${input_vcf} \
--recalFile ${indels_recalibration} \
-tranchesFile ${indels_tranches} \
-ts_filter_level ${indel_filter_level} \
--createOutputVariantIndex true \
--recal-file ${indels_recalibration} \
--tranches-file ${indels_tranches} \
--truth-sensitivity-filter-level ${indel_filter_level} \
--create-output-variant-index true \
-mode INDEL
/usr/gitc/gatk-launch --javaOptions "-Xmx5g -Xms5g" \

/usr/gitc/gatk --java-options "-Xmx5g -Xms5g" \
ApplyVQSR \
-O ${recalibrated_vcf_filename} \
-V tmp.indel.recalibrated.vcf \
--recalFile ${snps_recalibration} \
-tranchesFile ${snps_tranches} \
-ts_filter_level ${snp_filter_level} \
--createOutputVariantIndex true \
--recal-file ${snps_recalibration} \
--tranches-file ${snps_tranches} \
--truth-sensitivity-filter-level ${snp_filter_level} \
--create-output-variant-index true \
-mode SNP
}
runtime {
Expand All @@ -673,11 +697,13 @@ task GatherVcfs {
# Now using NIO to localize the vcfs but the input file must have a ".list" extension
mv ${input_vcfs_fofn} inputs.list

# ignoreSafetyChecks make a big performance difference so we include it in our invocation
/usr/gitc/gatk-launch --javaOptions "-Xmx6g -Xms6g" \
# --ignore-safety-checks makes a big performance difference so we include it in our invocation.
# This argument disables expensive checks that the file headers contain the same set of
# genotyped samples and that files are in order by position of first record.
/usr/gitc/gatk --java-options "-Xmx6g -Xms6g" \
GatherVcfsCloud \
--ignoreSafetyChecks \
--gatherType BLOCK \
--ignore-safety-checks \
--gather-type BLOCK \
--input inputs.list \
--output ${output_vcf_name}

Expand All @@ -698,7 +724,7 @@ task GatherVcfs {
task CollectVariantCallingMetrics {
File input_vcf
File input_vcf_index

String metrics_filename_prefix
File dbsnp_vcf
File dbsnp_vcf_index
Expand Down Expand Up @@ -848,3 +874,4 @@ task DynamicallyCombineIntervals {
File output_intervals = "out.intervals"
}
}
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Indel discovery in human whole-genome sequencing (WGS) data.
- BQSR Report
- Several Summary Metrics

### joint-discovery-gatk :
### JointGenotypingWf :
The second WDL implements the joint discovery and VQSR
filtering portion of the GATK Best Practices (June 2016) for germline SNP and Indel
discovery in human whole-genome sequencing (WGS) and exome sequencing data.
Expand All @@ -39,11 +39,11 @@ discovery in human whole-genome sequencing (WGS) and exome sequencing data.
- Summary Metrics

### Software version requirements :
- GATK 4.beta.3 or later
- GATK 4.beta.5 or later for PairedEndSingleSampleWf; GATK 4.0.1.0 or later for JointGenotypingWf.
- Picard 2.x
- Samtools (see gotc docker)
- Python 2.7

Cromwell version support
- Successfully tested on v29
- Successfully tested on v31
- Does not work on versions < v23 due to output syntax

0 comments on commit 904564a

Please sign in to comment.