Skip to content

Commit

Permalink
Use Number=. for VCF genotype reserved key FT, fixes #1535.
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh authored and fnothaft committed May 31, 2017
1 parent c5b68d7 commit b7762c2
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class TransformGenotypesSuite extends ADAMFunSuite {
Array(intermediatePath, actualPath, "-sort_on_save", "-single")
).run(sc)

- checkFiles(actualPath, expectedPath)
+ checkFiles(expectedPath, actualPath)
}

sparkTest("save a lexicographically sorted file") {
Expand All @@ -52,6 +52,6 @@ class TransformGenotypesSuite extends ADAMFunSuite {
Array(intermediatePath, actualPath, "-sort_lexicographically_on_save", "-single")
).run(sc)

- checkFiles(actualPath, expectedPath)
+ checkFiles(expectedPath, actualPath)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ class TransformVariantsSuite extends ADAMFunSuite {
sparkTest("save a file sorted by contig index") {
val inputPath = copyResource("random.vcf")
val intermediatePath = tmpFile("variants.adam")
- val actualPath = tmpFile("sorted.vcf")
- val expectedPath = copyResource("sorted.vcf")
+ val actualPath = tmpFile("sorted-variants.vcf")
+ val expectedPath = copyResource("sorted-variants.vcf")

TransformVariants(
Array(inputPath, intermediatePath)
Expand All @@ -35,14 +35,14 @@ class TransformVariantsSuite extends ADAMFunSuite {
Array(intermediatePath, actualPath, "-sort_on_save", "-single")
).run(sc)

- checkFiles(actualPath, expectedPath)
+ checkFiles(expectedPath, actualPath)
}

sparkTest("save a lexicographically sorted file") {
val inputPath = copyResource("random.vcf")
val intermediatePath = tmpFile("variants.lex.adam")
- val actualPath = tmpFile("sorted.lex.vcf")
- val expectedPath = copyResource("sorted.lex.vcf")
+ val actualPath = tmpFile("sorted-variants.lex.vcf")
+ val expectedPath = copyResource("sorted-variants.lex.vcf")

TransformVariants(
Array(inputPath, intermediatePath)
Expand All @@ -52,6 +52,6 @@ class TransformVariantsSuite extends ADAMFunSuite {
Array(intermediatePath, actualPath, "-sort_lexicographically_on_save", "-single")
).run(sc)

- checkFiles(actualPath, expectedPath)
+ checkFiles(expectedPath, actualPath)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,7 @@ object DefaultHeaderLines {
VCFHeaderLineType.Float,
"Read-backed phasing quality")
lazy val genotypeFilter = new VCFFormatHeaderLine("FT",
- //VCFHeaderLineCount.UNBOUNDED,
- 1,
+ VCFHeaderLineCount.UNBOUNDED,
VCFHeaderLineType.String,
"Genotype-level filter")
lazy val fisherStrand = new VCFFormatHeaderLine("FS",
Expand Down
2 changes: 1 addition & 1 deletion adam-core/src/test/resources/random.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
##FILTER=<ID=sx,Description="Heterozygous sex chromosomes in male sample, or Y chromosome in female sample">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
- ##FORMAT=<ID=FT,Number=1,Type=String,Description="Genotype-level filter">
+ ##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
Expand Down
74 changes: 74 additions & 0 deletions adam-core/src/test/resources/sorted-variants.lex.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
##fileformat=VCFv4.2
##FILTER=<ID=IndelFS,Description="FS > 200.0">
##FILTER=<ID=IndelQD,Description="QD < 2.0">
##FILTER=<ID=IndelReadPosRankSum,Description="ReadPosRankSum < -20.0">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheSNP99.50to99.60,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -0.5377 <= x < -0.1787">
##FILTER=<ID=VQSRTrancheSNP99.60to99.70,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1.0634 <= x < -0.5377">
##FILTER=<ID=VQSRTrancheSNP99.70to99.80,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1.7119 <= x < -1.0634">
##FILTER=<ID=VQSRTrancheSNP99.80to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2.3301 <= x < -1.7119">
##FILTER=<ID=VQSRTrancheSNP99.90to99.95,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -2.8169 <= x < -2.3301">
##FILTER=<ID=VQSRTrancheSNP99.95to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -1918.1929">
##FILTER=<ID=VQSRTrancheSNP99.95to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -1918.1929 <= x < -2.8169">
##FILTER=<ID=dp,Description="Insufficient read depth">
##FILTER=<ID=gq,Description="Insufficient genotype quality">
##FILTER=<ID=rd,Description="Insufficient supporting reads">
##FILTER=<ID=sx,Description="Heterozygous sex chromosomes in male sample, or Y chromosome in female sample">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##FORMAT=<ID=FT,Number=.,Type=String,Description="Genotype-level filter">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the gVCF block">
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Root mean square (RMS) mapping quality">
##FORMAT=<ID=MQ0,Number=1,Type=Float,Description="Total number of reads with mapping quality=0">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PQ,Number=1,Type=Float,Description="Read-backed phasing quality">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set ID">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=CombineVariants,Version=2.7-63-gc434461,Date="Mon Oct 14 15:08:05 EDT 2013",Epoch=1381777685067,CommandLineOptions="analysis_type=CombineVariants input_file=[] read_buffer_size=null phone_home=NO_ET gatk_key=/packages/gatk/1.5-21-g979a84a/src/eugene.fluder_mssm.edu.key tag=NA read_filter=[] intervals=[/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/.queueScatterGather/.qlog/r1-1-1.combined.rawGT.vcf.combine-sg/temp_01_of_20/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/projects/ngs/resources/gatk/2.3/ucsc.hg19.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false logging_level=INFO log_to_file=null help=false version=false variant=[(RodBinding name=SNP source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.recal.SNP.vcf), (RodBinding name=Indel source=/gs01/projects/ngs/validation/exome/CEPHTrio/2.7/r1-1-1/r1-1-1.filt.IND.vcf)] 
out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false setKey=null assumeIdenticalSamples=true minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
##INFO=<ID=1000G,Number=0,Type=Flag,Description="Membership in 1000 Genomes">
##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral allele">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count">
##INFO=<ID=AD,Number=R,Type=Integer,Description="Total read depths for each allele">
##INFO=<ID=ADF,Number=R,Type=Integer,Description="Read depths for each allele on the forward strand">
##INFO=<ID=ADR,Number=R,Type=Integer,Description="Read depths for each allele on the reverse strand">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency for each allele">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=CIGAR,Number=A,Type=String,Description="Cigar string describing how to align alternate alleles to the reference allele">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DB,Number=0,Type=Flag,Description="Membership in dbSNP">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=H2,Number=0,Type=Flag,Description="Membership in HapMap2">
##INFO=<ID=H3,Number=0,Type=Flag,Description="Membership in HapMap3">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
##INFO=<ID=VALIDATED,Number=0,Type=Flag,Description="Validated by follow-up experiment">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=249250621>
##contig=<ID=13,length=249250621>
#CHROM POS ID REF ALT QUAL FILTER INFO
1 14397 . CTGT C . IndelQD AC=2;AF=0.333;AN=6;BaseQRankSum=1.8;ClippingRankSum=0.138;DP=69;FS=7.786;MLEAC=2;MLEAF=0.333;MQ=26.84;MQ0=0;MQRankSum=-1.906;QD=1.55;ReadPosRankSum=0.384
1 14522 . G A . VQSRTrancheSNP99.95to100.00 AC=2;AF=0.333;AN=6;BaseQRankSum=2.044;ClippingRankSum=-2.196;DP=48;FS=13.179;MLEAC=2;MLEAF=0.333;MQ=25.89;MQ0=0;MQRankSum=-0.063;QD=8.87;ReadPosRankSum=0.952;VQSLOD=-3.333;culprit=MQ
1 63735 rs201888535 CCTA C . PASS AC=1;AF=0.167;AN=6;BaseQRankSum=1.138;ClippingRankSum=0.448;DB;DP=176;FS=13.597;MLEAC=1;MLEAF=0.167;MQ=31.06;MQ0=0;MQRankSum=0.636;QD=9.98;ReadPosRankSum=-1.18
13 752721 rs3131972 A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD
13 752791 . A G . PASS AC=6;AF=1.0;AN=6;DB;DP=69;FS=0.0;MLEAC=6;MLEAF=1.0;MQ=60.0;MQ0=0;POSITIVE_TRAIN_SITE;QD=31.67;VQSLOD=18.94;culprit=QD
2 19190 . GC G . PASS AC=3;AF=0.5;AN=6;BaseQRankSum=4.157;ClippingRankSum=3.666;DP=74;FS=37.037;MLEAC=3;MLEAF=0.5;MQ=22.26;MQ0=0;MQRankSum=0.195;QD=16.04;ReadPosRankSum=-4.072
Loading

0 comments on commit b7762c2

Please sign in to comment.