From 0335bfd2fa3cf83c5f81c0dc5e71ac90b1567886 Mon Sep 17 00:00:00 2001 From: lichtens Date: Fri, 15 Sep 2017 14:52:29 -0400 Subject: [PATCH] Adding tool to annotate with pair orientation info Adding testing of f1r2 and f2r1 annotations. Fixed a lot of changes. Also made an optional output required in the WDL Fixed FilterByOrientationBias to accept the case where it only has one allele. That should not happen very often, but it will just do nothing. Made OxoGReadCounts no longer annotate a null readPileup. Beginning Indel support Basic functionality to do the F1R2 annotation of Indels. Added clunky indel support and testing. Fixed indel counts. Answering PR comments. Fleshing out the variant type functionality. Added one more small test. --- .../mutect2_wdl/unsupported/mutect2_opt.wdl | 4 +- .../tools/AnnotatePairOrientation.java | 114 ++++++++++++ .../OrientationBiasFilterer.java | 3 +- .../OrientationBiasUtils.java | 1 + .../walkers/annotator/OxoGReadCounts.java | 162 +++++++++++++++++- .../GATKProtectedVariantContextUtils.java | 116 ++++++++++++- ...nnotatePairOrientationIntegrationTest.java | 132 ++++++++++++++ ...KProtectedVariantContextUtilsUnitTest.java | 49 +++++- .../tools/test_no_pair_orientation_info.vcf | 64 +++++++ .../test_no_pair_orientation_info_indels.vcf | 67 ++++++++ 10 files changed, 699 insertions(+), 13 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf diff --git a/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl b/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl index cc2885f3ad6..333ff3ce566 100755 --- a/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl +++ 
b/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl @@ -203,8 +203,8 @@ workflow Mutect2 { # select_first() fails if nothing resolves to non-null, so putting in "null" for now. File? oncotated_m2_maf = select_first([oncotate_m2.oncotated_m2_maf, "null"]) File? preadapter_detail_metrics = select_first([CollectSequencingArtifactMetrics.pre_adapter_metrics, "null"]) - File? bamout = select_first([MergeBamOuts.merged_bam_out, "null"]) - File? bamout_index = select_first([MergeBamOuts.merged_bam_out_index, "null"]) + File bamout = select_first([MergeBamOuts.merged_bam_out, "null"]) + File bamout_index = select_first([MergeBamOuts.merged_bam_out_index, "null"]) } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java b/src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java new file mode 100644 index 00000000000..258a45b60de --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java @@ -0,0 +1,114 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import 
org.broadinstitute.hellbender.engine.VariantWalker; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.tools.walkers.annotator.OxoGReadCounts; +import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine; +import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; +import org.broadinstitute.hellbender.utils.genotyper.SampleList; +import org.broadinstitute.hellbender.utils.pileup.ReadPileup; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; + +import java.io.File; +import java.util.*; + +@CommandLineProgramProperties( + summary = "(Experimental) This adds fields normally emitted by M2 to a VCF. There should never be a need to run this tool on a VCF that was produced by M2." + + "\n The output of this tool should be usable with FilterByOrientationBias." + + "\n The output of this tool only counts reads that fully overlap (and match) the variant or reference sequence (this is relevant for indels)." + + "\n IMPORTANT: This tool does not produce the exact same F1R2/F2R1 as M2, due to the nature of how M2 calls variants (using read likelihoods, whereas this tool uses a base quality filter).", + oneLineSummary = "(EXPERIMENTAL) Annotate a non-M2 VCF (using the associated tumor bam) with pair orientation fields (e.g. 
" + GATKVCFConstants.F1R2_KEY + " ).", + programGroup = VariantProgramGroup.class +) +@BetaFeature +public class AnnotatePairOrientation extends VariantWalker { + + @Argument( + doc = "Output Somatic SNP/Indel VCF file with additional annotations.", + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME) + protected File outputFile; + + public final static String CUTOFF_SHORT_NAME = "cutoff"; + public final static String CUTOFF_LONG_NAME = "min-base-quality-cutoff"; + public final static int MIN_BASE_QUALITY_DEFAULT_CUTOFF = 7; + @Argument( + doc = "Cutoff for the min base quality value(s) to count the read. These are for bases that overlap the variant.", + shortName = CUTOFF_SHORT_NAME, fullName = CUTOFF_LONG_NAME, minValue = 0, maxRecommendedValue = 20, + optional = true + ) + private int minBaseQualityCutoff = MIN_BASE_QUALITY_DEFAULT_CUTOFF; + + private VariantContextWriter vcfWriter; + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputFile); + vcfWriter.writeHeader(createVCFHeader(getHeaderForVariants(), getCommandLine())); + } + + @Override + public List getDefaultReadFilters() { + return Mutect2Engine.makeStandardMutect2ReadFilters(); + } + + @Override + public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + + final ReadPileup readPileup = GATKProtectedVariantContextUtils.getPileup(variant, readsContext); + final List updatedGenotypes = new ArrayList<>(); + + final Map sampleToReadPileup = readPileup.splitBySample(getHeaderForReads(), null); + + for (Genotype g : variant.getGenotypes()) { + final ReadPileup genotypeSamplePileup = sampleToReadPileup.get(g.getSampleName()); + final GenotypeBuilder gb = new GenotypeBuilder(g); + OxoGReadCounts.annotateSingleVariant(variant, gb, genotypeSamplePileup, minBaseQualityCutoff); + updatedGenotypes.add(gb.make()); + } + + vcfWriter.add(new 
VariantContextBuilder(variant).genotypes(updatedGenotypes).make()); + } + + @Override + public boolean requiresReads() { + return true; + } + + private static VCFHeader createVCFHeader(final VCFHeader inputVCFHeader, final String commandLine) { + Utils.nonNull(inputVCFHeader); + + // Setup header for output file + final Set headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder()); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F1R2_KEY)); + headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F2R1_KEY)); + headerLines.add(new VCFHeaderLine("command", commandLine)); + final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples()); + return new VCFHeader(headerLines, samples.asSetOfSamples()); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java index f2508827926..8a9ce4d7c12 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java @@ -14,7 +14,6 @@ import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; import org.broadinstitute.hellbender.utils.genotyper.SampleList; import org.broadinstitute.hellbender.utils.param.ParamUtils; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; import java.util.*; @@ -67,7 +66,7 @@ public static VariantContext annotateVariantContextWithPreprocessingValues(final // Get the reference allele as a String and make sure that there is only one ref allele 
and that it is length // one, which would indicate that it could be a part of a SNP/SNV final List refAlleles = alleles.stream().filter(a -> a.isReference()).map(a -> a.getBaseString()).collect(Collectors.toList()); - if (((refAlleles.size() == 1) && (refAlleles.get(0).length() == 1))) { + if (((refAlleles.size() == 1) && (refAlleles.get(0).length() == 1)) && alleles.size() > 1) { final Character refAllele = (char) refAlleles.get(0).getBytes()[0]; // Since we only look at the first alt allele on a site, we do not need a for loop over all non-ref alleles, e.g. for (int i = 1; i < alleles.size(); i++) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java index 487fa16a8dc..a15d951959b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java @@ -268,6 +268,7 @@ public static long calculateUnfilteredNonRefGenotypeCount(final List !g.isFiltered()) + .filter(g -> g.getAlleles().size() > 1) .filter(g -> !g.getAllele(0).basesMatch(g.getAllele(1))) .count(); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java index 5c5882dad16..6b762608e8c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java @@ -6,18 +6,23 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import org.apache.commons.lang.mutable.MutableInt; +import org.apache.commons.lang3.ArrayUtils; +import 
org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; import org.broadinstitute.hellbender.utils.QualityUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.hellbender.utils.pileup.PileupElement; +import org.broadinstitute.hellbender.utils.pileup.ReadPileup; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; -import java.util.Arrays; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import java.util.stream.IntStream; /** @@ -34,6 +39,8 @@ */ public final class OxoGReadCounts extends GenotypeAnnotation implements StandardMutectAnnotation { + private static final Logger logger = LogManager.getLogger(OxoGReadCounts.class); + @Override public List getKeyNames() { return Arrays.asList(GATKVCFConstants.F1R2_KEY, GATKVCFConstants.F2R1_KEY); @@ -77,6 +84,155 @@ public void annotate(final ReferenceContext refContext, gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1); } + /** + * Annotate the given variant context with the OxoG read count attributes, directly from the read pileup. + * + * This method may be slow and should be considered EXPERIMENTAL, especially with regard to indels and complex/mixed + * variants. + * + * @param vc variant context for the genotype. Necessary so that we can see all alleles. + * @param gb genotype builder to put the annotations into. + * @param readPileup pileup of the reads at this vc. Note that this pileup does not have to match the + * genotype. In other words, this tool does not check that the pileup was generated from the + * genotype sample. 
+ */ + public static void annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb, + final ReadPileup readPileup, int meanBaseQualityCutoff) { + Utils.nonNull(gb, "gb is null"); + Utils.nonNull(vc, "vc is null"); + + // Create a list of unique alleles + final List variantAllelesWithDupes = vc.getAlleles(); + final Set alleleSet = new LinkedHashSet<>(variantAllelesWithDupes); + final List variantAlleles = new ArrayList<>(alleleSet); + + // Initialize the mappings + final Map f1r2Counts = variantAlleles.stream() + .collect(Collectors.toMap(a -> a, a -> new MutableInt(0))); + + final Map f2r1Counts = variantAlleles.stream() + .collect(Collectors.toMap(a -> a, a -> new MutableInt(0))); + + final List referenceAlleles = variantAlleles.stream().filter(a -> a.isReference() && !a.isSymbolic()).collect(Collectors.toList()); + final List altAlleles = variantAlleles.stream().filter(a -> a.isNonReference() && !a.isSymbolic()).collect(Collectors.toList()); + + if (referenceAlleles.size() != 1) { + logger.warn("Number of reference alleles does not equal for VC: " + vc); + } + + // We MUST have exactly 1 non-symbolic reference allele and a read pileup, + if ((referenceAlleles.size() == 1) && (readPileup != null) && !referenceAlleles.get(0).isSymbolic()) { + final Allele referenceAllele = referenceAlleles.get(0); + Utils.stream(readPileup) + .filter(pe -> isUsableRead(pe.getRead())) + .forEach(pe -> incrementCounts(pe, f1r2Counts, f2r1Counts, referenceAllele, altAlleles, meanBaseQualityCutoff)); + } + + final int[] f1r2 = variantAlleles.stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray(); + + final int[] f2r1 = variantAlleles.stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray(); + + gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2); + gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1); + } + + /** + * If the allele is not in the count mappings, then it is not counted. No exception will be thrown + * Modifies count variables in place. 
+ * + * @param pileupElement pileup overlapping the alleles + * @param f1r2Counts a mapping of allele to f1r2 counts + * @param f2r1Counts a mapping of allele to f2r1 counts + */ + private static void incrementCounts(final PileupElement pileupElement, final Map f1r2Counts, + final Map f2r1Counts, final Allele referenceAllele, + final List altAlleles, int minBaseQualityCutoff) { + + final Map countMap = isF2R1(pileupElement.getRead()) ? f2r1Counts : f1r2Counts; + + final boolean isRef = referenceAllele.basesMatch(getBasesForAlleleInRead(pileupElement, referenceAllele)) + && !pileupElement.isBeforeDeletionStart() && !pileupElement.isBeforeInsertion(); + + Allele pileupAllele = null; + if (!isRef) { + + for (Allele altAllele : altAlleles) { + final VariantContext.Type variantType = GATKProtectedVariantContextUtils.typeOfVariant(referenceAllele, altAllele); + + if (variantType == VariantContext.Type.INDEL) { + if (isIndelInThePileupElement(pileupElement, referenceAllele, altAllele)) { + pileupAllele = altAllele; + } + + } else if (variantType == VariantContext.Type.MNP || variantType == VariantContext.Type.SNP) { + if (altAllele.basesMatch(getBasesForAlleleInRead(pileupElement, altAllele))) { + pileupAllele = altAllele; + } + } + + } + + } else { + pileupAllele = referenceAllele; + } + + if (pileupAllele == null) { + return; + } + + if (getMinBaseQualityForAlleleInRead(pileupElement, pileupAllele) < minBaseQualityCutoff) { + return; + } + + if (countMap.containsKey(pileupAllele)) { + countMap.get(pileupAllele).increment(); + } + } + + private static boolean isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele) { + boolean isAltAlleleInThePileup = false; + + // Check insertion + if (pileupElement.isBeforeInsertion()) { + final int insertionLength = pileupElement.getLengthOfImmediatelyFollowingIndel(); + if (insertionLength == pileupElement.getLengthOfImmediatelyFollowingIndel()) { + final String insertionBases 
= pileupElement.getBasesOfImmediatelyFollowingInsertion(); + // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB] + if (insertionBases != null) { + final boolean isMatch = Allele.extend(referenceAllele, insertionBases.getBytes()).basesMatch(altAllele); + if (isMatch) { + isAltAlleleInThePileup = true; + } + } + } + } + + // Check deletion + if (pileupElement.isBeforeDeletionStart()) { + final int deletionLength = pileupElement.getLengthOfImmediatelyFollowingIndel(); + if ((referenceAllele.getBases().length - altAllele.getBases().length) == deletionLength) { + isAltAlleleInThePileup = true; + } + } + return isAltAlleleInThePileup; + } + + private static byte[] getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele) { + return ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length); + } + + private static int getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele) { + final byte[] alleleBases = allele.getBases(); + final byte[] pileupBaseQualities = ArrayUtils.subarray(pileupElement.getRead().getBaseQualities(), pileupElement.getOffset(), pileupElement.getOffset() + alleleBases.length); + final OptionalInt minQuality = IntStream.range(0, pileupBaseQualities.length).map(i -> Byte.toUnsignedInt(pileupBaseQualities[i])).min(); + if (!minQuality.isPresent()) { + return -1; + } else { + return minQuality.getAsInt(); + } + } + + protected static boolean isUsableRead(final GATKRead read) { return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java index 1c5442bf3a2..5e3d62cf0bb 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java @@ -1,11 +1,11 @@ package org.broadinstitute.hellbender.utils; import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; -import org.broadinstitute.hellbender.engine.ReadsContext; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine; @@ -14,7 +14,6 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import java.lang.reflect.Array; -import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.function.Function; @@ -22,6 +21,7 @@ import java.util.function.ToDoubleFunction; import java.util.function.ToIntFunction; import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -290,4 +290,116 @@ public static ReadPileup getPileup(final Locatable loc, final Iterable return new ReadPileup(loc, pile); } + + /** This is lifted directly from htsjdk with some minor modifications! However, it is a private method there. + * + * This method cannot return {@link VariantContext.Type} MIXED + * + * Please see https://github.com/samtools/htsjdk/issues/999 + * + *

<p>Here are some cases that will not work properly, though this may not be an issue in practice:</p>
+ * <ul>
+ * <li>"CGT" --> "GGA" this will be an MNP, but really it is two SNPs.</li>
+ * <li>Spanning deletions for alternate will show as {@link VariantContext.Type} NO_VARIATION</li>
+ * <li>Spanning deletions for reference will throw an exception.</li>
+ * <li>Reference that is symbolic will throw an exception.</li>
+ * </ul>
+ * + * @param ref reference allele. Never {@code null} + * @param allele alternate allele to compare. Never {@code null} + * @return + */ + public static VariantContext.Type typeOfVariant(final Allele ref, final Allele allele) { + Utils.nonNull(ref); + Utils.nonNull(allele); + + if ( ref.isSymbolic() ) + throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele"); + + if ( allele.isSymbolic() ) + return VariantContext.Type.SYMBOLIC; + + if (allele.equals(Allele.SPAN_DEL)) { + return VariantContext.Type.NO_VARIATION; + } + + if ( ref.equals(Allele.SPAN_DEL) ) + throw new IllegalStateException("Unexpected error: encountered a record with a spanning deletion reference allele"); + + if ( ref.length() == allele.length() ) { + if (ref.basesMatch(allele)) { + return VariantContext.Type.NO_VARIATION; + } else if ( allele.length() == 1 ) + return VariantContext.Type.SNP; + + // If the two alleles are the same length and only differ by one base, then still a SNP. + else if (IntStream.range(0, ref.length()).filter(i -> ref.getBases()[i] != allele.getBases()[i]).count() == 1) { + return VariantContext.Type.SNP; + } else + return VariantContext.Type.MNP; + } + + // Important note: previously we were checking that one allele is the prefix of the other. However, that's not an + // appropriate check as can be seen from the following example: + // REF = CTTA and ALT = C,CT,CA + // This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check. + // In truth, it should be absolutely impossible to return a MIXED type from this method because it simply + // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type + // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point + // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. 
+ + return VariantContext.Type.INDEL; + + // old incorrect logic: + // if (oneIsPrefixOfOther(ref, allele)) + // return Type.INDEL; + // else + // return Type.MIXED; + } + + /** + * This method should only be run on variants that are known to be indels. See {@link GATKProtectedVariantContextUtils#typeOfVariant} + * + *

<p>Here are some cases that will not work properly, though this may not be an issue in practice:</p>
+ * <ul>
+ * <li>"CT" --> "CATT" this is really just a simple AT insertion, but this will show up as complex.</li>
+ * </ul>
+ * @param ref reference allele. Never {@code null} + * @param allele alternate allele to compare. Never {@code null} + * @return true if the indel is complex (for example, also includes a SNP), false if simple indel. If the input alleles define a variant that is not + * an indel, then the behavior of this method is undefined (though will probably just return false). + * + */ + public static boolean isComplexIndel(final Allele ref, final Allele allele) { + + Utils.nonNull(ref); + Utils.nonNull(allele); + + // Symbolic --> false + if (ref.isSymbolic() || (ref.length() == 0)) { + return false; + } + if (allele.isSymbolic() || (allele.length() == 0)) { + return false; + } + + // SNP, MNP, or no variation --> false + if ( ref.length() == allele.length() ) { + return false; + } + + // obvious simple del or simple indel + if ((allele.length() == 1) || (ref.length() == 1)) { + return false; + } + + // If the ref starts with the alt or vice versa, this is still simple. + if (allele.length() > ref.length()) { + final boolean isAltStartsWithRef = IntStream.range(0, ref.length()).allMatch(i -> ref.getBases()[i] == allele.getBases()[i]); + return !isAltStartsWithRef; + } else { + final boolean isRefStartsWithAlt = IntStream.range(0, allele.length()).allMatch(i -> ref.getBases()[i] == allele.getBases()[i]); + return !isRefStartsWithAlt; + } + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java new file mode 100644 index 00000000000..70e4af93f1c --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java @@ -0,0 +1,132 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import 
org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.nd4j.linalg.io.StringUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +public class AnnotatePairOrientationIntegrationTest extends CommandLineProgramTest { + private static final File TEST_VCF_DIR = new File("src/test/resources/org/broadinstitute/hellbender/tools/"); + private static final File TEST_BAM_DIR = new File("src/test/resources/large/mutect/dream_synthetic_bams/"); + + final static String TEST_VCF = TEST_VCF_DIR.getAbsolutePath() + "/test_no_pair_orientation_info.vcf"; + final static String TEST_VCF_INDELS = TEST_VCF_DIR.getAbsolutePath() + "/test_no_pair_orientation_info_indels.vcf"; + final static String TEST_BAM_TUMOR = TEST_BAM_DIR.getAbsolutePath() + "/tumor_1.bam"; + final static String TEST_BAM_NORMAL = TEST_BAM_DIR.getAbsolutePath() + "/normal_1.bam"; + final static String TEST_BAM_TUMOR_INDELS = TEST_BAM_DIR.getAbsolutePath() + "/tumor_3.bam"; + final static String TEST_BAM_NORMAL_INDELS = TEST_BAM_DIR.getAbsolutePath() + "/normal_3.bam"; + + // TODO: Test with multiallelics + // TODO: Test with multiallelics and symbolic at the same time + // TODO: Test with symbolic + // TODO: Test with information missing from the VCF and make sure appropriate exception is thrown. + // TODO: Test with more cutoff variables + // TODO: Once above five TODOs are done (at least), AnnotatePairOrientation can be taken out of Experimental status. 
+ + + @Test + public void testBasicIndels() throws IOException { + final File outputFile = File.createTempFile("ob_indel_annotate_", ".vcf"); + final List arguments = new ArrayList<>(); + arguments.add("-" + StandardArgumentDefinitions.VARIANT_SHORT_NAME); + arguments.add(TEST_VCF_INDELS); + arguments.add("-" + StandardArgumentDefinitions.INPUT_SHORT_NAME); + arguments.add(TEST_BAM_TUMOR_INDELS); + arguments.add("-" + StandardArgumentDefinitions.INPUT_SHORT_NAME); + arguments.add(TEST_BAM_NORMAL_INDELS); + + + arguments.add("-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME); + arguments.add(outputFile.getAbsolutePath()); + runCommandLine(arguments); + + // Ground truth from manual review in IGV + final String[][] gtF1R2F2R1 = {{"14,0", "21,0", "11,3", "13,4"},{"34,0","27,0", "10,12","14,11"}, + {"14,0", "14,0", "18,1", "17,3"},{"24,0","15,0", "19,7","22,2"}}; + + Assert.assertTrue(outputFile.exists()); + final List variantContexts = getVariantContextsFromFile(outputFile); + + assertOrientationAnnotationValues(variantContexts, gtF1R2F2R1, "G15512.prenormal.sorted", + "IS3.snv.indel.sv"); + } + + /** + * Only tests SNVs + * @throws IOException + */ + @Test + public void testBasicRun() throws IOException{ + final File outputFile = File.createTempFile("ob_annotate_", ".vcf"); + final List arguments = new ArrayList<>(); + arguments.add("-" + StandardArgumentDefinitions.VARIANT_SHORT_NAME); + arguments.add(TEST_VCF); + arguments.add("-" + StandardArgumentDefinitions.INPUT_SHORT_NAME); + arguments.add(TEST_BAM_TUMOR); + arguments.add("-" + StandardArgumentDefinitions.INPUT_SHORT_NAME); + arguments.add(TEST_BAM_NORMAL); + + arguments.add("-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME); + arguments.add(outputFile.getAbsolutePath()); + runCommandLine(arguments); + + Assert.assertTrue(outputFile.exists()); + final List variantContexts = getVariantContextsFromFile(outputFile); + + // Ground truth from manual review in IGV + final String[][] gtF1R2F2R1 = {{"22,0", 
"11,0", "9,9", "8,4"},{"11,0","15,0", "11,8","9,10"}}; + + assertOrientationAnnotationValues(variantContexts, gtF1R2F2R1, "synthetic.challenge.set1.normal", + "synthetic.challenge.set1.tumor"); + } + + private List getVariantContextsFromFile(File vcfFile) { + final List variantContexts = new ArrayList<>(); + final FeatureDataSource featureDataSource = new FeatureDataSource<>(vcfFile); + for (final VariantContext vc : featureDataSource) { + variantContexts.add(vc); + } + return variantContexts; + } + + private void assertOrientationAnnotationValues(final List variantContexts, final String[][] gtF1R2F2R1, + final String normalSampleName, final String tumorSampleName) { + final List annotations = new ArrayList<>(); + annotations.add(GATKVCFConstants.F1R2_KEY); + annotations.add(GATKVCFConstants.F2R1_KEY); + + for (int i = 0; i < variantContexts.size(); i++) { + final VariantContext vc = variantContexts.get(i); + final Genotype normalGenotype = vc.getGenotype(normalSampleName); + Assert.assertTrue(normalGenotype.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY)); + Assert.assertTrue(normalGenotype.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY)); + + for (int j = 0; j < annotations.size(); j ++) { + final String annotation = annotations.get(j); + final String normalF1r2 = normalGenotype.getExtendedAttribute(annotation).toString(); + Assert.assertEquals(normalF1r2, gtF1R2F2R1[i][j]); + } + + final Genotype tumorGenotype = vc.getGenotype(tumorSampleName); + Assert.assertTrue(tumorGenotype.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY)); + Assert.assertTrue(tumorGenotype.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY)); + for (int j = 0; j < annotations.size(); j ++) { + final String annotation = annotations.get(j); + final String tumorF1r2 = tumorGenotype.getExtendedAttribute(annotation).toString(); + Assert.assertNotNull(StringUtils.split(tumorF1r2, ",")); + Assert.assertEquals(StringUtils.split(tumorF1r2, ",").length, 2); + Assert.assertEquals(tumorF1r2, 
gtF1R2F2R1[i][j+annotations.size()]); + } + } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java index b37ba5698e6..307a5c6b8ed 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java @@ -2,18 +2,18 @@ import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.util.Locatable; -import org.broadinstitute.hellbender.engine.ReadsContext; -import org.broadinstitute.hellbender.engine.ReadsDataSource; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.hellbender.utils.pileup.ReadPileup; import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Arrays; -import static org.testng.Assert.*; - /** * Created by David Benjamin on 2/15/17. 
*/ @@ -55,4 +55,45 @@ public void testGetPileup() { Assert.assertEquals(counts, new int[]{1, 1, 0, 0}); } + + @Test(dataProvider = "variantTypes") + public void testVariantTypesAndIsComplex(final String ref, final String alt, final VariantContext.Type gtType, boolean isComplexIndel) { + Assert.assertEquals(GATKProtectedVariantContextUtils.typeOfVariant(Allele.create(ref), Allele.create(alt)), gtType); + Assert.assertEquals(GATKProtectedVariantContextUtils.isComplexIndel(Allele.create(ref), Allele.create(alt)), isComplexIndel); + } + @Test(expectedExceptions = IllegalStateException.class) + public void testSymbolicRef() { + GATKProtectedVariantContextUtils.typeOfVariant(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE, Allele.create("C")); + } + + @DataProvider(name = "variantTypes") + public Object[][] variantTypes() { + return new Object[][]{ + // ref, alt, type, isComplex? + {"CCTTGGCTTATTCCA", "C", VariantContext.Type.INDEL, false}, + {"C", "CCTTGGCTTATTCCA", VariantContext.Type.INDEL, false}, + {"ACTAG", "A", VariantContext.Type.INDEL, false}, + {"ATT", "AT", VariantContext.Type.INDEL, false}, + {"AT", "ATT", VariantContext.Type.INDEL, false}, + {"CT", "CAGG", VariantContext.Type.INDEL, true}, + {"CTTT", "CAGG", VariantContext.Type.MNP, false}, + {"CTTT", "CAGGG", VariantContext.Type.INDEL, true}, + {"T", "T", VariantContext.Type.NO_VARIATION, false}, + {"CTAG", "CTAG", VariantContext.Type.NO_VARIATION, false}, + {"A", "AAGAAGCATGC", VariantContext.Type.INDEL, false}, + {"A", "C", VariantContext.Type.SNP, false}, + {"AG", "CA", VariantContext.Type.MNP, false}, + {"AGAAGG", "CATTCC", VariantContext.Type.MNP, false}, + {"GC", "GA", VariantContext.Type.SNP, false}, + {"GA", "", VariantContext.Type.SYMBOLIC, false}, + {"GA", "*", VariantContext.Type.NO_VARIATION, false}, + + // There are two MNPs here + {"AGAAGG", "CATACC", VariantContext.Type.MNP, false}, + + // Note that this is technically a simple AT insertion, but the isComplex cannot handle this properly. 
+ {"CT", "CATT", VariantContext.Type.INDEL, true}, + }; + } + } \ No newline at end of file diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf new file mode 100644 index 00000000000..f08787da5a7 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf @@ -0,0 +1,64 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##Mutect Version=2.1-beta +##command=FilterByOrientationBias --output synthetic.challenge.set1.tumor-vs-synthetic.challenge.set1.normal-filtered.vcf --preAdapterDetailFile /cromwell-executions/Mutect2_Multi/9bd645eb-e2c9-41f6-88d2-a61013b9e65c/call-Mutect2/shard-0/Mutect2/ead27398-13e8-48fc-9050-67633fe2d6d2/call-Filter/inputs/home/lichtens/test_bamout/cromwell-executions/Mutect2_Multi/9bd645eb-e2c9-41f6-88d2-a61013b9e65c/call-Mutect2/shard-0/Mutect2/ead27398-13e8-48fc-9050-67633fe2d6d2/call-CollectSequencingArtifactMetrics/execution/gatk.pre_adapter_detail_metrics --artifactModes G/T --artifactModes C/T --variant filtered.vcf --interval_set_rule UNION --interval_padding 0 --interval_exclusion_padding 0 --interval_merging_rule ALL --readValidationStringency SILENT --secondsBetweenProgressUpdates 10.0 --disableSequenceDictionaryValidation false --createOutputBamIndex true --createOutputBamMD5 false --createOutputVariantIndex true --createOutputVariantMD5 false 
--lenient false --addOutputSAMProgramRecord true --addOutputVCFCommandLine true --cloudPrefetchBuffer 40 --cloudIndexPrefetchBuffer -1 --disableBamIndexCaching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use_jdk_deflater false --use_jdk_inflater false --gcs_max_retries 20 --disableToolDefaultReadFilters false +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=synthetic.challenge.set1.normal +##orientation_bias_artifact_modes= +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=synthetic.challenge.set1.tumor +##Manually removed the F1R2 and F2R1 from FORMAT and header +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT synthetic.challenge.set1.normal synthetic.challenge.set1.tumor +20 577548 . C G . PASS DP=55;ECNT=1;NLOD=8.72;N_ART_LOD=-1.483e+00;POP_AF=1.000e-03;P_GERMLINE=-5.423e+00;TLOD=25.47 GT:AD:AF:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:OBF:OBP:OBQ:OBQRC:SA_MAP_AF:SA_POST_PROB 0/0:29,0:0.017:0:339,0:0:0:false:false 0/1:14,10:0.420:30:314,325:60:20:false:false:.:.:45.99:100.00:0.394,0.384,0.417:0.021,0.024,0.955 +20 1838610 . T A . 
PASS DP=64;ECNT=1;NLOD=7.82;N_ART_LOD=-1.438e+00;POP_AF=1.000e-03;P_GERMLINE=-4.521e+00;TLOD=50.09 GT:AD:AF:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:OBF:OBP:OBQ:OBQRC:SA_MAP_AF:SA_POST_PROB 0/0:26,0:0.019:0:338,0:0:0:false:false 0/1:20,17:0.460:32:336,340:60:20:false:false:.:.:100.00:49.80:0.444,0.404,0.459:0.017,0.032,0.951 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf new file mode 100644 index 00000000000..64483991374 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf @@ -0,0 +1,67 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##Mutect Version=2.1-beta +##command=FilterByOrientationBias --output IS3.snv.indel.sv-vs-G15512.prenormal.sorted-filtered.vcf --preAdapterDetailFile /cromwell-executions/Mutect2_Multi/91355e3c-342f-4c9d-8ea6-e70a61faca94/call-Mutect2/shard-2/Mutect2/a727615d-d7b1-4f3b-8ed0-48f4c1920eca/call-Filter/inputs/home/lichtens/test_bamout/cromwell-executions/Mutect2_Multi/91355e3c-342f-4c9d-8ea6-e70a61faca94/call-Mutect2/shard-2/Mutect2/a727615d-d7b1-4f3b-8ed0-48f4c1920eca/call-CollectSequencingArtifactMetrics/execution/gatk.pre_adapter_detail_metrics --artifactModes G/T --artifactModes C/T --variant filtered.vcf --interval_set_rule UNION --interval_padding 0 --interval_exclusion_padding 0 
--interval_merging_rule ALL --readValidationStringency SILENT --secondsBetweenProgressUpdates 10.0 --disableSequenceDictionaryValidation false --createOutputBamIndex true --createOutputBamMD5 false --createOutputVariantIndex true --createOutputVariantMD5 false --lenient false --addOutputSAMProgramRecord true --addOutputVCFCommandLine true --cloudPrefetchBuffer 40 --cloudIndexPrefetchBuffer -1 --disableBamIndexCaching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use_jdk_deflater false --use_jdk_inflater false --gcs_max_retries 20 --disableToolDefaultReadFilters false +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=G15512.prenormal.sorted +##orientation_bias_artifact_modes= +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=IS3.snv.indel.sv +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G15512.prenormal.sorted IS3.snv.indel.sv +20 1330646 . CCTTGGCTTATTCCA C . PASS DP=85;ECNT=1;NLOD=14.42;N_ART_LOD=-1.693e+00;POP_AF=1.000e-03;P_GERMLINE=-1.112e+01;TLOD=35.03 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:48,0:0.011:19,0:29,0:0:348,0:0:0:false:false 0/1:26,10:0.278:11,5:15,5:29:349,348:60:20:false:false:0.253,0.253,0.278:0.017,0.021,0.962 +20 2038732 . ACTAG A . PASS DP=123;ECNT=1;NLOD=19.82;N_ART_LOD=-1.842e+00;POP_AF=1.000e-03;P_GERMLINE=-1.652e+01;TLOD=96.69 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:41,0:0.043:21,0:20,0:0:345,0:0:0:false:false 0/1:35,8:0.193:23,3:12,5:29:345,338:60:33:false:false:0.172,0.141,0.186:9.465e-03,0.036,0.955 +20 3076247 . AT ATT . 
t_lod DP=72;ECNT=2;NLOD=7.02;N_ART_LOD=-1.161e+00;POP_AF=1.000e-03;P_GERMLINE=-5.885e+00;RPA=14,15;RU=T;STR;TLOD=3.83 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:24,0:0.140:13,0,0:9,2,0:0:342,0:0:0:false:false 0/1:25,4:0.243:13,1,0:10,1,4:30:341,334:60:24:false:false:0.121,0.111,0.138:0.014,0.019,0.967 +20 3076299 . A AAGAAGCATGC . PASS DP=90;ECNT=2;NLOD=10.86;N_ART_LOD=-1.579e+00;POP_AF=1.000e-03;P_GERMLINE=-7.558e+00;TLOD=41.31 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:36,0:0.026:22,0:14,0:0:340,0:0:0:false:false 0/1:37,12:0.259:17,8:20,4:29:343,342:60:31:false:false:0.192,0.232,0.245:0.035,0.011,0.955