diff --git a/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl b/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl index cc2885f3ad6..333ff3ce566 100755 --- a/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl +++ b/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl @@ -203,8 +203,8 @@ workflow Mutect2 { # select_first() fails if nothing resolves to non-null, so putting in "null" for now. File? oncotated_m2_maf = select_first([oncotate_m2.oncotated_m2_maf, "null"]) File? preadapter_detail_metrics = select_first([CollectSequencingArtifactMetrics.pre_adapter_metrics, "null"]) - File? bamout = select_first([MergeBamOuts.merged_bam_out, "null"]) - File? bamout_index = select_first([MergeBamOuts.merged_bam_out_index, "null"]) + File bamout = select_first([MergeBamOuts.merged_bam_out, "null"]) + File bamout_index = select_first([MergeBamOuts.merged_bam_out_index, "null"]) } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java b/src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java new file mode 100644 index 00000000000..258a45b60de --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientation.java @@ -0,0 +1,114 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup; +import 
org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.engine.VariantWalker; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.tools.walkers.annotator.OxoGReadCounts; +import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine; +import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; +import org.broadinstitute.hellbender.utils.genotyper.SampleList; +import org.broadinstitute.hellbender.utils.pileup.ReadPileup; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; + +import java.io.File; +import java.util.*;
+
+/**
+ * Variant walker that adds the F1R2 and F2R1 genotype annotations to every genotype of an input VCF, using
+ * counts taken directly from the read pileup at each variant.
+ *
+ * Reads are filtered with the standard Mutect2 read filters, and a read is only counted when the lowest base
+ * quality over the bases spanning the matched allele is at least {@code min-base-quality-cutoff}.
+ */
+@CommandLineProgramProperties(
+        summary = "(Experimental) This adds fields normally emitted by M2 to a VCF. There should never be a need to run this tool on a VCF that was produced by M2." +
+                "\n The output of this tool should be usable with FilterByOrientationBias." +
+                "\n The output of this tool only counts reads that fully overlap (and match) the variant or reference sequence (this is relevant for indels)." +
+                "\n IMPORTANT: This tool does not produce the exact same F1R2/F2R1 as M2, due to the nature of how M2 calls variants (using read likelihoods, whereas this tool uses a base quality filter).",
+        oneLineSummary = "(EXPERIMENTAL) Annotate a non-M2 VCF (using the associated tumor bam) with pair orientation fields (e.g. " + GATKVCFConstants.F1R2_KEY + " ).",
+        programGroup = VariantProgramGroup.class
+)
+@BetaFeature
+public class AnnotatePairOrientation extends VariantWalker {
+
+    @Argument(
+            doc = "Output Somatic SNP/Indel VCF file with additional annotations.",
+            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
+            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
+    protected File outputFile;
+
+    public final static String CUTOFF_SHORT_NAME = "cutoff";
+    public final static String CUTOFF_LONG_NAME = "min-base-quality-cutoff";
+    public final static int MIN_BASE_QUALITY_DEFAULT_CUTOFF = 7;
+    @Argument(
+            doc = "Cutoff for the min base quality value(s) to count the read. These are for bases that overlap the variant.",
+            shortName = CUTOFF_SHORT_NAME, fullName = CUTOFF_LONG_NAME, minValue = 0, maxRecommendedValue = 20,
+            optional = true
+    )
+    private int minBaseQualityCutoff = MIN_BASE_QUALITY_DEFAULT_CUTOFF;
+
+    private VariantContextWriter vcfWriter;
+
+    /** Opens the output VCF and writes a header that declares the two new FORMAT fields. */
+    @Override
+    public void onTraversalStart() {
+        vcfWriter = createVCFWriter(outputFile);
+        vcfWriter.writeHeader(createVCFHeader(getHeaderForVariants(), getCommandLine()));
+    }
+
+    /** Uses the same read filters as Mutect2. */
+    @Override
+    public List<ReadFilter> getDefaultReadFilters() {
+        return Mutect2Engine.makeStandardMutect2ReadFilters();
+    }
+
+    /**
+     * Re-emits {@code variant} with F1R2/F2R1 added to each of its genotypes.
+     *
+     * The pileup over the variant is split by sample and each genotype is annotated from the pileup stored under
+     * its own sample name. A genotype whose sample has no entry in that map is passed a {@code null} pileup,
+     * which {@link OxoGReadCounts#annotateSingleVariant} accepts and reports as all-zero counts.
+     */
+    @Override
+    public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+
+        final ReadPileup readPileup = GATKProtectedVariantContextUtils.getPileup(variant, readsContext);
+        final List<Genotype> updatedGenotypes = new ArrayList<>();
+
+        final Map<String, ReadPileup> sampleToReadPileup = readPileup.splitBySample(getHeaderForReads(), null);
+
+        for (final Genotype g : variant.getGenotypes()) {
+            final ReadPileup genotypeSamplePileup = sampleToReadPileup.get(g.getSampleName());
+            final GenotypeBuilder gb = new GenotypeBuilder(g);
+            OxoGReadCounts.annotateSingleVariant(variant, gb, genotypeSamplePileup, minBaseQualityCutoff);
+            updatedGenotypes.add(gb.make());
+        }
+
+        vcfWriter.add(new VariantContextBuilder(variant).genotypes(updatedGenotypes).make());
+    }
+
+    /** The annotations are computed from reads, so a read source is mandatory. */
+    @Override
+    public boolean requiresReads() {
+        return true;
+    }
+
+    /**
+     * Builds the output header: all metadata lines of the input header in their original order, plus the F1R2 and
+     * F2R1 FORMAT lines and a {@code command} line recording how this tool was invoked.
+     *
+     * @param inputVCFHeader header of the input VCF. Never {@code null}
+     * @param commandLine command line to record in the header
+     * @return header for the output VCF, carrying the same genotype samples as the input
+     */
+    private static VCFHeader createVCFHeader(final VCFHeader inputVCFHeader, final String commandLine) {
+        Utils.nonNull(inputVCFHeader);
+
+        // Setup header for output file
+        final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder());
+        headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F1R2_KEY));
+        headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F2R1_KEY));
+        headerLines.add(new VCFHeaderLine("command", commandLine));
+        final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples());
+        return new VCFHeader(headerLines, samples.asSetOfSamples());
+    }
+
+    /** Closes the writer; the null check covers a tool that stopped before the writer was created. */
+    @Override
+    public void closeTool() {
+        if (vcfWriter != null) {
+            vcfWriter.close();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java index f2508827926..8a9ce4d7c12 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasFilterer.java @@ -14,7 +14,6 @@ import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList; import org.broadinstitute.hellbender.utils.genotyper.SampleList; import org.broadinstitute.hellbender.utils.param.ParamUtils; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; import java.util.*; @@ -67,7 +66,7 @@ public static VariantContext annotateVariantContextWithPreprocessingValues(final // Get the reference allele as a String and make sure that there is only one ref allele
and that it is length // one, which would indicate that it could be a part of a SNP/SNV final List refAlleles = alleles.stream().filter(a -> a.isReference()).map(a -> a.getBaseString()).collect(Collectors.toList()); - if (((refAlleles.size() == 1) && (refAlleles.get(0).length() == 1))) { + if (((refAlleles.size() == 1) && (refAlleles.get(0).length() == 1)) && alleles.size() > 1) { final Character refAllele = (char) refAlleles.get(0).getBytes()[0]; // Since we only look at the first alt allele on a site, we do not need a for loop over all non-ref alleles, e.g. for (int i = 1; i < alleles.size(); i++) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java index 487fa16a8dc..a15d951959b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/orientationbiasvariantfilter/OrientationBiasUtils.java @@ -268,6 +268,7 @@ public static long calculateUnfilteredNonRefGenotypeCount(final List !g.isFiltered()) + .filter(g -> g.getAlleles().size() > 1) .filter(g -> !g.getAllele(0).basesMatch(g.getAllele(1))) .count(); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java index 5c5882dad16..6b762608e8c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/OxoGReadCounts.java @@ -6,18 +6,23 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import org.apache.commons.lang.mutable.MutableInt; +import org.apache.commons.lang3.ArrayUtils; +import 
org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; import org.broadinstitute.hellbender.utils.QualityUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.hellbender.utils.pileup.PileupElement; +import org.broadinstitute.hellbender.utils.pileup.ReadPileup; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; -import java.util.Arrays; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import java.util.stream.IntStream; /** @@ -34,6 +39,8 @@ */ public final class OxoGReadCounts extends GenotypeAnnotation implements StandardMutectAnnotation { + private static final Logger logger = LogManager.getLogger(OxoGReadCounts.class); + @Override public List getKeyNames() { return Arrays.asList(GATKVCFConstants.F1R2_KEY, GATKVCFConstants.F2R1_KEY); @@ -77,6 +84,155 @@ public void annotate(final ReferenceContext refContext, gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1); } + /** + * Annotate the given variant context with the OxoG read count attributes, directly from the read pileup. + * + * This method may be slow and should be considered EXPERIMENTAL, especially with regard to indels and complex/mixed + * variants. + * + * @param vc variant context for the genotype. Necessary so that we can see all alleles. + * @param gb genotype builder to put the annotations into. + * @param readPileup pileup of the reads at this vc. Note that this pileup does not have to match the + * genotype. In other words, this tool does not check that the pileup was generated from the + * genotype sample. 
+     * @param meanBaseQualityCutoff a read is counted only when the lowest base quality over the bases spanning the
+     *                   matched allele is at least this value. Despite the name, it is applied as a minimum, not a mean.
+     */
+    public static void annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb,
+                                             final ReadPileup readPileup, int meanBaseQualityCutoff) {
+        Utils.nonNull(gb, "gb is null");
+        Utils.nonNull(vc, "vc is null");
+
+        // Create a list of unique alleles, keeping the order in which the variant context lists them.
+        final List<Allele> variantAlleles = new ArrayList<>(new LinkedHashSet<>(vc.getAlleles()));
+
+        // Initialize the mappings. Every allele starts at zero so that alleles without support are still reported.
+        final Map<Allele, MutableInt> f1r2Counts = variantAlleles.stream()
+                .collect(Collectors.toMap(a -> a, a -> new MutableInt(0)));
+
+        final Map<Allele, MutableInt> f2r1Counts = variantAlleles.stream()
+                .collect(Collectors.toMap(a -> a, a -> new MutableInt(0)));
+
+        final List<Allele> referenceAlleles = variantAlleles.stream().filter(a -> a.isReference() && !a.isSymbolic()).collect(Collectors.toList());
+        final List<Allele> altAlleles = variantAlleles.stream().filter(a -> a.isNonReference() && !a.isSymbolic()).collect(Collectors.toList());
+
+        if (referenceAlleles.size() != 1) {
+            logger.warn("Number of reference alleles does not equal 1 for VC: " + vc);
+        }
+
+        // We MUST have exactly 1 non-symbolic reference allele (symbolic ones were already filtered out above)
+        // and a read pileup. Otherwise nothing is counted and all values stay at zero.
+        if ((referenceAlleles.size() == 1) && (readPileup != null)) {
+            final Allele referenceAllele = referenceAlleles.get(0);
+            Utils.stream(readPileup)
+                    .filter(pe -> isUsableRead(pe.getRead()))
+                    .forEach(pe -> incrementCounts(pe, f1r2Counts, f2r1Counts, referenceAllele, altAlleles, meanBaseQualityCutoff));
+        }
+
+        // One value per unique allele, in the same order as variantAlleles.
+        final int[] f1r2 = variantAlleles.stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray();
+
+        final int[] f2r1 = variantAlleles.stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray();
+
+        gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2);
+        gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
+    }
+
+    /**
+     * If the allele is not in the count mappings, then it is not counted. No exception will be thrown
+     * Modifies count variables in place.
+     *
+     * @param pileupElement pileup overlapping the alleles
+     * @param f1r2Counts a mapping of allele to f1r2 counts
+     * @param f2r1Counts a mapping of allele to f2r1 counts
+     * @param referenceAllele the single reference allele of the variant
+     * @param altAlleles the alternate alleles to test the read against. If more than one matches, the last match wins.
+     * @param minBaseQualityCutoff the read is skipped when the lowest base quality over the matched allele is below this
+     */
+    private static void incrementCounts(final PileupElement pileupElement, final Map<Allele, MutableInt> f1r2Counts,
+                                        final Map<Allele, MutableInt> f2r1Counts, final Allele referenceAllele,
+                                        final List<Allele> altAlleles, int minBaseQualityCutoff) {
+
+        final Map<Allele, MutableInt> countMap = isF2R1(pileupElement.getRead()) ? f2r1Counts : f1r2Counts;
+
+        // A read supports the reference only if its bases match and no indel starts right after this position.
+        final boolean isRef = referenceAllele.basesMatch(getBasesForAlleleInRead(pileupElement, referenceAllele))
+                && !pileupElement.isBeforeDeletionStart() && !pileupElement.isBeforeInsertion();
+
+        Allele pileupAllele = null;
+        if (!isRef) {
+
+            for (Allele altAllele : altAlleles) {
+                final VariantContext.Type variantType = GATKProtectedVariantContextUtils.typeOfVariant(referenceAllele, altAllele);
+
+                if (variantType == VariantContext.Type.INDEL) {
+                    if (isIndelInThePileupElement(pileupElement, referenceAllele, altAllele)) {
+                        pileupAllele = altAllele;
+                    }
+
+                } else if (variantType == VariantContext.Type.MNP || variantType == VariantContext.Type.SNP) {
+                    if (altAllele.basesMatch(getBasesForAlleleInRead(pileupElement, altAllele))) {
+                        pileupAllele = altAllele;
+                    }
+                }
+
+            }
+
+        } else {
+            pileupAllele = referenceAllele;
+        }
+
+        // The read matches neither the reference nor any alternate allele.
+        if (pileupAllele == null) {
+            return;
+        }
+
+        if (getMinBaseQualityForAlleleInRead(pileupElement, pileupAllele) < minBaseQualityCutoff) {
+            return;
+        }
+
+        if (countMap.containsKey(pileupAllele)) {
+            countMap.get(pileupAllele).increment();
+        }
+    }
+
+    /**
+     * Decides whether the indel that starts right after this pileup element is the given alternate allele.
+     *
+     * An insertion matches when the reference allele extended by the inserted bases equals the alternate allele.
+     * A deletion matches on length alone: the deleted length must equal the difference between the reference and
+     * alternate allele lengths.
+     *
+     * @param pileupElement pileup element positioned at the variant start
+     * @param referenceAllele reference allele of the variant
+     * @param altAllele alternate (indel) allele to look for
+     * @return true if the read carries the indel described by {@code altAllele}
+     */
+    private static boolean isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele) {
+        boolean isAltAlleleInThePileup = false;
+
+        // Check insertion
+        if (pileupElement.isBeforeInsertion()) {
+            final String insertionBases = pileupElement.getBasesOfImmediatelyFollowingInsertion();
+            // edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB]
+            // No separate length check is needed: a full base match already implies equal lengths.
+            if (insertionBases != null
+                    && Allele.extend(referenceAllele, insertionBases.getBytes()).basesMatch(altAllele)) {
+                isAltAlleleInThePileup = true;
+            }
+        }
+
+        // Check deletion
+        if (pileupElement.isBeforeDeletionStart()) {
+            final int deletionLength = pileupElement.getLengthOfImmediatelyFollowingIndel();
+            if ((referenceAllele.getBases().length - altAllele.getBases().length) == deletionLength) {
+                isAltAlleleInThePileup = true;
+            }
+        }
+        return isAltAlleleInThePileup;
+    }
+
+    /**
+     * Returns the read bases from the pileup offset over as many positions as the allele has bases.
+     * The array is shorter than the allele when the read ends first.
+     */
+    private static byte[] getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele) {
+        return ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length);
+    }
+
+    /**
+     * Returns the lowest base quality (read as an unsigned value) of the read over the positions covered by the
+     * allele, starting at the pileup offset, or -1 when that range contains no base qualities.
+     */
+    private static int getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele) {
+        final int start = pileupElement.getOffset();
+        final byte[] pileupBaseQualities = ArrayUtils.subarray(pileupElement.getRead().getBaseQualities(), start, start + allele.getBases().length);
+        return IntStream.range(0, pileupBaseQualities.length)
+                .map(i -> Byte.toUnsignedInt(pileupBaseQualities[i]))
+                .min()
+                .orElse(-1);
+    }
+
+
 protected static boolean isUsableRead(final GATKRead read) { return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java index 1c5442bf3a2..5e3d62cf0bb 100644 ---
a/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtils.java @@ -1,11 +1,11 @@ package org.broadinstitute.hellbender.utils; import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFConstants; -import org.broadinstitute.hellbender.engine.ReadsContext; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine; @@ -14,7 +14,6 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import java.lang.reflect.Array; -import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.function.Function; @@ -22,6 +21,7 @@ import java.util.function.ToDoubleFunction; import java.util.function.ToIntFunction; import java.util.stream.Collectors; +import java.util.stream.IntStream; import java.util.stream.Stream; import java.util.stream.StreamSupport; @@ -290,4 +290,116 @@ public static ReadPileup getPileup(final Locatable loc, final Iterable return new ReadPileup(loc, pile); } + + /** This is lifted directly from htsjdk with some minor modifications! However, it is a private method there. + * + * This method cannot return {@link VariantContext.Type} MIXED + * + * Please see https://github.com/samtools/htsjdk/issues/999 + * + *

<p>Here are some cases that will not work properly, though this may not be an issue in practice:</p>
+ *  <ul>
+ *      <li>"CGT" --> "GGA" this will be a MNP, but really it is two SNPs.</li>
+ *      <li>Spanning deletions for alternate will show as {@link VariantContext.Type} NO_VARIATION</li>
+ *      <li>Spanning deletions for reference will throw exception.</li>
+ *      <li>Reference that is symbolic will throw an exception.</li>
+ *  </ul>
+     *
+     * @param ref reference allele. Never {@code null}
+     * @param allele alternate allele to compare. Never {@code null}
+     * @return the type of the pairwise comparison of {@code allele} against {@code ref}: SYMBOLIC, NO_VARIATION,
+     *  SNP, MNP or INDEL. Never MIXED.
+     */
+    public static VariantContext.Type typeOfVariant(final Allele ref, final Allele allele) {
+        Utils.nonNull(ref);
+        Utils.nonNull(allele);
+
+        if (ref.isSymbolic()) {
+            throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele");
+        }
+        if (allele.isSymbolic()) {
+            return VariantContext.Type.SYMBOLIC;
+        }
+        if (allele.equals(Allele.SPAN_DEL)) {
+            return VariantContext.Type.NO_VARIATION;
+        }
+        if (ref.equals(Allele.SPAN_DEL)) {
+            throw new IllegalStateException("Unexpected error: encountered a record with a spanning deletion reference allele");
+        }
+
+        // This is a pairwise comparison of one alternate allele against the reference, so MIXED (which is reserved
+        // for several alternate alleles of different types) can never be the answer. Alleles of different length
+        // are therefore always an INDEL, without requiring one to be a prefix of the other:
+        // REF = CTTA with ALT = C,CT,CA must be an INDEL for each alternate allele.
+        if (ref.length() != allele.length()) {
+            return VariantContext.Type.INDEL;
+        }
+
+        // From here on both alleles have the same length.
+        if (ref.basesMatch(allele)) {
+            return VariantContext.Type.NO_VARIATION;
+        }
+        if (allele.length() == 1) {
+            return VariantContext.Type.SNP;
+        }
+
+        // If the two alleles are the same length and only differ by one base, then still a SNP.
+        final byte[] refBases = ref.getBases();
+        final byte[] altBases = allele.getBases();
+        int mismatches = 0;
+        for (int i = 0; i < ref.length(); i++) {
+            if (refBases[i] != altBases[i]) {
+                mismatches++;
+            }
+        }
+        return mismatches == 1 ? VariantContext.Type.SNP : VariantContext.Type.MNP;
+    }
+
+    /**
+     * This method should only be run on variants that are known to be indels. See {@link #typeOfVariant(Allele, Allele)}
+     *
+     *

<p>Here are some cases that will not work properly, though this may not be an issue in practice:</p>
+ *  <ul>
+ *      <li>"CT" --> "CATT" this is really just a simple AT insertion, but this will show up as complex.</li>
+ *  </ul>
+     * @param ref reference allele. Never {@code null}
+     * @param allele alternate allele to compare. Never {@code null}
+     * @return true if the indel is complex (for example, also includes a SNP), false if simple indel. If the input alleles define a variant that is not
+     *  an indel, then the behavior of this method is undefined (though will probably just return false).
+     *
+     */
+    public static boolean isComplexIndel(final Allele ref, final Allele allele) {
+
+        Utils.nonNull(ref);
+        Utils.nonNull(allele);
+
+        // Symbolic or empty alleles are never reported as complex.
+        if (ref.isSymbolic() || (ref.length() == 0)) {
+            return false;
+        }
+        if (allele.isSymbolic() || (allele.length() == 0)) {
+            return false;
+        }
+
+        final int refLength = ref.length();
+        final int altLength = allele.length();
+
+        // Equal lengths mean SNP, MNP or no variation, which is not an indel at all.
+        if (refLength == altLength) {
+            return false;
+        }
+
+        // A single-base allele on either side is treated as an obviously simple insertion or deletion.
+        // NOTE(review): this also returns false when that single base differs (e.g. "CTT" --> "A"); confirm intended.
+        if ((altLength == 1) || (refLength == 1)) {
+            return false;
+        }
+
+        // The indel is simple exactly when the shorter allele is a prefix of the longer one.
+        final byte[] refBases = ref.getBases();
+        final byte[] altBases = allele.getBases();
+        final int sharedLength = Math.min(refLength, altLength);
+        for (int i = 0; i < sharedLength; i++) {
+            if (refBases[i] != altBases[i]) {
+                return true;
+            }
+        }
+        return false;
+    }
 } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java new file mode 100644 index 00000000000..70e4af93f1c --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/AnnotatePairOrientationIntegrationTest.java @@ -0,0 +1,132 @@ +package org.broadinstitute.hellbender.tools; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import
org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.nd4j.linalg.io.StringUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List;
+
+/**
+ * Runs AnnotatePairOrientation on small tumor/normal BAM pairs and compares the emitted F1R2/F2R1 values with
+ * counts obtained by manual review in IGV.
+ */
+public class AnnotatePairOrientationIntegrationTest extends CommandLineProgramTest {
+    private static final File TEST_VCF_DIR = new File("src/test/resources/org/broadinstitute/hellbender/tools/");
+    private static final File TEST_BAM_DIR = new File("src/test/resources/large/mutect/dream_synthetic_bams/");
+
+    final static String TEST_VCF = TEST_VCF_DIR.getAbsolutePath() + "/test_no_pair_orientation_info.vcf";
+    final static String TEST_VCF_INDELS = TEST_VCF_DIR.getAbsolutePath() + "/test_no_pair_orientation_info_indels.vcf";
+    final static String TEST_BAM_TUMOR = TEST_BAM_DIR.getAbsolutePath() + "/tumor_1.bam";
+    final static String TEST_BAM_NORMAL = TEST_BAM_DIR.getAbsolutePath() + "/normal_1.bam";
+    final static String TEST_BAM_TUMOR_INDELS = TEST_BAM_DIR.getAbsolutePath() + "/tumor_3.bam";
+    final static String TEST_BAM_NORMAL_INDELS = TEST_BAM_DIR.getAbsolutePath() + "/normal_3.bam";
+
+    // TODO: Test with multiallelics
+    // TODO: Test with multiallelics and symbolic at the same time
+    // TODO: Test with symbolic
+    // TODO: Test with information missing from the VCF and make sure appropriate exception is thrown.
+    // TODO: Test with more cutoff variables
+    // TODO: Once above five TODOs are done (at least), AnnotatePairOrientation can be taken out of Experimental status.
+
+
+    @Test
+    public void testBasicIndels() throws IOException {
+        final File outputFile = runAnnotatePairOrientation(TEST_VCF_INDELS, TEST_BAM_TUMOR_INDELS, TEST_BAM_NORMAL_INDELS, "ob_indel_annotate_");
+
+        // Ground truth from manual review in IGV
+        final String[][] gtF1R2F2R1 = {{"14,0", "21,0", "11,3", "13,4"},{"34,0","27,0", "10,12","14,11"},
+                {"14,0", "14,0", "18,1", "17,3"},{"24,0","15,0", "19,7","22,2"}};
+
+        Assert.assertTrue(outputFile.exists());
+        final List<VariantContext> variantContexts = getVariantContextsFromFile(outputFile);
+
+        assertOrientationAnnotationValues(variantContexts, gtF1R2F2R1, "G15512.prenormal.sorted",
+                "IS3.snv.indel.sv");
+    }
+
+    /**
+     * Only tests SNVs
+     * @throws IOException
+     */
+    @Test
+    public void testBasicRun() throws IOException{
+        final File outputFile = runAnnotatePairOrientation(TEST_VCF, TEST_BAM_TUMOR, TEST_BAM_NORMAL, "ob_annotate_");
+
+        Assert.assertTrue(outputFile.exists());
+        final List<VariantContext> variantContexts = getVariantContextsFromFile(outputFile);
+
+        // Ground truth from manual review in IGV
+        final String[][] gtF1R2F2R1 = {{"22,0", "11,0", "9,9", "8,4"},{"11,0","15,0", "11,8","9,10"}};
+
+        assertOrientationAnnotationValues(variantContexts, gtF1R2F2R1, "synthetic.challenge.set1.normal",
+                "synthetic.challenge.set1.tumor");
+    }
+
+    /**
+     * Runs the tool on one VCF with a tumor and a normal BAM and returns the VCF it wrote.
+     *
+     * @param vcf path of the VCF to annotate
+     * @param tumorBam path of the tumor BAM (passed as the first input)
+     * @param normalBam path of the normal BAM (passed as the second input)
+     * @param outputPrefix prefix of the temporary output file name
+     * @return the temporary output VCF
+     */
+    private File runAnnotatePairOrientation(final String vcf, final String tumorBam, final String normalBam,
+                                            final String outputPrefix) throws IOException {
+        final File outputFile = File.createTempFile(outputPrefix, ".vcf");
+        final List<String> arguments = new ArrayList<>();
+        arguments.add("-" + StandardArgumentDefinitions.VARIANT_SHORT_NAME);
+        arguments.add(vcf);
+        arguments.add("-" + StandardArgumentDefinitions.INPUT_SHORT_NAME);
+        arguments.add(tumorBam);
+        arguments.add("-" + StandardArgumentDefinitions.INPUT_SHORT_NAME);
+        arguments.add(normalBam);
+
+        arguments.add("-" + StandardArgumentDefinitions.OUTPUT_SHORT_NAME);
+        arguments.add(outputFile.getAbsolutePath());
+        runCommandLine(arguments);
+        return outputFile;
+    }
+
+    /** Reads every variant of the given VCF into memory and closes the file again. */
+    private List<VariantContext> getVariantContextsFromFile(File vcfFile) {
+        final List<VariantContext> variantContexts = new ArrayList<>();
+        // NOTE(review): assumes FeatureDataSource is AutoCloseable - confirm against the engine class.
+        try (final FeatureDataSource<VariantContext> featureDataSource = new FeatureDataSource<>(vcfFile)) {
+            for (final VariantContext vc : featureDataSource) {
+                variantContexts.add(vc);
+            }
+        }
+        return variantContexts;
+    }
+
+    /**
+     * Checks the F1R2 and F2R1 annotations of the normal and tumor genotype of every variant.
+     *
+     * @param variantContexts variants read from the tool output, in file order
+     * @param gtF1R2F2R1 expected values, one row per variant, laid out as
+     *                   {normal F1R2, normal F2R1, tumor F1R2, tumor F2R1}
+     * @param normalSampleName sample name of the normal genotype
+     * @param tumorSampleName sample name of the tumor genotype
+     */
+    private void assertOrientationAnnotationValues(final List<VariantContext> variantContexts, final String[][] gtF1R2F2R1,
+                                                   final String normalSampleName, final String tumorSampleName) {
+        final List<String> annotations = new ArrayList<>();
+        annotations.add(GATKVCFConstants.F1R2_KEY);
+        annotations.add(GATKVCFConstants.F2R1_KEY);
+
+        for (int i = 0; i < variantContexts.size(); i++) {
+            final VariantContext vc = variantContexts.get(i);
+            final Genotype normalGenotype = vc.getGenotype(normalSampleName);
+            Assert.assertTrue(normalGenotype.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY));
+            Assert.assertTrue(normalGenotype.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY));
+
+            // The first annotations.size() columns of a row hold the normal values.
+            for (int j = 0; j < annotations.size(); j ++) {
+                final String annotation = annotations.get(j);
+                final String normalValue = normalGenotype.getExtendedAttribute(annotation).toString();
+                Assert.assertEquals(normalValue, gtF1R2F2R1[i][j]);
+            }
+
+            final Genotype tumorGenotype = vc.getGenotype(tumorSampleName);
+            Assert.assertTrue(tumorGenotype.hasExtendedAttribute(GATKVCFConstants.F1R2_KEY));
+            Assert.assertTrue(tumorGenotype.hasExtendedAttribute(GATKVCFConstants.F2R1_KEY));
+            // The tumor values follow the normal values in the same row.
+            for (int j = 0; j < annotations.size(); j ++) {
+                final String annotation = annotations.get(j);
+                final String tumorValue = tumorGenotype.getExtendedAttribute(annotation).toString();
+                Assert.assertNotNull(StringUtils.split(tumorValue, ","));
+                Assert.assertEquals(StringUtils.split(tumorValue, ",").length, 2);
+                Assert.assertEquals(tumorValue, gtF1R2F2R1[i][j+annotations.size()]);
+            }
+        }
+    }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java index b37ba5698e6..307a5c6b8ed 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/GATKProtectedVariantContextUtilsUnitTest.java @@ -2,18 +2,18 @@ import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.util.Locatable; -import org.broadinstitute.hellbender.engine.ReadsContext; -import org.broadinstitute.hellbender.engine.ReadsDataSource; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.hellbender.utils.pileup.ReadPileup; import org.broadinstitute.hellbender.utils.read.ArtificialReadUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Arrays; -import static org.testng.Assert.*; - /** * Created by David Benjamin on 2/15/17.
*/ @@ -55,4 +55,45 @@ public void testGetPileup() { Assert.assertEquals(counts, new int[]{1, 1, 0, 0}); }
+
+    /** Checks both the pairwise variant type and the complex-indel flag for one ref/alt pair. */
+    @Test(dataProvider = "variantTypes")
+    public void testVariantTypesAndIsComplex(final String ref, final String alt, final VariantContext.Type gtType, boolean isComplexIndel) {
+        Assert.assertEquals(GATKProtectedVariantContextUtils.typeOfVariant(Allele.create(ref), Allele.create(alt)), gtType);
+        Assert.assertEquals(GATKProtectedVariantContextUtils.isComplexIndel(Allele.create(ref), Allele.create(alt)), isComplexIndel);
+    }
+    /** A symbolic reference allele must be rejected. */
+    @Test(expectedExceptions = IllegalStateException.class)
+    public void testSymbolicRef() {
+        GATKProtectedVariantContextUtils.typeOfVariant(GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE, Allele.create("C"));
+    }
+
+    @DataProvider(name = "variantTypes")
+    public Object[][] variantTypes() {
+        return new Object[][]{
+                // ref, alt, type, isComplex?
+                {"CCTTGGCTTATTCCA", "C", VariantContext.Type.INDEL, false},
+                {"C", "CCTTGGCTTATTCCA", VariantContext.Type.INDEL, false},
+                {"ACTAG", "A", VariantContext.Type.INDEL, false},
+                {"ATT", "AT", VariantContext.Type.INDEL, false},
+                {"AT", "ATT", VariantContext.Type.INDEL, false},
+                {"CT", "CAGG", VariantContext.Type.INDEL, true},
+                {"CTTT", "CAGG", VariantContext.Type.MNP, false},
+                {"CTTT", "CAGGG", VariantContext.Type.INDEL, true},
+                {"T", "T", VariantContext.Type.NO_VARIATION, false},
+                {"CTAG", "CTAG", VariantContext.Type.NO_VARIATION, false},
+                {"A", "AAGAAGCATGC", VariantContext.Type.INDEL, false},
+                {"A", "C", VariantContext.Type.SNP, false},
+                {"AG", "CA", VariantContext.Type.MNP, false},
+                {"AGAAGG", "CATTCC", VariantContext.Type.MNP, false},
+                {"GC", "GA", VariantContext.Type.SNP, false},
+                // The alternate allele must be symbolic for the SYMBOLIC type; an empty allele string cannot be created.
+                // NOTE(review): the exact symbolic allele used originally is assumed to be <NON_REF> - confirm.
+                {"GA", "<NON_REF>", VariantContext.Type.SYMBOLIC, false},
+                {"GA", "*", VariantContext.Type.NO_VARIATION, false},
+
+                // There are two MNPs here
+                {"AGAAGG", "CATACC", VariantContext.Type.MNP, false},
+
+                // Note that this is technically a simple AT insertion, but the isComplex cannot handle this properly.
+ {"CT", "CATT", VariantContext.Type.INDEL, true}, + }; + } + } \ No newline at end of file diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf new file mode 100644 index 00000000000..f08787da5a7 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info.vcf @@ -0,0 +1,64 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##Mutect Version=2.1-beta +##command=FilterByOrientationBias --output synthetic.challenge.set1.tumor-vs-synthetic.challenge.set1.normal-filtered.vcf --preAdapterDetailFile /cromwell-executions/Mutect2_Multi/9bd645eb-e2c9-41f6-88d2-a61013b9e65c/call-Mutect2/shard-0/Mutect2/ead27398-13e8-48fc-9050-67633fe2d6d2/call-Filter/inputs/home/lichtens/test_bamout/cromwell-executions/Mutect2_Multi/9bd645eb-e2c9-41f6-88d2-a61013b9e65c/call-Mutect2/shard-0/Mutect2/ead27398-13e8-48fc-9050-67633fe2d6d2/call-CollectSequencingArtifactMetrics/execution/gatk.pre_adapter_detail_metrics --artifactModes G/T --artifactModes C/T --variant filtered.vcf --interval_set_rule UNION --interval_padding 0 --interval_exclusion_padding 0 --interval_merging_rule ALL --readValidationStringency SILENT --secondsBetweenProgressUpdates 10.0 --disableSequenceDictionaryValidation false --createOutputBamIndex true --createOutputBamMD5 false --createOutputVariantIndex true --createOutputVariantMD5 false 
--lenient false --addOutputSAMProgramRecord true --addOutputVCFCommandLine true --cloudPrefetchBuffer 40 --cloudIndexPrefetchBuffer -1 --disableBamIndexCaching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use_jdk_deflater false --use_jdk_inflater false --gcs_max_retries 20 --disableToolDefaultReadFilters false +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=synthetic.challenge.set1.normal +##orientation_bias_artifact_modes= +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=synthetic.challenge.set1.tumor +##Manually removed the F1R2 and F2R1 from FORMAT and header +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT synthetic.challenge.set1.normal synthetic.challenge.set1.tumor +20 577548 . C G . PASS DP=55;ECNT=1;NLOD=8.72;N_ART_LOD=-1.483e+00;POP_AF=1.000e-03;P_GERMLINE=-5.423e+00;TLOD=25.47 GT:AD:AF:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:OBF:OBP:OBQ:OBQRC:SA_MAP_AF:SA_POST_PROB 0/0:29,0:0.017:0:339,0:0:0:false:false 0/1:14,10:0.420:30:314,325:60:20:false:false:.:.:45.99:100.00:0.394,0.384,0.417:0.021,0.024,0.955 +20 1838610 . T A . 
PASS DP=64;ECNT=1;NLOD=7.82;N_ART_LOD=-1.438e+00;POP_AF=1.000e-03;P_GERMLINE=-4.521e+00;TLOD=50.09 GT:AD:AF:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:OBF:OBP:OBQ:OBQRC:SA_MAP_AF:SA_POST_PROB 0/0:26,0:0.019:0:338,0:0:0:false:false 0/1:20,17:0.460:32:336,340:60:20:false:false:.:.:100.00:49.80:0.444,0.404,0.459:0.017,0.032,0.951 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf new file mode 100644 index 00000000000..64483991374 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/test_no_pair_orientation_info_indels.vcf @@ -0,0 +1,67 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##Mutect Version=2.1-beta +##command=FilterByOrientationBias --output IS3.snv.indel.sv-vs-G15512.prenormal.sorted-filtered.vcf --preAdapterDetailFile /cromwell-executions/Mutect2_Multi/91355e3c-342f-4c9d-8ea6-e70a61faca94/call-Mutect2/shard-2/Mutect2/a727615d-d7b1-4f3b-8ed0-48f4c1920eca/call-Filter/inputs/home/lichtens/test_bamout/cromwell-executions/Mutect2_Multi/91355e3c-342f-4c9d-8ea6-e70a61faca94/call-Mutect2/shard-2/Mutect2/a727615d-d7b1-4f3b-8ed0-48f4c1920eca/call-CollectSequencingArtifactMetrics/execution/gatk.pre_adapter_detail_metrics --artifactModes G/T --artifactModes C/T --variant filtered.vcf --interval_set_rule UNION --interval_padding 0 --interval_exclusion_padding 0 
--interval_merging_rule ALL --readValidationStringency SILENT --secondsBetweenProgressUpdates 10.0 --disableSequenceDictionaryValidation false --createOutputBamIndex true --createOutputBamMD5 false --createOutputVariantIndex true --createOutputVariantMD5 false --lenient false --addOutputSAMProgramRecord true --addOutputVCFCommandLine true --cloudPrefetchBuffer 40 --cloudIndexPrefetchBuffer -1 --disableBamIndexCaching false --help false --version false --showHidden false --verbosity INFO --QUIET false --use_jdk_deflater false --use_jdk_inflater false --gcs_max_retries 20 --disableToolDefaultReadFilters false +##contig= +##contig= +##filtering_status=These calls have been filtered by FilterMutectCalls to label false positives with a list of failed filters and true positives with PASS. +##normal_sample=G15512.prenormal.sorted +##orientation_bias_artifact_modes= +##source=FilterMutectCalls +##source=Mutect2 +##tumor_sample=IS3.snv.indel.sv +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT G15512.prenormal.sorted IS3.snv.indel.sv +20 1330646 . CCTTGGCTTATTCCA C . PASS DP=85;ECNT=1;NLOD=14.42;N_ART_LOD=-1.693e+00;POP_AF=1.000e-03;P_GERMLINE=-1.112e+01;TLOD=35.03 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:48,0:0.011:19,0:29,0:0:348,0:0:0:false:false 0/1:26,10:0.278:11,5:15,5:29:349,348:60:20:false:false:0.253,0.253,0.278:0.017,0.021,0.962 +20 2038732 . ACTAG A . PASS DP=123;ECNT=1;NLOD=19.82;N_ART_LOD=-1.842e+00;POP_AF=1.000e-03;P_GERMLINE=-1.652e+01;TLOD=96.69 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:41,0:0.043:21,0:20,0:0:345,0:0:0:false:false 0/1:35,8:0.193:23,3:12,5:29:345,338:60:33:false:false:0.172,0.141,0.186:9.465e-03,0.036,0.955 +20 3076247 . AT ATT . 
t_lod DP=72;ECNT=2;NLOD=7.02;N_ART_LOD=-1.161e+00;POP_AF=1.000e-03;P_GERMLINE=-5.885e+00;RPA=14,15;RU=T;STR;TLOD=3.83 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:24,0:0.140:13,0,0:9,2,0:0:342,0:0:0:false:false 0/1:25,4:0.243:13,1,0:10,1,4:30:341,334:60:24:false:false:0.121,0.111,0.138:0.014,0.019,0.967 +20 3076299 . A AAGAAGCATGC . PASS DP=90;ECNT=2;NLOD=10.86;N_ART_LOD=-1.579e+00;POP_AF=1.000e-03;P_GERMLINE=-7.558e+00;TLOD=41.31 GT:AD:AF:F1R2:F2R1:MBQ:MFRL:MMQ:MPOS:OBAM:OBAMRC:SA_MAP_AF:SA_POST_PROB 0/0:36,0:0.026:22,0:14,0:0:340,0:0:0:false:false 0/1:37,12:0.259:17,8:20,4:29:343,342:60:31:false:false:0.192,0.232,0.245:0.035,0.011,0.955