Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding tool to annotate with pair orientation info #3614

Merged
merged 1 commit into from
Sep 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions scripts/mutect2_wdl/unsupported/mutect2_opt.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ workflow Mutect2 {
# select_first() fails if nothing resolves to non-null, so putting in "null" for now.
File? oncotated_m2_maf = select_first([oncotate_m2.oncotated_m2_maf, "null"])
File? preadapter_detail_metrics = select_first([CollectSequencingArtifactMetrics.pre_adapter_metrics, "null"])
File? bamout = select_first([MergeBamOuts.merged_bam_out, "null"])
File? bamout_index = select_first([MergeBamOuts.merged_bam_out_index, "null"])
File bamout = select_first([MergeBamOuts.merged_bam_out, "null"])
File bamout_index = select_first([MergeBamOuts.merged_bam_out_index, "null"])
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantWalker;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.tools.walkers.annotator.OxoGReadCounts;
import org.broadinstitute.hellbender.tools.walkers.mutect.Mutect2Engine;
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.io.File;
import java.util.*;

@CommandLineProgramProperties(
summary = "(Experimental) This adds fields normally emitted by M2 to a VCF. There should never be a need to run this tool on a VCF that was produced by M2." +
"\n The output of this tool should be usable with FilterByOrientationBias." +
"\n The output of this tool only counts reads that fully overlap (and match) the variant or reference sequence (this is relevant for indels)." +
"\n IMPORTANT: This tool does not produce the exact same F1R2/F2R1 as M2, due to the nature of how M2 calls variants (using read likelihoods, whereas this tool uses a base quality filter).",
oneLineSummary = "(EXPERIMENTAL) Annotate a non-M2 VCF (using the associated tumor bam) with pair orientation fields (e.g. " + GATKVCFConstants.F1R2_KEY + " ).",
programGroup = VariantProgramGroup.class
)
@BetaFeature
public class AnnotatePairOrientation extends VariantWalker {

@Argument(
doc = "Output Somatic SNP/Indel VCF file with additional annotations.",
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME)
protected File outputFile;

public final static String CUTOFF_SHORT_NAME = "cutoff";
public final static String CUTOFF_LONG_NAME = "min-base-quality-cutoff";
public final static int MIN_BASE_QUALITY_DEFAULT_CUTOFF = 7;
@Argument(
doc = "Cutoff for the min base quality value(s) to count the read. These are for bases that overlap the variant.",
shortName = CUTOFF_SHORT_NAME, fullName = CUTOFF_LONG_NAME, minValue = 0, maxRecommendedValue = 20,
optional = true
)
private int minBaseQualityCutoff = MIN_BASE_QUALITY_DEFAULT_CUTOFF;

private VariantContextWriter vcfWriter;

@Override
public void onTraversalStart() {
vcfWriter = createVCFWriter(outputFile);
vcfWriter.writeHeader(createVCFHeader(getHeaderForVariants(), getCommandLine()));
}

@Override
public List<ReadFilter> getDefaultReadFilters() {
return Mutect2Engine.makeStandardMutect2ReadFilters();
}

@Override
public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extra whitespace line

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

final ReadPileup readPileup = GATKProtectedVariantContextUtils.getPileup(variant, readsContext);
final List<Genotype> updatedGenotypes = new ArrayList<>();

final Map<String, ReadPileup> sampleToReadPileup = readPileup.splitBySample(getHeaderForReads(), null);

for (Genotype g : variant.getGenotypes()) {
final ReadPileup genotypeSamplePileup = sampleToReadPileup.get(g.getSampleName());
final GenotypeBuilder gb = new GenotypeBuilder(g);
OxoGReadCounts.annotateSingleVariant(variant, gb, genotypeSamplePileup, minBaseQualityCutoff);
updatedGenotypes.add(gb.make());
}

vcfWriter.add(new VariantContextBuilder(variant).genotypes(updatedGenotypes).make());
}

@Override
public boolean requiresReads() {
return true;
}

private static VCFHeader createVCFHeader(final VCFHeader inputVCFHeader, final String commandLine) {
Utils.nonNull(inputVCFHeader);

// Setup header for output file
final Set<VCFHeaderLine> headerLines = new LinkedHashSet<>(inputVCFHeader.getMetaDataInInputOrder());
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F1R2_KEY));
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.F2R1_KEY));
headerLines.add(new VCFHeaderLine("command", commandLine));
final SampleList samples = new IndexedSampleList(inputVCFHeader.getGenotypeSamples());
return new VCFHeader(headerLines, samples.asSetOfSamples());
}

@Override
public void closeTool() {
if (vcfWriter != null) {
vcfWriter.close();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import org.broadinstitute.hellbender.utils.genotyper.IndexedSampleList;
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.util.*;
Expand Down Expand Up @@ -67,7 +66,7 @@ public static VariantContext annotateVariantContextWithPreprocessingValues(final
// Get the reference allele as a String and make sure that there is only one ref allele and that it is length
// one, which would indicate that it could be a part of a SNP/SNV
final List<String> refAlleles = alleles.stream().filter(a -> a.isReference()).map(a -> a.getBaseString()).collect(Collectors.toList());
if (((refAlleles.size() == 1) && (refAlleles.get(0).length() == 1))) {
if (((refAlleles.size() == 1) && (refAlleles.get(0).length() == 1)) && alleles.size() > 1) {
final Character refAllele = (char) refAlleles.get(0).getBytes()[0];

// Since we only look at the first alt allele on a site, we do not need a for loop over all non-ref alleles, e.g. for (int i = 1; i < alleles.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ public static long calculateUnfilteredNonRefGenotypeCount(final List<VariantCont

return getGenotypeStream(sampleName, variants)
.filter(g -> !g.isFiltered())
.filter(g -> g.getAlleles().size() > 1)
.filter(g -> !g.getAllele(0).basesMatch(g.getAllele(1)))
.count();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,23 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFormatHeaderLine;
import org.apache.commons.lang.mutable.MutableInt;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;


/**
Expand All @@ -34,6 +39,8 @@
*/
public final class OxoGReadCounts extends GenotypeAnnotation implements StandardMutectAnnotation {

private static final Logger logger = LogManager.getLogger(OxoGReadCounts.class);

@Override
public List<String> getKeyNames() {
return Arrays.asList(GATKVCFConstants.F1R2_KEY, GATKVCFConstants.F2R1_KEY);
Expand Down Expand Up @@ -77,6 +84,155 @@ public void annotate(final ReferenceContext refContext,
gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
}

/**
* Annotate the given variant context with the OxoG read count attributes, directly from the read pileup.
*
* This method may be slow and should be considered EXPERIMENTAL, especially with regard to indels and complex/mixed
* variants.
*
* @param vc variant context for the genotype. Necessary so that we can see all alleles.
* @param gb genotype builder to put the annotations into.
* @param readPileup pileup of the reads at this vc. Note that this pileup does not have to match the
* genotype. In other words, this tool does not check that the pileup was generated from the
* genotype sample.
*/
public static void annotateSingleVariant(final VariantContext vc, final GenotypeBuilder gb,
final ReadPileup readPileup, int meanBaseQualityCutoff) {
Utils.nonNull(gb, "gb is null");
Utils.nonNull(vc, "vc is null");

// Create a list of unique alleles
final List<Allele> variantAllelesWithDupes = vc.getAlleles();
final Set<Allele> alleleSet = new LinkedHashSet<>(variantAllelesWithDupes);
final List<Allele> variantAlleles = new ArrayList<>(alleleSet);

// Initialize the mappings
final Map<Allele, MutableInt> f1r2Counts = variantAlleles.stream()
.collect(Collectors.toMap(a -> a, a -> new MutableInt(0)));

final Map<Allele, MutableInt> f2r1Counts = variantAlleles.stream()
.collect(Collectors.toMap(a -> a, a -> new MutableInt(0)));

final List<Allele> referenceAlleles = variantAlleles.stream().filter(a -> a.isReference() && !a.isSymbolic()).collect(Collectors.toList());
final List<Allele> altAlleles = variantAlleles.stream().filter(a -> a.isNonReference() && !a.isSymbolic()).collect(Collectors.toList());

if (referenceAlleles.size() != 1) {
logger.warn("Number of reference alleles does not equal for VC: " + vc);
}

// We MUST have exactly 1 non-symbolic reference allele and a read pileup,
if ((referenceAlleles.size() == 1) && (readPileup != null) && !referenceAlleles.get(0).isSymbolic()) {
final Allele referenceAllele = referenceAlleles.get(0);
Utils.stream(readPileup)
.filter(pe -> isUsableRead(pe.getRead()))
.forEach(pe -> incrementCounts(pe, f1r2Counts, f2r1Counts, referenceAllele, altAlleles, meanBaseQualityCutoff));
}

final int[] f1r2 = variantAlleles.stream().mapToInt(a -> f1r2Counts.get(a).intValue()).toArray();

final int[] f2r1 = variantAlleles.stream().mapToInt(a -> f2r1Counts.get(a).intValue()).toArray();

gb.attribute(GATKVCFConstants.F1R2_KEY, f1r2);
gb.attribute(GATKVCFConstants.F2R1_KEY, f2r1);
}

    /**
     * Tally one pileup element into the appropriate orientation count map (F1R2 vs. F2R1).
     *
     * If the allele is not in the count mappings, then it is not counted.  No exception will be thrown.
     * Modifies count variables in place.
     *
     * @param pileupElement pileup overlapping the alleles
     * @param f1r2Counts a mapping of allele to f1r2 counts
     * @param f2r1Counts a mapping of allele to f2r1 counts
     * @param referenceAllele the single non-symbolic reference allele at this site
     * @param altAlleles the non-symbolic alternate alleles to test the read against
     * @param minBaseQualityCutoff reads whose minimum base quality over the matched allele is below this
     *                             value are skipped
     */
    private static void incrementCounts(final PileupElement pileupElement, final Map<Allele, MutableInt> f1r2Counts,
                                        final Map<Allele, MutableInt> f2r1Counts, final Allele referenceAllele,
                                        final List<Allele> altAlleles, int minBaseQualityCutoff) {

        // Pick which orientation bucket this read contributes to.
        final Map<Allele, MutableInt> countMap = isF2R1(pileupElement.getRead()) ? f2r1Counts : f1r2Counts;

        // A read supports the reference only if its bases match the ref allele AND it does not start an indel
        // immediately after this position (an indel-carrying read should count toward the indel allele instead).
        final boolean isRef = referenceAllele.basesMatch(getBasesForAlleleInRead(pileupElement, referenceAllele))
                && !pileupElement.isBeforeDeletionStart() && !pileupElement.isBeforeInsertion();

        Allele pileupAllele = null;
        if (!isRef) {

            // NOTE(review): there is no break on match, so if several alt alleles match this read,
            // the LAST matching allele in the list wins — confirm this is intended.
            for (Allele altAllele : altAlleles) {
                final VariantContext.Type variantType = GATKProtectedVariantContextUtils.typeOfVariant(referenceAllele, altAllele);

                if (variantType == VariantContext.Type.INDEL) {
                    if (isIndelInThePileupElement(pileupElement, referenceAllele, altAllele)) {
                        pileupAllele = altAllele;
                    }

                } else if (variantType == VariantContext.Type.MNP || variantType == VariantContext.Type.SNP) {
                    if (altAllele.basesMatch(getBasesForAlleleInRead(pileupElement, altAllele))) {
                        pileupAllele = altAllele;
                    }
                }

            }

        } else {
            pileupAllele = referenceAllele;
        }

        // Read supports neither the reference nor any alt allele: do not count it.
        if (pileupAllele == null) {
            return;
        }

        // Base-quality filter over the bases covering the matched allele.
        if (getMinBaseQualityForAlleleInRead(pileupElement, pileupAllele) < minBaseQualityCutoff) {
            return;
        }

        if (countMap.containsKey(pileupAllele)) {
            countMap.get(pileupAllele).increment();
        }
    }

private static boolean isIndelInThePileupElement(final PileupElement pileupElement, final Allele referenceAllele, final Allele altAllele) {
boolean isAltAlleleInThePileup = false;

// Check insertion
if (pileupElement.isBeforeInsertion()) {
final int insertionLength = pileupElement.getLengthOfImmediatelyFollowingIndel();
if (insertionLength == pileupElement.getLengthOfImmediatelyFollowingIndel()) {
final String insertionBases = pileupElement.getBasesOfImmediatelyFollowingInsertion();
// edge case: ignore a deletion immediately preceding an insertion as p.getBasesOfImmediatelyFollowingInsertion() returns null [EB]
if (insertionBases != null) {
final boolean isMatch = Allele.extend(referenceAllele, insertionBases.getBytes()).basesMatch(altAllele);
if (isMatch) {
isAltAlleleInThePileup = true;
}
}
}
}

// Check deletion
if (pileupElement.isBeforeDeletionStart()) {
final int deletionLength = pileupElement.getLengthOfImmediatelyFollowingIndel();
if ((referenceAllele.getBases().length - altAllele.getBases().length) == deletionLength) {
isAltAlleleInThePileup = true;
}
}
return isAltAlleleInThePileup;
}

private static byte[] getBasesForAlleleInRead(final PileupElement pileupElement, final Allele allele) {
return ArrayUtils.subarray(pileupElement.getRead().getBases(), pileupElement.getOffset(), pileupElement.getOffset() + allele.getBases().length);
}

private static int getMinBaseQualityForAlleleInRead(final PileupElement pileupElement, final Allele allele) {
final byte[] alleleBases = allele.getBases();
final byte[] pileupBaseQualities = ArrayUtils.subarray(pileupElement.getRead().getBaseQualities(), pileupElement.getOffset(), pileupElement.getOffset() + alleleBases.length);
final OptionalInt minQuality = IntStream.range(0, pileupBaseQualities.length).map(i -> Byte.toUnsignedInt(pileupBaseQualities[i])).min();
if (!minQuality.isPresent()) {
return -1;
} else {
return minQuality.getAsInt();
}
}


protected static boolean isUsableRead(final GATKRead read) {
return read.getMappingQuality() != 0 && read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE;
}
Expand Down
Loading