diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
index a720f6d47f8..dc98d99072e 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java
@@ -170,11 +170,11 @@
* ...
* -A annotation_N \
* --mode SNP \
- * --resource snp-training,training=true snp-training.vcf \
- * --resource snp-calibration,calibration=true snp-calibration.vcf \
+ * --resource:snp-training,training=true snp-training.vcf \
+ * --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --mode INDEL \
- * --resource indel-training,training=true indel-training.vcf \
- * --resource indel-calibration,calibration=true indel-calibration.vcf \
+ * --resource:indel-training,training=true indel-training.vcf \
+ * --resource:indel-calibration,calibration=true indel-calibration.vcf \
* -O extract
*
*
@@ -195,11 +195,11 @@
* ...
* -A annotation_N \
* --mode SNP \
- * --resource snp-training,training=true snp-training.vcf \
- * --resource snp-calibration,calibration=true snp-calibration.vcf \
+ * --resource:snp-training,training=true snp-training.vcf \
+ * --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --mode INDEL \
- * --resource indel-training,training=true indel-training.vcf \
- * --resource indel-calibration,calibration=true indel-calibration.vcf \
+ * --resource:indel-training,training=true indel-training.vcf \
+ * --resource:indel-calibration,calibration=true indel-calibration.vcf \
* --maximum-number-of-unlabeled-variants 1000000 \
* -O extract
*
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java
index 153a3fd31a5..e1ebf3ce608 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable;
+import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
@@ -17,6 +18,7 @@
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker;
+import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData;
@@ -46,7 +48,7 @@
* walker, performing the operations:
*
* - nthPassApply(n = 0)
- * - if variant/alleles pass filters and variant-type/overlapping-resource checks, then:
+ * - if variant/alleles pass filters and variant-type/resource-match checks, then:
* - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection
* - write variant/alleles with labels appended to a sites-only VCF file
* - afterNthPass(n = 0)
@@ -89,6 +91,7 @@ public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVarian
public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter";
public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters";
public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic";
+ public static final String RESOURCE_MATCHING_STRATEGY_LONG_NAME = "resource-matching-strategy";
public static final String OMIT_ALLELES_IN_HDF5_LONG_NAME = "omit-alleles-in-hdf5";
public static final String DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME = "do-not-gzip-vcf-output";
@@ -96,6 +99,10 @@ public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVarian
public static final String RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING = "This site was labeled as %s according to resources";
+ enum ResourceMatchingStrategy {
+ START_POSITION, START_POSITION_AND_GIVEN_REPRESENTATION, START_POSITION_AND_MINIMAL_REPRESENTATION
+ }
+
@Argument(
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
@@ -144,10 +151,24 @@ public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVarian
@Argument(
fullName = DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME,
doc = "If true, do not trust that unfiltered records in the resources contain only polymorphic sites. " +
- "This may increase runtime.",
+ "This may increase runtime if the resources are not sites-only VCFs.",
optional = true)
private boolean doNotTrustAllPolymorphic = false;
+
+ @Argument(
+ fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME,
+ doc = "The strategy to use for determining whether an input variant is present in a resource " +
+ "in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " +
+ "START_POSITION: Start positions of input and resource variants must match. " +
+ "START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " +
+ "(in their given representations) must also be non-empty. " +
+ "START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " +
+ "(after converting alleles to their minimal representations) must also be non-empty. " +
+ "This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " +
+ "in which the minimal representations of the input and resource alleles must match.",
+ optional = true)
+ private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION;
@Argument(
fullName = OMIT_ALLELES_IN_HDF5_LONG_NAME,
doc = "If true, omit alleles in output HDF5 files in order to decrease file sizes.",
@@ -283,7 +304,9 @@ VCFHeader constructVCFHeader(final List sortedLabels) {
.collect(Collectors.toCollection(TreeSet::new));
hInfo.add(GATKVCFHeaderLines.getFilterLine(VCFConstants.PASSES_FILTERS_v4));
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
- hInfo = VcfUtils.updateHeaderContigLines(hInfo, null, sequenceDictionary, true);
+ if (sequenceDictionary != null) {
+ hInfo = VcfUtils.updateHeaderContigLines(hInfo, referenceArguments.getReferencePath(), sequenceDictionary, true);
+ }
hInfo.addAll(getDefaultToolVCFHeaderLines());
return new VCFHeader(hInfo);
}
@@ -303,57 +326,72 @@ final List, VariantType, TreeSet>> extractVariantMet
}
if (!useASAnnotations) {
// in non-allele-specific mode, get a singleton list of the triple
- // (list of alt alleles passing variant-type and overlapping-resource checks, variant type, set of labels)
+ // (list of alt alleles passing variant-type and resource-match checks, variant type, set of labels)
final VariantType variantType = VariantType.getVariantType(vc);
if (variantTypesToExtract.contains(variantType)) {
- final TreeSet overlappingResourceLabels = findOverlappingResourceLabels(vc, null, null, featureContext);
- if (isExtractUnlabeled || !overlappingResourceLabels.isEmpty()) {
- return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, overlappingResourceLabels));
+ final TreeSet matchingResourceLabels = findMatchingResourceLabels(vc, null, featureContext);
+ if (isExtractUnlabeled || !matchingResourceLabels.isEmpty()) {
+ return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, matchingResourceLabels));
}
}
} else {
// in allele-specific mode, get a list containing the triples
// (singleton list of alt allele, variant type, set of labels)
- // corresponding to alt alleles that pass variant-type and overlapping-resource checks
+ // corresponding to alt alleles that pass variant-type and resource-match checks
return vc.getAlternateAlleles().stream()
.filter(a -> !GATKVCFConstants.isSpanningDeletion(a))
.filter(a -> variantTypesToExtract.contains(VariantType.getAlleleSpecificVariantType(vc, a)))
.map(a -> Triple.of(Collections.singletonList(a), VariantType.getAlleleSpecificVariantType(vc, a),
- findOverlappingResourceLabels(vc, vc.getReference(), a, featureContext)))
+ findMatchingResourceLabels(vc, a, featureContext)))
.filter(t -> isExtractUnlabeled || !t.getRight().isEmpty())
.collect(Collectors.toList());
}
- // if variant-type and overlapping-resource checks failed, return an empty list
+ // if variant-type and resource-match checks failed, return an empty list
return Collections.emptyList();
}
- private TreeSet findOverlappingResourceLabels(final VariantContext vc,
- final Allele refAllele,
- final Allele altAllele,
- final FeatureContext featureContext) {
- final TreeSet overlappingResourceLabels = new TreeSet<>();
+ private TreeSet findMatchingResourceLabels(final VariantContext vc,
+ final Allele altAllele,
+ final FeatureContext featureContext) {
+ final TreeSet matchingResourceLabels = new TreeSet<>();
for (final FeatureInput resource : resources) {
final List resourceVCs = featureContext.getValues(resource, featureContext.getInterval().getStart());
for (final VariantContext resourceVC : resourceVCs) {
- if (useASAnnotations && !doAllelesMatch(refAllele, altAllele, resourceVC)) {
+ if (useASAnnotations && !doAllelesMatch(vc.getReference(), altAllele, resourceVC)) {
continue;
}
- if (isValidVariant(vc, resourceVC, !doNotTrustAllPolymorphic)) {
+ if (isMatchingVariant(vc, resourceVC, !doNotTrustAllPolymorphic, resourceMatchingStrategy)) {
resource.getTagAttributes().entrySet().stream()
.filter(e -> e.getValue().equals("true"))
.map(Map.Entry::getKey)
- .forEach(overlappingResourceLabels::add);
+ .forEach(matchingResourceLabels::add);
}
}
}
- return overlappingResourceLabels;
+ return matchingResourceLabels;
}
- private static boolean isValidVariant(final VariantContext vc,
- final VariantContext resourceVC,
- final boolean trustAllPolymorphic) {
- return resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) &&
- (trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples());
+ private static boolean isMatchingVariant(final VariantContext vc,
+ final VariantContext resourceVC,
+ final boolean trustAllPolymorphic,
+ final ResourceMatchingStrategy resourceMatchingStrategy) {
+ if (resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) &&
+ (trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples())) { // this is the check originally performed by VQSR
+ switch (resourceMatchingStrategy) {
+ case START_POSITION:
+ return true;
+ case START_POSITION_AND_GIVEN_REPRESENTATION:
+ // we further require that at least one input alt allele also appears among the resource alt alleles; only the alt alleles are compared, exactly as given (representations are not reconciled)
+ return !Sets.intersection(Sets.newHashSet(vc.getAlternateAlleles()), Sets.newHashSet(resourceVC.getAlternateAlleles())).isEmpty();
+ case START_POSITION_AND_MINIMAL_REPRESENTATION:
+ // we further require that at least one input alt allele matches a resource alt allele, with representations reconciled by passing both reference alleles to GATKVariantContextUtils.isAlleleInList
+ return vc.getAlternateAlleles().stream()
+ .anyMatch(altAllele -> GATKVariantContextUtils.isAlleleInList(vc.getReference(), altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles()));
+ default:
+ throw new GATKException.ShouldNeverReachHereException("Unknown ResourceMatchingStrategy.");
+ }
+ }
+ return false;
}
private static boolean doAllelesMatch(final Allele refAllele,
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java
index 4aeb0ba236f..fbbbed81faf 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java
@@ -190,12 +190,12 @@
* -A annotation_N \
* --model-prefix model_dir \
* --mode SNP \
- * --resource snp-training,training=true snp-training.vcf \
- * --resource snp-calibration,calibration=true snp-calibration.vcf \
+ * --resource:snp-training,training=true snp-training.vcf \
+ * --resource:snp-calibration,calibration=true snp-calibration.vcf \
* --mode INDEL \
- * --resource indel-training,training=true indel-training.vcf \
- * --resource indel-calibration,calibration=true indel-calibration.vcf \
- * --resource extracted,extracted=true extract.vcf.gz \
+ * --resource:indel-training,training=true indel-training.vcf \
+ * --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ * --resource:extracted,extracted=true extract.vcf.gz \
* --snp-calibration-sensitivity-threshold 0.99 \
* --indel-calibration-sensitivity-threshold 0.99 \
* -O output
@@ -216,9 +216,9 @@
* -A snp_annotation_N \
* --model-prefix model_dir \
* --mode SNP \
- * --resource snp-training,training=true snp-training.vcf \
- * --resource snp-calibration,calibration=true snp-calibration.vcf \
- * --resource extracted,extracted=true snp-extract.vcf.gz \
+ * --resource:snp-training,training=true snp-training.vcf \
+ * --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ * --resource:extracted,extracted=true snp-extract.vcf.gz \
* --snp-calibration-sensitivity-threshold 0.99 \
* -O intermediate-output
*
@@ -229,9 +229,9 @@
* -A indel_annotation_M \
* --model-prefix model_dir \
* --mode INDEL \
- * --resource indel-training,training=true indel-training.vcf \
- * --resource indel-calibration,calibration=true indel-calibration.vcf \
- * --resource extracted,extracted=true indel-extract.vcf.gz \
+ * --resource:indel-training,training=true indel-training.vcf \
+ * --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ * --resource:extracted,extracted=true indel-extract.vcf.gz \
* --indel-calibration-sensitivity-threshold 0.99 \
* -O output
*