-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
make VariantContextBuilder safer #1344
Merged
Merged
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
625e4f4
-added some testing around VariantContextBuilder, to show that many u…
dc03bb7
- tests were bad....results now are not as scary.
5d310a2
- more tests to cover bad useages
143fc8e
- fixes broken tests except for the genotype context ones
4eb67f5
- fixes broken GenotypeContext test.
46e5b9c
- had a problem with unfiltered versus filtered with empty maps.
7c91604
- fixed straggling failing test
3a7518e
- responding to review comments
2aa9d3b
- added a test from issue https://github.com/samtools/htsjdk/issues/1365
b2730d8
- responding to review comments
266a62f
- responded to reivew comments.
0a03bda
- made comment and test clearer
74eea13
- oops typo in unrelated file
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,7 +36,17 @@ | |
import htsjdk.variant.vcf.VCFHeaderLineType; | ||
|
||
import java.io.Serializable; | ||
import java.util.*; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.Comparator; | ||
import java.util.EnumSet; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.regex.Pattern; | ||
import java.util.stream.Collectors; | ||
|
||
/** | ||
|
@@ -219,33 +229,33 @@ | |
public class VariantContext implements Feature, Serializable { | ||
public static final long serialVersionUID = 1L; | ||
|
||
private final static boolean WARN_ABOUT_BAD_END = true; | ||
private final static int MAX_ALLELE_SIZE_FOR_NON_SV = 150; | ||
private static final boolean WARN_ABOUT_BAD_END = true; | ||
private static final int MAX_ALLELE_SIZE_FOR_NON_SV = 150; | ||
private boolean fullyDecoded = false; | ||
protected CommonInfo commonInfo = null; | ||
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; | ||
public static final double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; | ||
|
||
public final static Set<String> PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet<String>()); | ||
public static final Set<String> PASSES_FILTERS = Collections.emptySet(); | ||
|
||
/** The location of this VariantContext */ | ||
final protected String contig; | ||
final protected long start; | ||
final protected long stop; | ||
protected final String contig; | ||
protected final long start; | ||
protected final long stop; | ||
private final String ID; | ||
|
||
/** The type (cached for performance reasons) of this context */ | ||
protected Type type = null; | ||
|
||
/** A set of the alleles segregating in this context */ | ||
final protected List<Allele> alleles; | ||
protected final List<Allele> alleles; | ||
|
||
/** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ | ||
protected GenotypesContext genotypes = null; | ||
|
||
/** Counts for each of the possible Genotype types in this context */ | ||
protected int[] genotypeCounts = null; | ||
|
||
public final static GenotypesContext NO_GENOTYPES = GenotypesContext.NO_GENOTYPES; | ||
public static final GenotypesContext NO_GENOTYPES = GenotypesContext.NO_GENOTYPES; | ||
|
||
// a fast cached access point to the ref / alt alleles for biallelic case | ||
private Allele REF = null; | ||
|
@@ -257,9 +267,10 @@ public class VariantContext implements Feature, Serializable { | |
private Boolean monomorphic = null; | ||
|
||
/* | ||
* Determine which genotype fields are in use in the genotypes in VC | ||
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first | ||
*/ | ||
* Determine which genotype fields are in use in the genotypes in VC | ||
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first | ||
*/ | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. weird extra linebreaks. |
||
public List<String> calcVCFGenotypeKeys(final VCFHeader header) { | ||
final Set<String> keys = new HashSet<>(); | ||
|
||
|
@@ -310,12 +321,91 @@ public List<String> calcVCFGenotypeKeys(final VCFHeader header) { | |
// | ||
// --------------------------------------------------------------------------------------------------------- | ||
|
||
//no controls and white-spaces characters, no semicolon. | ||
public static final Pattern VALID_FILTER = Pattern.compile("^[!-:<-~]+$"); | ||
|
||
public enum Validation { | ||
ALLELES, | ||
GENOTYPES | ||
ALLELES() { | ||
void validate(VariantContext variantContext) { | ||
validateAlleles(variantContext); | ||
} | ||
}, | ||
GENOTYPES() { | ||
void validate(VariantContext variantContext) { | ||
validateGenotypes(variantContext); | ||
} | ||
}, | ||
FILTERS { | ||
void validate(VariantContext variantContext) { | ||
validateFilters(variantContext); | ||
} | ||
}; | ||
|
||
abstract void validate(VariantContext variantContext); | ||
|
||
|
||
private static void validateAlleles(final VariantContext vc) { | ||
|
||
boolean alreadySeenRef = false; | ||
|
||
for (final Allele allele : vc.alleles) { | ||
// make sure there's only one reference allele | ||
if (allele.isReference()) { | ||
if (alreadySeenRef) { | ||
throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + vc.alleles + " vc=" + vc); | ||
} | ||
alreadySeenRef = true; | ||
} | ||
|
||
if (allele.isNoCall()) { | ||
throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + vc.alleles + " vc=" + vc); | ||
} | ||
} | ||
|
||
// make sure there's one reference allele | ||
if (!alreadySeenRef) { | ||
throw new IllegalArgumentException("No reference allele found in VariantContext"); | ||
} | ||
} | ||
|
||
private static void validateGenotypes(final VariantContext variantContext) { | ||
|
||
final ArrayList<Genotype> genotypes = variantContext.genotypes.getGenotypes(); | ||
|
||
if (genotypes == null) { | ||
throw new IllegalStateException("Genotypes is null"); | ||
} | ||
|
||
for (int i = 0; i < genotypes.size(); i++) { | ||
final Genotype genotype = genotypes.get(i); | ||
if (genotype.isAvailable()) { | ||
final List<Allele> alleles = genotype.getAlleles(); | ||
for (int j = 0, size = alleles.size(); j < size; j++) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now that I'm reading this code, due to the move, I have a comment. It looks like this can be replaced with a foreach loop, e.g.
|
||
final Allele gAllele = alleles.get(j); | ||
if (!variantContext.hasAllele(gAllele) && gAllele.isCalled()) { | ||
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
private static void validateFilters(final VariantContext variantContext) { | ||
final Set<String> filters = variantContext.getFilters(); | ||
if (filters == null) { | ||
return; | ||
} | ||
|
||
for (String filter : filters) { | ||
if (!VALID_FILTER.matcher(filter).matches()) { | ||
throw new IllegalStateException("Filter '" + filter + | ||
"' contains an illegal character. It must conform to the regex ;'" + VALID_FILTER); | ||
} | ||
} | ||
} | ||
} | ||
|
||
private final static EnumSet<Validation> NO_VALIDATION = EnumSet.noneOf(Validation.class); | ||
private static final EnumSet<Validation> NO_VALIDATION = EnumSet.noneOf(Validation.class); | ||
|
||
// --------------------------------------------------------------------------------------------------------- | ||
// | ||
|
@@ -1202,7 +1292,7 @@ public void validateAlternateAlleles() { | |
|
||
// maintain a list of non-symbolic alleles expected in the REF and ALT fields of the record | ||
// (we exclude symbolic alleles because it's commonly expected that they don't show up in the genotypes, e.g. with GATK gVCFs) | ||
final List<Allele> reportedAlleles = new ArrayList<Allele>(); | ||
final List<Allele> reportedAlleles = new ArrayList<>(); | ||
for ( final Allele allele : getAlleles() ) { | ||
if ( !allele.isSymbolic() ) | ||
reportedAlleles.add(allele); | ||
|
@@ -1290,17 +1380,9 @@ public void validateChromosomeCounts() { | |
// | ||
// --------------------------------------------------------------------------------------------------------- | ||
|
||
private boolean validate(final EnumSet<Validation> validationToPerform) { | ||
private void validate(final EnumSet<Validation> validationsToPerform) { | ||
validateStop(); | ||
for (final Validation val : validationToPerform ) { | ||
switch (val) { | ||
case ALLELES: validateAlleles(); break; | ||
case GENOTYPES: validateGenotypes(); break; | ||
default: throw new IllegalArgumentException("Unexpected validation mode " + val); | ||
} | ||
} | ||
|
||
return true; | ||
validationsToPerform.forEach(v->v.validate(this)); | ||
} | ||
|
||
/** | ||
|
@@ -1316,56 +1398,18 @@ private void validateStop() { | |
+ " but this VariantContext contains an END key with value " + end; | ||
if ( GeneralUtils.DEBUG_MODE_ENABLED && WARN_ABOUT_BAD_END ) { | ||
System.err.println(message); | ||
} | ||
else { | ||
} else { | ||
throw new TribbleException(message); | ||
} | ||
} | ||
} else { | ||
final long length = (stop - start) + 1; | ||
if ( ! hasSymbolicAlleles() && length != getReference().length() ) { | ||
if (!hasSymbolicAlleles() && length != getReference().length()) { | ||
throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this); | ||
} | ||
} | ||
} | ||
|
||
private void validateAlleles() { | ||
|
||
boolean alreadySeenRef = false; | ||
|
||
for ( final Allele allele : alleles ) { | ||
// make sure there's only one reference allele | ||
if ( allele.isReference() ) { | ||
if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this); | ||
alreadySeenRef = true; | ||
} | ||
|
||
if ( allele.isNoCall() ) { | ||
throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this); | ||
} | ||
} | ||
|
||
// make sure there's one reference allele | ||
if ( ! alreadySeenRef ) | ||
throw new IllegalArgumentException("No reference allele found in VariantContext"); | ||
} | ||
|
||
private void validateGenotypes() { | ||
if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null"); | ||
|
||
for ( int i = 0; i < genotypes.size(); i++ ) { | ||
Genotype genotype = genotypes.get(i); | ||
if ( genotype.isAvailable() ) { | ||
final List<Allele> alleles = genotype.getAlleles(); | ||
for ( int j = 0, size = alleles.size(); j < size; j++ ) { | ||
final Allele gAllele = alleles.get(j); | ||
if ( ! hasAllele(gAllele) && gAllele.isCalled() ) | ||
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles); | ||
} | ||
} | ||
} | ||
} | ||
|
||
// --------------------------------------------------------------------------------------------------------- | ||
// | ||
// utility routines | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
{}