-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
remove caching of alleles in AbstractVCFCodec #1282
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -294,16 +294,44 @@ public static boolean wouldBeNoCallAllele(final byte[] bases) { | |||||
|
||||||
/** | ||||||
* @param bases bases representing an allele | ||||||
* @return true if the bases represent a symbolic allele | ||||||
* @return true if the bases represent a symbolic allele, including breakpoints and breakends | ||||||
*/ | ||||||
public static boolean wouldBeSymbolicAllele(final byte[] bases) { | ||||||
if ( bases.length <= 1 ) | ||||||
return false; | ||||||
else { | ||||||
final String strBases = new String(bases); | ||||||
return (bases[0] == '<' || bases[bases.length-1] == '>') || // symbolic or large insertion | ||||||
(bases[0] == '.' || bases[bases.length-1] == '.') || // single breakend | ||||||
(strBases.contains("[") || strBases.contains("]")); // mated breakend | ||||||
return bases[0] == '<' || bases[bases.length - 1] == '>' || | ||||||
wouldBeBreakpoint(bases) || | ||||||
wouldBeSingleBreakend(bases); | ||||||
} | ||||||
} | ||||||
|
||||||
/** | ||||||
* @param bases bases representing an allele | ||||||
* @return true if the bases represent a symbolic allele in breakpoint notation, (ex: G]17:198982] or ]13:123456]T ) | ||||||
*/ | ||||||
public static boolean wouldBeBreakpoint(final byte[] bases) { | ||||||
if ( bases.length <= 1 ) | ||||||
lbergelson marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
return false; | ||||||
else { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unnecessary There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||
for (int i = 0; i < bases.length; i++) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can use java foreach here
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We've been finding noticeable runtime costs for using foreach in hotspots (seemingly because it instantiates a new iterator each time which is much more costly than a fast for loop). Since Alleles are created in a lot of places I figured I'd stick with the for loop even though it's gross... That said, I haven't profiled this particular instance in any sensible way so I can't prove that this is any faster. |
||||||
final byte base = bases[i]; | ||||||
if (base == '[' || base == ']') { | ||||||
return true; | ||||||
} | ||||||
} | ||||||
return false; | ||||||
} | ||||||
} | ||||||
/** | ||||||
* @param bases bases representing an allele | ||||||
* @return true if the bases represent a symbolic allele in single breakend notation (ex: .A or A. ) | ||||||
*/ | ||||||
public static boolean wouldBeSingleBreakend(final byte[] bases) { | ||||||
if ( bases.length <= 1 ) | ||||||
return false; | ||||||
else { | ||||||
return bases[0] == '.' || bases[bases.length - 1] == '.'; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be nice to use constants instead of the magic characters There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've extracted some... I can't decide if it's an improvement or not... You can take a look when I push the changes. |
||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -408,19 +436,26 @@ public static Allele create(final Allele allele, final boolean ignoreRefState) { | |||||
// | ||||||
// --------------------------------------------------------------------------------------------------------- | ||||||
|
||||||
// Returns true if this is the NO_CALL allele | ||||||
/** @return true if this is the NO_CALL allele */ | ||||||
public boolean isNoCall() { return isNoCall; } | ||||||
// Returns true if this is not the NO_CALL allele | ||||||
public boolean isCalled() { return ! isNoCall(); } | ||||||
|
||||||
// Returns true if this Allele is the reference allele | ||||||
/** @return true if this Allele is the reference allele */ | ||||||
public boolean isReference() { return isRef; } | ||||||
// Returns true if this Allele is not the reference allele | ||||||
|
||||||
/** @return true if this Allele is not the reference allele */ | ||||||
public boolean isNonReference() { return ! isReference(); } | ||||||
|
||||||
// Returns true if this Allele is symbolic (i.e. no well-defined base sequence) | ||||||
/** @return true if this Allele is symbolic (i.e. no well-defined base sequence), this includes breakpoints and breakends */ | ||||||
public boolean isSymbolic() { return isSymbolic; } | ||||||
|
||||||
/** @return true if this Allele is a breakpoint ( ex: G]17:198982] or ]13:123456]T ) */ | ||||||
public boolean isBreakpoint() { return wouldBeBreakpoint(bases); } | ||||||
|
||||||
/** @return true if this Allele is a single breakend (ex: .A or A.) */ | ||||||
public boolean isSingleBreakend() { return wouldBeSingleBreakend(bases); } | ||||||
|
||||||
// Returns a nice string representation of this object | ||||||
public String toString() { | ||||||
return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : ""); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,7 +64,7 @@ | |
public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec { | ||
public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); | ||
|
||
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column | ||
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th | ||
|
||
// we have to store the list of strings that make up the header until they're needed | ||
protected VCFHeader header = null; | ||
|
@@ -332,12 +332,12 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) | |
else | ||
builder.id(parts[2]); | ||
|
||
final String ref = getCachedString(parts[3].toUpperCase()); | ||
final String alts = getCachedString(parts[4]); | ||
final String ref = parts[3].toUpperCase(); | ||
final String alts = parts[4]; | ||
builder.log10PError(parseQual(parts[5])); | ||
|
||
final List<String> filters = parseFilters(getCachedString(parts[6])); | ||
if ( filters != null ) builder.filters(new HashSet<String>(filters)); | ||
if ( filters != null ) builder.filters(new HashSet<>(filters)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. may as well format the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
final Map<String, Object> attrs = parseInfo(parts[7]); | ||
builder.attributes(attrs); | ||
|
||
|
@@ -577,7 +577,7 @@ private static void checkAllele(String allele, boolean isRef, int lineNo) { | |
System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); | ||
} | ||
|
||
if ( isSymbolicAllele(allele) ) { | ||
if (Allele.wouldBeSymbolicAllele(allele.getBytes())) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This check is done before calling the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, there's a big redundant section of allele checking code here that I believe is redundant with the constructor. I wanted to make this as small a change set as possible though. I started a follow up branch that cleans up some of this stuff but I figured I'd get this one in before that. |
||
if ( isRef ) { | ||
generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); | ||
} | ||
|
@@ -609,18 +609,6 @@ private static String generateExceptionTextForBadAlleleBases(final String allele | |
return "unparsable vcf record with allele " + allele; | ||
} | ||
|
||
/** | ||
* return true if this is a symbolic allele (e.g. <SOMETAG>) or | ||
* structural variation breakend (with [ or ]), otherwise false | ||
* @param allele the allele to check | ||
* @return true if the allele is a symbolic allele, otherwise false | ||
*/ | ||
private static boolean isSymbolicAllele(String allele) { | ||
return (allele != null && allele.length() > 2 && | ||
((allele.startsWith("<") && allele.endsWith(">")) || | ||
(allele.contains("[") || allele.contains("]")))); | ||
} | ||
|
||
/** | ||
* parse a single allele, given the allele list | ||
* @param alleles the alleles available | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -277,4 +277,38 @@ public void testExtend() { | |||||
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString()); | ||||||
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString()); | ||||||
} | ||||||
@Test | ||||||
public void testWouldBeSymbolic() { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could rewrite these tests using a testng data provider
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, I didn't do all the tests in this file though, just the bottom ones.. |
||||||
Assert.assertTrue(Allele.wouldBeSymbolicAllele("<DEL>".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeSymbolicAllele("AAAAAA[chr1:1234[".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeSymbolicAllele("AAAAAA]chr1:1234]".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeSymbolicAllele("A.".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeSymbolicAllele(".A".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSymbolicAllele("AA".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSymbolicAllele("A".getBytes())); | ||||||
} | ||||||
@Test | ||||||
public void testWouldBeBreakpoint() { | ||||||
Assert.assertTrue(Allele.wouldBeBreakpoint("G]17:198982]".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeBreakpoint("]13:123456]T".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeBreakpoint("<DEL>".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeBreakpoint("AAAAAA[chr1:1234[".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeBreakpoint("AAAAAA]chr1:1234]".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeBreakpoint("A.".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeBreakpoint(".A".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeBreakpoint("AA".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeBreakpoint("A".getBytes())); | ||||||
} | ||||||
@Test | ||||||
public void testWouldBeBreakend() { | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("G]17:198982]".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("]13:123456]T".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("<DEL>".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("AAAAAA[chr1:1234[".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("AAAAAA]chr1:1234]".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeSingleBreakend("A.".getBytes())); | ||||||
Assert.assertTrue(Allele.wouldBeSingleBreakend(".A".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("AA".getBytes())); | ||||||
Assert.assertFalse(Allele.wouldBeSingleBreakend("A".getBytes())); | ||||||
} | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The use of
would
here confuses me. If this is a breakpoint allele string, then why notis
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, it's weird, but it matches the existing methods. I think changing 1 of them is worse than having this weird convention. We could change all of them though...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's ok, I didn't see the rest of them.