Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove caching of alleles in AbstractVCFCodec #1282

Merged
merged 5 commits into from
Feb 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 48 additions & 9 deletions src/main/java/htsjdk/variant/variantcontext/Allele.java
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,11 @@ public class Allele implements Comparable<Allele>, Serializable {
public static final long serialVersionUID = 1L;

private static final byte[] EMPTY_ALLELE_BASES = new byte[0];
private static final char SINGLE_BREAKEND_INDICATOR = '.';
private static final char BREAKEND_EXTENDING_RIGHT = '[';
private static final char BREAKEND_EXTENDING_LEFT = ']';
private static final char SYMBOLIC_ALLELE_START = '<';
private static final char SYMBOLIC_ALLELE_END = '>';

private boolean isRef = false;
private boolean isNoCall = false;
Expand Down Expand Up @@ -294,16 +299,43 @@ public static boolean wouldBeNoCallAllele(final byte[] bases) {

/**
* @param bases bases representing an allele
* @return true if the bases represent a symbolic allele
* @return true if the bases represent a symbolic allele, including breakpoints and breakends
*/
public static boolean wouldBeSymbolicAllele(final byte[] bases) {
if ( bases.length <= 1 )
return false;
else {
final String strBases = new String(bases);
return (bases[0] == '<' || bases[bases.length-1] == '>') || // symbolic or large insertion
(bases[0] == '.' || bases[bases.length-1] == '.') || // single breakend
(strBases.contains("[") || strBases.contains("]")); // mated breakend
return bases[0] == SYMBOLIC_ALLELE_START || bases[bases.length - 1] == SYMBOLIC_ALLELE_END ||
wouldBeBreakpoint(bases) ||
wouldBeSingleBreakend(bases);
}
}

/**
* @param bases bases representing an allele
* @return true if the bases represent a symbolic allele in breakpoint notation, (ex: G]17:198982] or ]13:123456]T )
*/
public static boolean wouldBeBreakpoint(final byte[] bases) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use of would here confuses me. If this is a breakpoint allele string, then why not is?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, it's weird, but it matches the existing methods. I think changing 1 of them is worse than having this weird convention. We could change all of them though...

Copy link
Contributor

@pshapiro4broad pshapiro4broad Feb 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's ok, I didn't see the rest of them.

if (bases.length <= 1) {
return false;
}
for (int i = 0; i < bases.length; i++) {
final byte base = bases[i];
if (base == BREAKEND_EXTENDING_LEFT || base == BREAKEND_EXTENDING_RIGHT) {
return true;
}
}
return false;
}
/**
* @param bases bases representing an allele
* @return true if the bases represent a symbolic allele in single breakend notation (ex: .A or A. )
*/
public static boolean wouldBeSingleBreakend(final byte[] bases) {
if ( bases.length <= 1 )
return false;
else {
return bases[0] == SINGLE_BREAKEND_INDICATOR || bases[bases.length - 1] == SINGLE_BREAKEND_INDICATOR;
}
}

Expand Down Expand Up @@ -408,19 +440,26 @@ public static Allele create(final Allele allele, final boolean ignoreRefState) {
//
// ---------------------------------------------------------------------------------------------------------

// Returns true if this is the NO_CALL allele
/** @return true if this is the NO_CALL allele */
public boolean isNoCall() { return isNoCall; }
// Returns true if this is not the NO_CALL allele
public boolean isCalled() { return ! isNoCall(); }

// Returns true if this Allele is the reference allele
/** @return true if this Allele is the reference allele */
public boolean isReference() { return isRef; }
// Returns true if this Allele is not the reference allele

/** @return true if this Allele is not the reference allele */
public boolean isNonReference() { return ! isReference(); }

// Returns true if this Allele is symbolic (i.e. no well-defined base sequence)
/** @return true if this Allele is symbolic (i.e. no well-defined base sequence), this includes breakpoints and breakends */
public boolean isSymbolic() { return isSymbolic; }

/** @return true if this Allele is a breakpoint ( ex: G]17:198982] or ]13:123456]T ) */
public boolean isBreakpoint() { return wouldBeBreakpoint(bases); }

/** @return true if this Allele is a single breakend (ex: .A or A.) */
public boolean isSingleBreakend() { return wouldBeSingleBreakend(bases); }

// Returns a nice string representation of this object
public String toString() {
return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : "");
Expand Down
24 changes: 7 additions & 17 deletions src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
public abstract class AbstractVCFCodec extends AsciiFeatureCodec<VariantContext> implements NameAwareCodec {
public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20);

protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column
protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th

// we have to store the list of strings that make up the header until they're needed
protected VCFHeader header = null;
Expand Down Expand Up @@ -332,12 +332,14 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) )
else
builder.id(parts[2]);

final String ref = getCachedString(parts[3].toUpperCase());
final String alts = getCachedString(parts[4]);
final String ref = parts[3].toUpperCase();
final String alts = parts[4];
builder.log10PError(parseQual(parts[5]));

final List<String> filters = parseFilters(getCachedString(parts[6]));
if ( filters != null ) builder.filters(new HashSet<String>(filters));
if ( filters != null ) {
builder.filters(new HashSet<>(filters));
}
final Map<String, Object> attrs = parseInfo(parts[7]);
builder.attributes(attrs);

Expand Down Expand Up @@ -577,7 +579,7 @@ private static void checkAllele(String allele, boolean isRef, int lineNo) {
System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo));
}

if ( isSymbolicAllele(allele) ) {
if (Allele.wouldBeSymbolicAllele(allele.getBytes())) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This check is done before calling the Allele constructor, but it appears to be ensuring that we create a valid Allele. Is there a reason why this code isn't part of the Allele constructor? We would have to wrap the constructor call in a try/catch to get a message printed out with the VCF line number, though.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, there's a big redundant section of allele checking code here that I believe is redundant with the constructor. I wanted to make this as small a change set as possible though. I started a follow up branch that cleans up some of this stuff but I figured I'd get this one in before that.

if ( isRef ) {
generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo);
}
Expand Down Expand Up @@ -609,18 +611,6 @@ private static String generateExceptionTextForBadAlleleBases(final String allele
return "unparsable vcf record with allele " + allele;
}

/**
* return true if this is a symbolic allele (e.g. <SOMETAG>) or
* structural variation breakend (with [ or ]), otherwise false
* @param allele the allele to check
* @return true if the allele is a symbolic allele, otherwise false
*/
private static boolean isSymbolicAllele(String allele) {
return (allele != null && allele.length() > 2 &&
((allele.startsWith("<") && allele.endsWith(">")) ||
(allele.contains("[") || allele.contains("]"))));
}

/**
* parse a single allele, given the allele list
* @param alleles the alleles available
Expand Down
54 changes: 47 additions & 7 deletions src/test/java/htsjdk/variant/variantcontext/AlleleUnitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@

import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

// public Allele(byte[] bases, boolean isRef) {
Expand Down Expand Up @@ -269,12 +270,51 @@ public void testBadSpanningDeletionAllelel() {
Allele.create(Allele.SPAN_DEL_STRING, true); // spanning deletion cannot be ref allele
}

@Test
public void testExtend() {
Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString());
Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString());
Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString());
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString());
Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString());
@DataProvider
public Object[][] getExtendTests() {
return new Object[][]{
{Allele.create("A"), "T", "AT"},
{Allele.create("A"), "TA", "ATA"},
{Allele.NO_CALL, "A", "A"},
{Allele.create("AT"), "CGA", "ATCGA"},
{Allele.create("ATC"), "GA", "ATCGA"}
};
}

@Test(dataProvider = "getExtendTests")
public void testExtend(Allele toExtend, String extension, String expected) {
final Allele extended = Allele.extend(toExtend, extension.getBytes());
Assert.assertEquals(extended, Allele.create(expected));
}

@DataProvider
public Object[][] getTestCasesForCheckingSymbolicAlleles(){
return new Object[][]{
//allele, isSymbolic, isBreakpoint, isSingleBreakend
{"<DEL>", true, false, false},
{"G]17:198982]", true, true, false},
{"]13:123456]T", true, true, false},
{"AAAAAA[chr1:1234[", true, true, false},
{"AAAAAA]chr1:1234]", true, true, false},
{"A.", true, false, true},
{".A", true, false, true},
{"AA", false, false, false},
{"A", false, false, false}
};
}

@Test(dataProvider = "getTestCasesForCheckingSymbolicAlleles")
public void testWouldBeSymbolic(String baseString, boolean isSymbolic, boolean isBreakpoint, boolean isBreakend) {
Assert.assertEquals(Allele.wouldBeSymbolicAllele(baseString.getBytes()), isSymbolic);
}

@Test(dataProvider = "getTestCasesForCheckingSymbolicAlleles")
public void testWouldBeBreakpoint(String baseString, boolean isSymbolic, boolean isBreakpoint, boolean isBreakend) {
Assert.assertEquals(Allele.wouldBeBreakpoint(baseString.getBytes()), isBreakpoint);
}

@Test(dataProvider = "getTestCasesForCheckingSymbolicAlleles")
public void testWouldBeBreakend(String baseString, boolean isSymbolic, boolean isBreakpoint, boolean isBreakend) {
Assert.assertEquals(Allele.wouldBeSingleBreakend(baseString.getBytes()), isBreakend);
}
}