diff --git a/src/main/java/htsjdk/variant/variantcontext/Allele.java b/src/main/java/htsjdk/variant/variantcontext/Allele.java index 0151a21af2..387a8f7437 100644 --- a/src/main/java/htsjdk/variant/variantcontext/Allele.java +++ b/src/main/java/htsjdk/variant/variantcontext/Allele.java @@ -118,6 +118,11 @@ public class Allele implements Comparable, Serializable { public static final long serialVersionUID = 1L; private static final byte[] EMPTY_ALLELE_BASES = new byte[0]; + private static final char SINGLE_BREAKEND_INDICATOR = '.'; + private static final char BREAKEND_EXTENDING_RIGHT = '['; + private static final char BREAKEND_EXTENDING_LEFT = ']'; + private static final char SYMBOLIC_ALLELE_START = '<'; + private static final char SYMBOLIC_ALLELE_END = '>'; private boolean isRef = false; private boolean isNoCall = false; @@ -294,16 +299,43 @@ public static boolean wouldBeNoCallAllele(final byte[] bases) { /** * @param bases bases representing an allele - * @return true if the bases represent a symbolic allele + * @return true if the bases represent a symbolic allele, including breakpoints and breakends */ public static boolean wouldBeSymbolicAllele(final byte[] bases) { if ( bases.length <= 1 ) return false; else { - final String strBases = new String(bases); - return (bases[0] == '<' || bases[bases.length-1] == '>') || // symbolic or large insertion - (bases[0] == '.' || bases[bases.length-1] == '.') || // single breakend - (strBases.contains("[") || strBases.contains("]")); // mated breakend + return bases[0] == SYMBOLIC_ALLELE_START || bases[bases.length - 1] == SYMBOLIC_ALLELE_END || + wouldBeBreakpoint(bases) || + wouldBeSingleBreakend(bases); + } + } + + /** + * @param bases bases representing an allele + * @return true if the bases represent a symbolic allele in breakpoint notation, (ex: G]17:198982] or ]13:123456]T ) + */ + public static boolean wouldBeBreakpoint(final byte[] bases) { + if (bases.length <= 1) { + return false; + } + for (int i = 0; i < bases.length; i++) { + final byte base = bases[i]; + if (base == BREAKEND_EXTENDING_LEFT || base == BREAKEND_EXTENDING_RIGHT) { + return true; + } + } + return false; + } + /** + * @param bases bases representing an allele + * @return true if the bases represent a symbolic allele in single breakend notation (ex: .A or A. ) + */ + public static boolean wouldBeSingleBreakend(final byte[] bases) { + if ( bases.length <= 1 ) + return false; + else { + return bases[0] == SINGLE_BREAKEND_INDICATOR || bases[bases.length - 1] == SINGLE_BREAKEND_INDICATOR; } } @@ -408,19 +440,26 @@ public static Allele create(final Allele allele, final boolean ignoreRefState) { // // --------------------------------------------------------------------------------------------------------- - // Returns true if this is the NO_CALL allele + /** @return true if this is the NO_CALL allele */ public boolean isNoCall() { return isNoCall; } // Returns true if this is not the NO_CALL allele public boolean isCalled() { return ! isNoCall(); } - // Returns true if this Allele is the reference allele + /** @return true if this Allele is the reference allele */ public boolean isReference() { return isRef; } - // Returns true if this Allele is not the reference allele + + /** @return true if this Allele is not the reference allele */ public boolean isNonReference() { return ! isReference(); } - // Returns true if this Allele is symbolic (i.e. no well-defined base sequence) + /** @return true if this Allele is symbolic (i.e. no well-defined base sequence), this includes breakpoints and breakends */ public boolean isSymbolic() { return isSymbolic; } + /** @return true if this Allele is a breakpoint ( ex: G]17:198982] or ]13:123456]T ) */ + public boolean isBreakpoint() { return wouldBeBreakpoint(bases); } + + /** @return true if this Allele is a single breakend (ex: .A or A.) */ + public boolean isSingleBreakend() { return wouldBeSingleBreakend(bases); } + // Returns a nice string representation of this object public String toString() { return ( isNoCall() ? NO_CALL_STRING : getDisplayString() ) + (isReference() ? "*" : ""); diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index 96d9b9c17f..bfa48b4c87 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -64,7 +64,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec implements NameAwareCodec { public final static int MAX_ALLELE_SIZE_BEFORE_WARNING = (int)Math.pow(2, 20); - protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th column + protected final static int NUM_STANDARD_FIELDS = 8; // INFO is the 8th // we have to store the list of strings that make up the header until they're needed protected VCFHeader header = null; @@ -332,12 +332,14 @@ else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) else builder.id(parts[2]); - final String ref = getCachedString(parts[3].toUpperCase()); - final String alts = getCachedString(parts[4]); + final String ref = parts[3].toUpperCase(); + final String alts = parts[4]; builder.log10PError(parseQual(parts[5])); final List filters = parseFilters(getCachedString(parts[6])); - if ( filters != null ) builder.filters(new HashSet(filters)); + if ( filters != null ) { + builder.filters(new HashSet<>(filters)); + } final Map attrs = parseInfo(parts[7]); builder.attributes(attrs); @@ -577,7 +579,7 @@ private static void checkAllele(String allele, boolean isRef, int lineNo) { System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); } - if ( isSymbolicAllele(allele) ) { + if (Allele.wouldBeSymbolicAllele(allele.getBytes())) { if ( isRef ) { generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); } @@ -609,18 +611,6 @@ private static String generateExceptionTextForBadAlleleBases(final String allele return "unparsable vcf record with allele " + allele; } - /** - * return true if this is a symbolic allele (e.g. ) or - * structural variation breakend (with [ or ]), otherwise false - * @param allele the allele to check - * @return true if the allele is a symbolic allele, otherwise false - */ - private static boolean isSymbolicAllele(String allele) { - return (allele != null && allele.length() > 2 && - ((allele.startsWith("<") && allele.endsWith(">")) || - (allele.contains("[") || allele.contains("]")))); - } - /** * parse a single allele, given the allele list * @param alleles the alleles available diff --git a/src/test/java/htsjdk/variant/variantcontext/AlleleUnitTest.java b/src/test/java/htsjdk/variant/variantcontext/AlleleUnitTest.java index e96c2680d8..2de5373f19 100644 --- a/src/test/java/htsjdk/variant/variantcontext/AlleleUnitTest.java +++ b/src/test/java/htsjdk/variant/variantcontext/AlleleUnitTest.java @@ -32,6 +32,7 @@ import org.testng.Assert; import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; // public Allele(byte[] bases, boolean isRef) { @@ -269,12 +270,51 @@ public void testBadSpanningDeletionAllelel() { Allele.create(Allele.SPAN_DEL_STRING, true); // spanning deletion cannot be ref allele } - @Test - public void testExtend() { - Assert.assertEquals("AT", Allele.extend(Allele.create("A"), "T".getBytes()).toString()); - Assert.assertEquals("ATA", Allele.extend(Allele.create("A"), "TA".getBytes()).toString()); - Assert.assertEquals("A", Allele.extend(Allele.NO_CALL, "A".getBytes()).toString()); - Assert.assertEquals("ATCGA", Allele.extend(Allele.create("AT"), "CGA".getBytes()).toString()); - Assert.assertEquals("ATCGA", Allele.extend(Allele.create("ATC"), "GA".getBytes()).toString()); + @DataProvider + public Object[][] getExtendTests() { + return new Object[][]{ + {Allele.create("A"), "T", "AT"}, + {Allele.create("A"), "TA", "ATA"}, + {Allele.NO_CALL, "A", "A"}, + {Allele.create("AT"), "CGA", "ATCGA"}, + {Allele.create("ATC"), "GA", "ATCGA"} + }; + } + + @Test(dataProvider = "getExtendTests") + public void testExtend(Allele toExtend, String extension, String expected) { + final Allele extended = Allele.extend(toExtend, extension.getBytes()); + Assert.assertEquals(extended, Allele.create(expected)); + } + + @DataProvider + public Object[][] getTestCasesForCheckingSymbolicAlleles(){ + return new Object[][]{ + //allele, isSymbolic, isBreakpoint, isSingleBreakend + {"", true, false, false}, + {"G]17:198982]", true, true, false}, + {"]13:123456]T", true, true, false}, + {"AAAAAA[chr1:1234[", true, true, false}, + {"AAAAAA]chr1:1234]", true, true, false}, + {"A.", true, false, true}, + {".A", true, false, true}, + {"AA", false, false, false}, + {"A", false, false, false} + }; + } + + @Test(dataProvider = "getTestCasesForCheckingSymbolicAlleles") + public void testWouldBeSymbolic(String baseString, boolean isSymbolic, boolean isBreakpoint, boolean isBreakend) { + Assert.assertEquals(Allele.wouldBeSymbolicAllele(baseString.getBytes()), isSymbolic); + } + + @Test(dataProvider = "getTestCasesForCheckingSymbolicAlleles") + public void testWouldBeBreakpoint(String baseString, boolean isSymbolic, boolean isBreakpoint, boolean isBreakend) { + Assert.assertEquals(Allele.wouldBeBreakpoint(baseString.getBytes()), isBreakpoint); + } + + @Test(dataProvider = "getTestCasesForCheckingSymbolicAlleles") + public void testWouldBeBreakend(String baseString, boolean isSymbolic, boolean isBreakpoint, boolean isBreakend) { + Assert.assertEquals(Allele.wouldBeSingleBreakend(baseString.getBytes()), isBreakend); } }