Skip to content

Commit

Permalink
Tolerate mixed case NaNs and Infinities in VCF (#1364)
Browse files Browse the repository at this point in the history
* Different languages convert floating point numbers to String representations in different ways.  We now accept NAN, INF, or INFINITY in any case instead of only NaN and Inf when reading VCFs.
*  See samtools/hts-specs#409 for more discussion.
  • Loading branch information
karenfeng authored and lbergelson committed Jun 24, 2019
1 parent f68108d commit 37ea940
Show file tree
Hide file tree
Showing 13 changed files with 129 additions and 12 deletions.
5 changes: 3 additions & 2 deletions src/main/java/htsjdk/variant/variantcontext/CommonInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@


import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFUtils;

import java.io.Serializable;
import java.util.Arrays;
Expand Down Expand Up @@ -296,7 +297,7 @@ public List<Double> getAttributeAsDoubleList(String key, Double defaultValue) {
} else if (x instanceof Number) {
return ((Number) x).doubleValue();
} else {
return Double.valueOf((String)x); // throws an exception if this isn't a string
return VCFUtils.parseVcfDouble((String)x); // throws an exception if this isn't a string
}
});
}
Expand All @@ -320,7 +321,7 @@ public double getAttributeAsDouble(String key, double defaultValue) {
if ( x == null ) return defaultValue;
if ( x instanceof Double ) return (Double)x;
if ( x instanceof Integer ) return (Integer)x;
return Double.valueOf((String)x); // throws an exception if this isn't a string
return VCFUtils.parseVcfDouble((String)x); // throws an exception if this isn't a string
}

public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/htsjdk/variant/variantcontext/Genotype.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFUtils;

import java.io.Serializable;
import java.util.ArrayList;
Expand Down Expand Up @@ -526,7 +527,7 @@ public double getAttributeAsDouble(String key, double defaultValue) {
Object x = getExtendedAttribute(key);
if ( x == null ) return defaultValue;
if ( x instanceof Double ) return (Double)x;
return Double.valueOf((String)x); // throws an exception if this isn't a string
return VCFUtils.parseVcfDouble((String) x); // throws an exception if this isn't a string
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import htsjdk.tribble.TribbleException;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFUtils;

import java.util.ArrayList;
import java.util.Arrays;
Expand Down Expand Up @@ -271,7 +272,7 @@ private final static double[] parseDeprecatedGLString(String GLString) {
if (strings[i].equals(VCFConstants.MISSING_VALUE_v4)) {
missing++;
} else {
likelihoodsAsVector[i] = Double.parseDouble(strings[i]);
likelihoodsAsVector[i] = VCFUtils.parseVcfDouble(strings[i]);
}
}
if (missing == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,7 @@
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.*;

import java.io.Serializable;
import java.util.*;
Expand Down Expand Up @@ -1629,7 +1625,7 @@ private final Object decodeOne(final String field, final String string, final VC
return b;
case String: return string;
case Integer: return Integer.valueOf(string);
case Float: return Double.valueOf(string);
case Float: return VCFUtils.parseVcfDouble(string);
default: throw new TribbleException("Unexpected type for field" + field);
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ protected static Double parseQual(String qualString) {
if ( qualString.equals(VCFConstants.MISSING_VALUE_v4))
return VariantContext.NO_LOG10_PERROR;

Double val = Double.valueOf(qualString);
Double val = VCFUtils.parseVcfDouble(qualString);

// check to see if they encoded the missing qual score in VCF 3 style, with either the -1 or -1.0. check for val < 0 to save some CPU cycles
if ((val < 0) && (Math.abs(val - VCFConstants.MISSING_QUALITY_v3_DOUBLE) < VCFConstants.VCF_ENCODING_EPSILON))
Expand Down Expand Up @@ -804,7 +804,7 @@ public LazyGenotypesContext.LazyData createGenotypeMap(final String str,
if ( genotypeValues.get(i).equals(VCFConstants.MISSING_GENOTYPE_QUALITY_v3) )
gb.noGQ();
else
gb.GQ((int)Math.round(Double.valueOf(genotypeValues.get(i))));
gb.GQ((int)Math.round(VCFUtils.parseVcfDouble(genotypeValues.get(i))));
} else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) {
gb.AD(decodeInts(genotypeValues.get(i)));
} else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) {
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/htsjdk/variant/vcf/VCFUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,14 @@
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class VCFUtils {

private static final Pattern INF_OR_NAN_PATTERN = Pattern.compile("^(?<sign>[-+]?)((?<inf>(INF|INFINITY))|(?<nan>NAN))$", Pattern.CASE_INSENSITIVE);

public static Set<VCFHeaderLine> smartMergeHeaders(final Collection<VCFHeader> headers, final boolean emitWarnings) throws IllegalStateException {
// We need to maintain the order of the VCFHeaderLines, otherwise they will be scrambled in the returned Set.
// This will cause problems for VCFHeader.getSequenceDictionary and anything else that implicitly relies on the line ordering.
Expand Down Expand Up @@ -250,6 +254,31 @@ else if (vcfFile.getAbsolutePath().endsWith(IOUtil.COMPRESSED_VCF_FILE_EXTENSION
return output;
}

/**
* Parses a String as a Double, being tolerant for case-insensitive NaN and Inf/Infinity.
*/
public static double parseVcfDouble(final String str) {
try {
return Double.parseDouble(str);
} catch (NumberFormatException e) {
final Matcher matcher = INF_OR_NAN_PATTERN.matcher(str);
if (matcher.matches()) {
final double ret;
if (matcher.group("inf") == null) {
ret = Double.NaN;
} else {
if (matcher.group("sign").equals("-")) {
ret = Double.NEGATIVE_INFINITY;
} else {
ret = Double.POSITIVE_INFINITY;
}
}
return ret;
}
throw e;
}
}

private static String getReferenceAssembly(final String refPath) {
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
String assembly = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,4 +347,10 @@ public void testGetAllelesUnitialized() {
GenotypeLikelihoods.anyploidPloidyToPLIndexToAlleleIndices.clear();
final List<Integer> alleles = GenotypeLikelihoods.getAlleles(0, 3);
}

@Test
public void testFromCaseInsensitiveString() {
GenotypeLikelihoods gl = GenotypeLikelihoods.fromGLField("nan,Infinity,-inf");
assertDoubleArraysAreEqual(gl.getAsVector(), new double[]{Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,7 @@ public void testGetAttributeAsIntList() {
.attribute("IntegerList", new int[]{0, 1, 2, 3})
.attribute("DoubleList", new double[]{1.8, 1.6, 2.1})
.attribute("StringList", new String[]{"1", "2"})
.attribute("CaseInsensitiveStringList", new String[]{"nan", "Infinity", "-inf"})
.attribute("NotNumeric", new String[]{"A", "B"})
.make();
// test an empty value
Expand All @@ -1593,6 +1594,7 @@ public void testGetAttributeAsIntList() {
Assert.assertEquals(context.getAttributeAsIntList("IntegerList", 5), Arrays.asList(0, 1, 2, 3));
Assert.assertEquals(context.getAttributeAsIntList("DoubleList", 5), Arrays.asList(1, 1, 2));
Assert.assertEquals(context.getAttributeAsIntList("StringList", 5), Arrays.asList(1, 2));
Assert.assertThrows(() -> context.getAttributeAsIntList("CaseInsensitiveStringList", 5));
Assert.assertThrows(() -> context.getAttributeAsIntList("NotNumeric", 5));
// test the case of a missing key
Assert.assertTrue(context.getAttributeAsIntList("MissingList", 5).isEmpty());
Expand All @@ -1607,6 +1609,7 @@ public void testGetAttributeAsDoubleList() {
.attribute("IntegerList", new int[]{0, 1, 2, 3})
.attribute("DoubleList", new double[]{1.8, 1.6, 2.1})
.attribute("StringList", new String[]{"1", "2"})
.attribute("CaseInsensitiveStringList", new String[]{"nan", "Infinity", "-inf"})
.attribute("NotNumeric", new String[]{"A", "B"})
.make();
// test an empty value
Expand All @@ -1617,6 +1620,7 @@ public void testGetAttributeAsDoubleList() {
Assert.assertEquals(context.getAttributeAsDoubleList("IntegerList", 5), Arrays.asList(0d, 1d, 2d, 3d));
Assert.assertEquals(context.getAttributeAsDoubleList("DoubleList", 5), Arrays.asList(1.8, 1.6, 2.1));
Assert.assertEquals(context.getAttributeAsDoubleList("StringList", 5), Arrays.asList(1d, 2d));
Assert.assertEquals(context.getAttributeAsDoubleList("CaseInsensitiveStringList", 5), Arrays.asList(Double.NaN, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY));
Assert.assertThrows(() -> context.getAttributeAsDoubleList("NotNumeric", 5));
// test the case of a missing key
Assert.assertTrue(context.getAttributeAsDoubleList("MissingList", 5).isEmpty());
Expand All @@ -1631,6 +1635,7 @@ public void testGetAttributeAsStringList() {
.attribute("IntegerList", new int[]{0, 1, 2, 3})
.attribute("DoubleList", new double[]{1.8, 1.6, 2.1})
.attribute("StringList", new String[]{"1", "2"})
.attribute("CaseInsensitiveStringList", new String[]{"nan", "Infinity", "-inf"})
.attribute("NotNumeric", new String[]{"A", "B"})
.make();
// test an empty value
Expand All @@ -1641,6 +1646,7 @@ public void testGetAttributeAsStringList() {
Assert.assertEquals(context.getAttributeAsStringList("IntegerList", "empty"), Arrays.asList("0", "1", "2", "3"));
Assert.assertEquals(context.getAttributeAsStringList("DoubleList", "empty"), Arrays.asList("1.8", "1.6", "2.1"));
Assert.assertEquals(context.getAttributeAsStringList("StringList", "empty"), Arrays.asList("1", "2"));
Assert.assertEquals(context.getAttributeAsStringList("CaseInsensitiveStringList", "empty"), Arrays.asList("nan", "Infinity", "-inf"));
Assert.assertEquals(context.getAttributeAsStringList("NotNumeric", "empty"), Arrays.asList("A", "B"));
// test the case of a missing key
Assert.assertTrue(context.getAttributeAsStringList("MissingList", "empty").isEmpty());
Expand Down
27 changes: 27 additions & 0 deletions src/test/java/htsjdk/variant/vcf/AbstractVCFCodecTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.testng.annotations.Test;

import java.io.File;
import java.util.Iterator;
import java.util.List;

public class AbstractVCFCodecTest extends VariantBaseTest {
Expand Down Expand Up @@ -66,4 +67,30 @@ public void testGLnotOverridePL() {
}
Assert.assertEquals(variant.getGenotype(0).getPL(), new int[]{45, 0, 50});
}

@DataProvider(name = "caseIntolerantDoubles")
public Object[][] getCaseIntolerantDoubles() {
return new Object[][]{
{"src/test/resources/htsjdk/variant/test_withNanQual.vcf", Double.NaN},
{"src/test/resources/htsjdk/variant/test_withPosInfQual.vcf", Double.POSITIVE_INFINITY},
{"src/test/resources/htsjdk/variant/test_withNegInfQual.vcf", Double.NEGATIVE_INFINITY},
};
}

@Test(dataProvider = "caseIntolerantDoubles")
public void testCaseIntolerantDoubles(String vcfInput, double value) {
try (final VCFFileReader reader = new VCFFileReader(new File(vcfInput), false)) {
try {
Iterator<VariantContext> iterator = reader.iterator();
final VariantContext baseVariant = iterator.next(); // First row uses Java-style
Assert.assertEquals(baseVariant.getPhredScaledQual(), value);
iterator.forEachRemaining(v -> {
Assert.assertEquals(baseVariant.getPhredScaledQual(), v.getPhredScaledQual());
Assert.assertEquals(baseVariant.getGenotype(0).getGQ(), v.getGenotype(0).getGQ());
});
} catch (TribbleException e) {
Assert.assertEquals(value, Double.NEGATIVE_INFINITY); // QUAL cannot be negative
}
}
}
}
22 changes: 22 additions & 0 deletions src/test/java/htsjdk/variant/vcf/VCFUtilsTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package htsjdk.variant.vcf;

import htsjdk.HtsjdkTest;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

Expand Down Expand Up @@ -51,4 +52,25 @@ public void testInvalidHeaderVersionMerger(final List<String> headerVersions) {
VCFUtils.smartMergeHeaders(headersToMerge, true);
}

@DataProvider(name = "caseIntolerantDoubles")
public Object[][] getCaseIntolerantDoubles() {
return new Object[][]{
{Double.NaN, Arrays.asList("NaN", "nan", "+nan", "-nan")},
{Double.POSITIVE_INFINITY, Arrays.asList("+Infinity", "+infinity", "+Inf", "+inf", "Infinity", "infinity", "Inf", "inf")},
{Double.NEGATIVE_INFINITY, Arrays.asList("-Infinity", "-infinity", "-Inf", "-inf")},
{null, Arrays.asList("znan", "nanz", "zinf", "infz", "hello")},
};
}

@Test(dataProvider = "caseIntolerantDoubles")
public void testCaseIntolerantDoubles(Double value, final List<String> stringDoubles) {
stringDoubles.forEach(sd -> {
try {
Assert.assertEquals(VCFUtils.parseVcfDouble(sd), value);
} catch (NumberFormatException e) {
Assert.assertNull(value);
}
});
}

}
8 changes: 8 additions & 0 deletions src/test/resources/htsjdk/variant/test_withNanQual.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.1
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##contig=<ID=chr1,length=14640000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr1 1 . C T NaN . . GT:GQ ./.:NaN
chr1 2 . C T nan . . GT:GQ ./.:nan
chr1 3 . C T -nan . . GT:GQ ./.:-nan
chr1 4 . C T +nan . . GT:GQ ./.:+nan
8 changes: 8 additions & 0 deletions src/test/resources/htsjdk/variant/test_withNegInfQual.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.1
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##contig=<ID=chr1,length=14640000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr1 1 . C T -Infinity . . GT:GQ ./.:-Infinity
chr1 2 . C T -infinity . . GT:GQ ./.:-infinity
chr1 3 . C T -Inf . . GT:GQ ./.:-Inf
chr1 4 . C T -inf . . GT:GQ ./.:-inf
12 changes: 12 additions & 0 deletions src/test/resources/htsjdk/variant/test_withPosInfQual.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##fileformat=VCFv4.1
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##contig=<ID=chr1,length=14640000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
chr1 1 . C T +Infinity . . GT:GQ ./.:+Infinity
chr1 2 . C T +infinity . . GT:GQ ./.:+infinity
chr1 3 . C T +Inf . . GT:GQ ./.:+Inf
chr1 4 . C T +inf . . GT:GQ ./.:+inf
chr1 5 . C T Infinity . . GT:GQ ./.:Infinity
chr1 6 . C T infinity . . GT:GQ ./.:infinity
chr1 7 . C T Inf . . GT:GQ ./.:Inf
chr1 8 . C T inf . . GT:GQ ./.:inf

0 comments on commit 37ea940

Please sign in to comment.