diff --git a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java index dcb4352c95..5a7529f3aa 100644 --- a/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java +++ b/src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java @@ -42,7 +42,6 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; @@ -50,6 +49,7 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -212,7 +212,7 @@ protected VCFHeader parseHeaderFromLines( final List headerStrings, fina final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++); metaData.add(contig); } else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) { - final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description")); + final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"), Collections.emptyList()); metaData.add(alt); } else { int equals = str.indexOf('='); diff --git a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java index 481ca924a2..a6418b0a55 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java @@ -32,7 +32,9 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.LinkedHashMap; +import java.util.List; import java.util.Map; /** @@ -55,6 +57,8 @@ public enum SupportedHeaderLineType { private VCFHeaderLineCount countType; private String description; private VCFHeaderLineType type; + private String source; + private String version; // access methods @Override @@ -69,6 +73,14 @@ public int getCount() { return count; } + public String getSource() { + return source; + } + + public String getVersion() { + return version; + } + /** * Get the number of values expected for this header field, given the properties of VariantContext vc * @@ -119,6 +131,34 @@ public void setNumberToUnbounded() { * @param lineType the header line type */ protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + this(name, count, type, description, lineType, null, null); + } + + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count type for this header line + * @param type the type for this header line + * @param description the description for this header line + * @param lineType the header line type + */ + protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + this(name, count, type, description, lineType, null, null); + } + + /** + * create a VCF format header line + * + * @param name the name for this header line + * @param count the count for this header line + * @param type the type for this header line + * @param description the description for this header line + * @param lineType the header line type + * @param source annotation source (case-insensitive, e.g. "dbsnp") + * @param version exact version (e.g. "138") + */ + protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { super(lineType.toString(), ""); this.name = name; this.countType = VCFHeaderLineCount.INTEGER; @@ -126,6 +166,8 @@ protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, this.type = type; this.description = description; this.lineType = lineType; + this.source = source; + this.version = version; validate(); } @@ -137,14 +179,18 @@ protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, * @param type the type for this header line * @param description the description for this header line * @param lineType the header line type + * @param source annotation source (case-insensitive, e.g. "dbsnp") + * @param version exact version (e.g. "138") */ - protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) { + protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) { super(lineType.toString(), ""); this.name = name; this.countType = count; this.type = type; this.description = description; this.lineType = lineType; + this.source = source; + this.version = version; validate(); } @@ -160,9 +206,13 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported super(lineType.toString(), ""); final ArrayList expectedTags = new ArrayList(Arrays.asList("ID", "Number", "Type", "Description")); - if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) - expectedTags.add("Version"); - final Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags); + final List recommendedTags; + if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { + recommendedTags = Arrays.asList("Source", "Version"); + } else { + recommendedTags = Collections.emptyList(); + } + final Map mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags, recommendedTags); name = mapping.get("ID"); count = -1; final String numberStr = mapping.get("Number"); @@ -173,7 +223,7 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported } else if (numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT)) { countType = VCFHeaderLineCount.G; } else if ((version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) || - (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { + (!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) { countType = VCFHeaderLineCount.UNBOUNDED; } else { countType = VCFHeaderLineCount.INTEGER; @@ -198,16 +248,21 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported this.lineType = lineType; + if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) { + this.source = mapping.get("Source"); + this.version = mapping.get("Version"); + } + validate(); } private void validate() { if (type != VCFHeaderLineType.Flag && countType == VCFHeaderLineCount.INTEGER && count <= 0) throw new IllegalArgumentException(String.format("Invalid count number, with fixed count the number should be 1 or higher: key=%s name=%s type=%s desc=%s lineType=%s count=%s", - getKey(), name, type, description, lineType, count)); + getKey(), name, type, description, lineType, count)); if (name == null || type == null || description == null || lineType == null) throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s", - getKey(), name, type, description, lineType)); + getKey(), name, type, description, lineType)); if (name.contains("<") || name.contains(">")) throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets"); if (name.contains("=")) @@ -250,6 +305,12 @@ protected String toStringEncoding() { map.put("Number", number); map.put("Type", type); map.put("Description", description); + if (source != null) { + map.put("Source", source); + } + if (version != null) { + map.put("Version", version); + } return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map); } @@ -281,6 +342,8 @@ public int hashCode() { result = 31 * result + description.hashCode(); result = 31 * result + type.hashCode(); result = 31 * result + lineType.hashCode(); + result = 31 * result + (source != null ? source.hashCode() : 0); + result = 31 * result + (version != null ? version.hashCode() : 0); return result; } @@ -303,4 +366,25 @@ public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) { */ abstract boolean allowFlagValues(); + /** + * Specify annotation source + *

+ * This value is optional starting with VCFv4.2. + * + * @param source annotation source (case-insensitive, e.g. "dbsnp") + */ + public void setSource(final String source) { + this.source = source; + } + + /** + * Specify annotation version + *

+ * This value is optional starting with VCFv4.2. + * + * @param version exact version (e.g. "138") + */ + public void setVersion(final String version) { + this.version = version; + } } diff --git a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java index 12e400c95c..ea71470a53 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java @@ -28,6 +28,7 @@ import htsjdk.samtools.SAMSequenceRecord; import htsjdk.tribble.TribbleException; +import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; @@ -49,7 +50,7 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine { * @param key the key for this header line */ public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, final int contigIndex) { - super(line, version, key, null); + super(line, version, key, null, Collections.emptyList()); if (contigIndex < 0) throw new TribbleException("The contig index is less than zero."); this.contigIndex = contigIndex; } @@ -115,4 +116,4 @@ public int compareTo(final Object other) { return super.compareTo(other); } } -} \ No newline at end of file +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java index 5130963acf..6ca8f3f532 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java @@ -26,6 +26,7 @@ package htsjdk.variant.vcf; import java.util.Arrays; +import java.util.Collections; /** * @author ebanks @@ -61,7 +62,7 @@ public VCFFilterHeaderLine(final String name) { * @param version the vcf header version */ public VCFFilterHeaderLine(final String line, final VCFHeaderVersion version) { - super(line, version, "FILTER", Arrays.asList("ID", "Description")); + super(line, version, "FILTER", Arrays.asList("ID", "Description"), Collections.emptyList()); } @Override diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java index ce12c42730..0d07a83078 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java @@ -159,7 +159,9 @@ public static String toStringEncoding(Map keyValues) { builder.append('='); builder.append(entry.getValue().toString().contains(",") || entry.getValue().toString().contains(" ") || - entry.getKey().equals("Description") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); + entry.getKey().equals("Description") || + entry.getKey().equals("Source") || // As per VCFv4.2, Source and Version should be surrounded by double quotes + entry.getKey().equals("Version") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue()); } builder.append('>'); return builder.toString(); @@ -172,4 +174,4 @@ private static String escapeQuotes(final String value) { // with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote return value.replaceAll("([^\\\\])\"", "$1\\\\\""); } -} \ No newline at end of file +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java index 3ac72b28c6..54213a67ac 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java +++ b/src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java @@ -27,6 +27,7 @@ import htsjdk.tribble.TribbleException; +import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; @@ -48,13 +49,44 @@ public class VCFHeaderLineTranslator { } public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder) { - return mapping.get(version).parseLine(valueLine,expectedTagOrder); + return parseLine(version, valueLine, expectedTagOrder, Collections.emptyList()); + } + + public static Map parseLine(VCFHeaderVersion version, String valueLine, List expectedTagOrder, List recommendedTags) { + return mapping.get(version).parseLine(valueLine, expectedTagOrder, recommendedTags); } } interface VCFLineParser { - public Map parseLine(String valueLine, List expectedTagOrder); + /** + * parse a VCF line + * + * @see #parseLine(String, List, List) VCFv4.2+ recommended tags support + * + * @param valueLine the line + * @param expectedTagOrder List of expected tags + * @return a mapping of the tags parsed out + */ + default Map parseLine(String valueLine, List expectedTagOrder) { + return parseLine(valueLine, expectedTagOrder, Collections.emptyList()); + } + + /** + * parse a VCF line + * + * The recommended tags were introduced in VCFv4.2. + * Older implementations may throw an exception when the recommendedTags field is not empty. + * + * We use a list to represent tags as we assume there will be a very small amount of them, + * so using a {@code Set} is overhead. + * + * @param valueLine the line + * @param expectedTagOrder List of expected tags + * @param recommendedTags List of tags that may or may not be present. Use an empty list instead of NULL for none. + * @return a mapping of the tags parsed out + */ + Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags); } @@ -62,13 +94,9 @@ interface VCFLineParser { * a class that handles the to and from disk for VCF 4 lines */ class VCF4Parser implements VCFLineParser { - /** - * parse a VCF4 line - * @param valueLine the line - * @return a mapping of the tags parsed out - */ + @Override - public Map parseLine(String valueLine, List expectedTagOrder) { + public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { // our return map Map ret = new LinkedHashMap(); @@ -132,11 +160,22 @@ public Map parseLine(String valueLine, List expectedTagO // validate the tags against the expected list index = 0; if ( expectedTagOrder != null ) { - if ( ret.size() > expectedTagOrder.size() ) - throw new TribbleException.InvalidHeader("unexpected tag count " + ret.size() + " in line " + valueLine); + if (ret.keySet().isEmpty() && !expectedTagOrder.isEmpty()) { + throw new TribbleException.InvalidHeader("Header with no tags is not supported when there are expected tags in line " + valueLine); + } for ( String str : ret.keySet() ) { - if ( !expectedTagOrder.get(index).equals(str) ) - throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); + if (index < expectedTagOrder.size()) { + if (!expectedTagOrder.get(index).equals(str)) { + if (expectedTagOrder.contains(str)) { + throw new TribbleException.InvalidHeader("Tag " + str + " in wrong order (was #" + (index+1) + ", expected #" + (expectedTagOrder.indexOf(str)+1) + ") in line " + valueLine); + } else if (recommendedTags.contains(str)) { + throw new TribbleException.InvalidHeader("Recommended tag " + str + " must be listed after all expected tags in line " + valueLine); + } + else { + throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine); + } + } + } index++; } } @@ -147,7 +186,11 @@ public Map parseLine(String valueLine, List expectedTagO class VCF3Parser implements VCFLineParser { @Override - public Map parseLine(String valueLine, List expectedTagOrder) { + public Map parseLine(String valueLine, List expectedTagOrder, List recommendedTags) { + if (!recommendedTags.isEmpty()) { + throw new TribbleException.InternalCodecException("Recommended tags are not allowed in VCFv3.x"); + } + // our return map Map ret = new LinkedHashMap(); @@ -182,4 +225,4 @@ public Map parseLine(String valueLine, List expectedTagO } return ret; } -} \ No newline at end of file +} diff --git a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java index afa1f8141a..13df34bc87 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFInfoHeaderLine.java @@ -44,6 +44,14 @@ public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineTyp super(name, count, type, description, SupportedHeaderLineType.INFO); } + public VCFInfoHeaderLine(String name, int count, VCFHeaderLineType type, String description, String source, String version) { + super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + } + + public VCFInfoHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, String source, String version) { + super(name, count, type, description, SupportedHeaderLineType.INFO, source, version); + } + public VCFInfoHeaderLine(String line, VCFHeaderVersion version) { super(line, version, SupportedHeaderLineType.INFO); } diff --git a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java index 5856f06bbc..9082f965cc 100644 --- a/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java +++ b/src/main/java/htsjdk/variant/vcf/VCFSimpleHeaderLine.java @@ -57,6 +57,8 @@ public VCFSimpleHeaderLine(String key, String name, String description) { /** * create a VCF info header line + * + * @see #VCFSimpleHeaderLine(String, VCFHeaderVersion, String, List, List) VCFv4.2+ recommended tags support * * @param line the header line * @param version the vcf header version @@ -64,7 +66,20 @@ public VCFSimpleHeaderLine(String key, String name, String description) { * @param expectedTagOrdering the tag ordering expected for this header line */ public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering) { - this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering)); + this(line, version, key, expectedTagOrdering, Collections.emptyList()); + } + + /** + * create a VCF info header line + * + * @param line the header line + * @param version the vcf header version + * @param key the key for this header line + * @param expectedTagOrdering the tag ordering expected for this header line + * @param recommendedTags tags that are optional for this header line + */ + public VCFSimpleHeaderLine(final String line, final VCFHeaderVersion version, final String key, final List expectedTagOrdering, final List recommendedTags) { + this(key, VCFHeaderLineTranslator.parseLine(version, line, expectedTagOrdering, recommendedTags)); } public VCFSimpleHeaderLine(final String key, final Map mapping) { diff --git a/src/test/java/htsjdk/samtools/util/Md5CalculatingOutputStreamTest.java b/src/test/java/htsjdk/samtools/util/Md5CalculatingOutputStreamTest.java new file mode 100644 index 0000000000..4a5525ca15 --- /dev/null +++ b/src/test/java/htsjdk/samtools/util/Md5CalculatingOutputStreamTest.java @@ -0,0 +1,56 @@ +/* + * The MIT License + * + * Copyright (c) 2019 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package htsjdk.samtools.util; + +import com.google.common.base.Charsets; +import htsjdk.HtsjdkTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; + +public class Md5CalculatingOutputStreamTest extends HtsjdkTest { + @DataProvider(name = "fileContents") + public Object[][] createDBQTestData() { + return new Object[][]{ + {"", "d41d8cd98f00b204e9800998ecf8427e"}, // No zeroes at the start + {"a", "0cc175b9c0f1b6a831c399e269772661"}, // One zero + {"jk8ssl", "0000000018e6137ac2caab16074784a6"}, // Many zeroes, thanks @delta14 at StackOverflow + }; + } + + @Test(dataProvider = "fileContents") + public void testMd5(final String contents, final String expectedMd5) throws IOException { + byte[] bytes = contents.getBytes(Charsets.US_ASCII); + OutputStream outputStream = new ByteArrayOutputStream(bytes.length); + Md5CalculatingOutputStream md5 = new Md5CalculatingOutputStream(outputStream, (File) null); + md5.write(bytes); + md5.close(); // Cannot use try-with-resources because we need a value after closing the stream + Assert.assertEquals(md5.md5(), expectedMd5); + } +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java index 637068dad5..5a8c236c8b 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFFileReaderTest.java @@ -69,6 +69,8 @@ Object[][] pathsData() { {TEST_DATA_DIR + "VcfThatLacksAnIndex.bcf", null, true, false}, {TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf.bgz", null, true, false}, + // testing that v4.2 parses Source/Version fields, see issue #517 + {TEST_DATA_DIR + "Vcf4.2WithSourceVersionInfoFields.vcf", null, false, true} }; } @@ -83,4 +85,4 @@ public void testCanOpenVCFPathReader(final String file, final String index, fina } } } -} \ No newline at end of file +} diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java index 88a8cce140..73116f53f0 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderLineTranslatorUnitTest.java @@ -3,8 +3,12 @@ import htsjdk.tribble.TribbleException; import htsjdk.variant.VariantBaseTest; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; import java.util.Map; public class VCFHeaderLineTranslatorUnitTest extends VariantBaseTest { @@ -46,17 +50,116 @@ public void testParseVCF4HeaderLine() { // test with an unclosed quote try { - final Map values6 = VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, "", null); + final Map values6 = VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, "", null); Assert.fail("Should have thrown a TribbleException for having an unclosed quote in the description line"); - } catch (TribbleException.InvalidHeader e) { + } + catch (TribbleException.InvalidHeader e) { } // test with an escaped quote at the end try { - final Map values7 = VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, "", null); + final Map values7 = VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, "", null); Assert.fail("Should have thrown a TribbleException for having an unclosed quote in the description line"); - } catch (TribbleException.InvalidHeader e) { } + catch (TribbleException.InvalidHeader e) { + } + + } + + @DataProvider(name = "validHeaderLines") + private Object[][] getValidHeaderLines() { + List idDesc = Arrays.asList("ID", "Description"); + List none = Collections.emptyList(); + List sourceVersion = Arrays.asList("Source", "Version"); + List extra = Arrays.asList("Extra"); + return new Object[][]{ + // to parse, expected, recommended + {"", idDesc, sourceVersion}, + {"", idDesc, sourceVersion}, + {"", idDesc, sourceVersion}, + {"", idDesc, sourceVersion}, + {"", idDesc, sourceVersion}, + {"", idDesc, sourceVersion}, + + {"", idDesc, none}, + {"", idDesc, none}, + {"", idDesc, none}, + {">", idDesc, none}, + {"", idDesc, none}, + + {"", idDesc, extra}, + {"", idDesc, extra}, + {"<>", none, none}, + {"<>", none, extra}, + {"<>", none, sourceVersion} + }; + } + + @DataProvider(name = "invalidHeaderLines") + private Object[][] getInvalidHeaderLines() { + List idDesc = Arrays.asList("ID", "Description"); + List none = Collections.emptyList(); + List sourceVersion = Arrays.asList("Source", "Version"); + return new Object[][]{ + // to parse, expected, recommended, error message + {"", idDesc, none, "Tag Description in wrong order (was #1, expected #2)"}, + {"", idDesc, none, "Unexpected tag Desc"}, + {"<>", idDesc, none, "Unexpected tag "}, + + {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"}, + {"", idDesc, sourceVersion, "Recommended tag Source must be listed after all expected tags"} + }; + } + + private static void callTranslator(final String line, + final List expectedTagOrder, + final List recommendedTags) { + // To cover both constructors for code coverage + if (recommendedTags.isEmpty()) { + VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder); + } + else { + VCFHeaderLineTranslator.parseLine(VCFHeaderVersion.VCF4_2, line, expectedTagOrder, recommendedTags); + } + } + + @Test(dataProvider = "validHeaderLines") + public void testParseVCF4HeaderLineWithTagsValid(final String line, + final List expectedTagOrder, + final List recommendedTags) { + callTranslator(line, expectedTagOrder, recommendedTags); + } + + @Test(dataProvider = "invalidHeaderLines") + public void testParseVCF4HeaderLineWithTagsInvalid(final String line, + final List expectedTagOrder, + final List recommendedTags, + final String error) { + final TribbleException e = Assert.expectThrows( + TribbleException.class, + () -> callTranslator(line, expectedTagOrder, recommendedTags) + ); + Assert.assertTrue( + e.getMessage().contains(error), + String.format("Error string '%s' should be present in error message '%s'", error, e.getMessage()) + ); + } + + @DataProvider(name = "vcfv3") + private Object[][] getVcfV3Versions() { + return new Object[][]{ + {VCFHeaderVersion.VCF3_2}, + {VCFHeaderVersion.VCF3_3} + }; + } + @Test(dataProvider = "vcfv3", expectedExceptions = TribbleException.class) + public void testVcfV3FailsRecommendedTags(final VCFHeaderVersion vcfVersion) { + VCFHeaderLineTranslator.parseLine( + vcfVersion, + "", + Arrays.asList("ID"), + Arrays.asList("Description") + ); } } diff --git a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java index 1a2f41380b..2e534d1197 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFHeaderUnitTest.java @@ -46,6 +46,8 @@ import java.io.*; import java.math.BigInteger; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.*; @@ -503,6 +505,38 @@ public void testVCFHeaderQuoteEscaping() throws Exception { } + @Test + public void testVcf42Roundtrip() throws Exception { + // this test ensures that source/version fields are round-tripped properly + + // read an existing VCF + File expectedFile = new File("src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf"); + + // write the file out into a new copy + final File actualFile = File.createTempFile("testVcf4.2roundtrip.", IOUtil.VCF_FILE_EXTENSION); + actualFile.deleteOnExit(); + + try (final VCFFileReader originalFileReader = new VCFFileReader(expectedFile, false); + final VariantContextWriter copyWriter = new VariantContextWriterBuilder() + .setOutputFile(actualFile) + .setReferenceDictionary(createArtificialSequenceDictionary()) + .setOptions(EnumSet.of(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Options.INDEX_ON_THE_FLY)) + .build() + ) { + final VCFHeader originalHeader = originalFileReader.getFileHeader(); + + copyWriter.writeHeader(originalHeader); + for (final VariantContext variantContext : originalFileReader) { + copyWriter.add(variantContext); + } + } + + final String actualContents = new String(Files.readAllBytes(actualFile.toPath()), StandardCharsets.UTF_8); + final String expectedContents = new String(Files.readAllBytes(expectedFile.toPath()), StandardCharsets.UTF_8); + Assert.assertEquals(actualContents, expectedContents); + } + + /** * a little utility function for all tests to md5sum a file * Shameless taken from: diff --git a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java index f72cd87978..c9efaa59ef 100644 --- a/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java +++ b/src/test/java/htsjdk/variant/vcf/VCFStandardHeaderLinesUnitTest.java @@ -178,6 +178,7 @@ public Object[][] makeRepairHeaderTest() { tests.add(new Object[]{new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.Float, "x"), standardAC)}); tests.add(new Object[]{new RepairHeaderTest( new VCFInfoHeaderLine("AC", 1, VCFHeaderLineType.String, "x"), standardAC)}); tests.add(new Object[]{new RepairHeaderTest( new VCFInfoHeaderLine("AC", 0, VCFHeaderLineType.Flag, "x"), standardAC)}); + tests.add(new Object[]{new RepairHeaderTest( new VCFInfoHeaderLine("AC", 0, VCFHeaderLineType.Flag, "x", "source", "v1.2.3"), standardAC)}); tests.add(new Object[]{new RepairHeaderTest( new VCFInfoHeaderLine("NON_STANDARD_INFO", 1, VCFHeaderLineType.String, "x"))}); tests.add(new Object[]{new RepairHeaderTest( new VCFFormatHeaderLine("NON_STANDARD_FORMAT", 1, VCFHeaderLineType.String, "x"))}); diff --git a/src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf b/src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf new file mode 100644 index 0000000000..d2cd09b9ae --- /dev/null +++ b/src/test/resources/htsjdk/variant/Vcf4.2WithSourceVersionInfoFields.vcf @@ -0,0 +1,35 @@ +##fileformat=VCFv4.2 +##ALT= +##ALT= +##ALT= +##ALT= +##FILTER= 0.06"> +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[/humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-23/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-24/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-5/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-9/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-6/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-19/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-25/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-4/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-14/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-22/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-2/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-3/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-7/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-16/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-1/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-17/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-8/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-10/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-18/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-20/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-11/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-15/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-21/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-12/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam, /humgen/1kg/analysis/bamsForDataProcessingPapers/scriptsToMakeBams/Q-2970@gsa2-1-temp-13/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam] read_buffer_size=null read_filter=[] intervals=[chrX] excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[dbsnp,dbsnp,/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod, interval,Intervals,chrX] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod hapmap=null hapmap_chip=null out=null err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=10000 num_threads=1 interval_merging=ALL read_group_black_list=null genotype_model=JOINT_ESTIMATE base_model=EMPIRICAL heterozygosity=7.8E-4 genotype=false output_all_callable_bases=false standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=10.0 trigger_min_confidence_threshold_for_calling=30.0 trigger_min_confidence_threshold_for_emitting=30.0 noSLOD=false assume_single_sample_reads=null platform=null min_base_quality_score=20 min_mapping_quality_score=20 max_mismatches_in_40bp_window=3 use_reads_with_bad_mates=false max_deletion_fraction=0.05 cap_base_quality_by_mapping_quality=false" +##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf, mask,Bed,wgs.v9/HiSeq.WGS.cleaned.indels.10.mask] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[] filterName=[] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskName=Indel NO_HEADER=false" +##VariantFiltration="analysis_type=VariantFiltration input_file=[] read_buffer_size=null read_filter=[] intervals=null excludeIntervals=[chrM, chrY] reference_sequence=/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta rodBind=[variant,VCF,wgs.v9/HiSeq.WGS.cleaned.ug.vcf] rodToIntervalTrackName=null BTI_merge_rule=UNION DBSNP=null hapmap=null hapmap_chip=null out=wgs.v9/HiSeq.WGS.cleaned.ug.snpfiltered.vcf err=null outerr=null filterZeroMappingQualityReads=false downsampling_type=NONE downsample_to_fraction=null downsample_to_coverage=null useOriginalQualities=false validation_strictness=SILENT unsafe=null max_reads_at_locus=2147483647 num_threads=1 interval_merging=ALL read_group_black_list=null filterExpression=[QUAL < 50.0, MQ0 >= 4 && ((MQ0 / (1.0 * DP)) > 0.1), AB > 0.75 && DP > 40, DP > 120 || SB > -0.10] filterName=[LowQual, HARD_TO_VALIDATE, ABFilter, DPFilter] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=10 maskName=Mask NO_HEADER=false" +##contig= +##source=VariantOptimizer +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +chr1 109 . A T 0 FDRtranche2.00to10.00+ AC=1;AF=0.50;AN=2;DP=1019;Dels=0.00;HRun=0;HaplotypeScore=686.65;MQ=19.20;MQ0=288;OQ=2175.54;QD=2.13;SB=-1042.18 GT:AD:DP:GL:GQ 0/1:610,327:308:-316.30,-95.47,-803.03:99