Skip to content

Commit

Permalink
Parse valid VCF 4.2 with ##INFO containing Source + Version (#1248)
Browse files Browse the repository at this point in the history
  • Loading branch information
romanzenka authored and cmnbroad committed Jan 22, 2019
1 parent 5217fe4 commit 93250d5
Show file tree
Hide file tree
Showing 14 changed files with 419 additions and 34 deletions.
4 changes: 2 additions & 2 deletions src/main/java/htsjdk/variant/vcf/AbstractVCFCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
Expand Down Expand Up @@ -212,7 +212,7 @@ protected VCFHeader parseHeaderFromLines( final List<String> headerStrings, fina
final VCFContigHeaderLine contig = new VCFContigHeaderLine(str.substring(9), version, VCFConstants.CONTIG_HEADER_START.substring(2), contigCounter++);
metaData.add(contig);
} else if ( str.startsWith(VCFConstants.ALT_HEADER_START) ) {
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"));
final VCFSimpleHeaderLine alt = new VCFSimpleHeaderLine(str.substring(6), version, VCFConstants.ALT_HEADER_START.substring(2), Arrays.asList("ID", "Description"), Collections.emptyList());
metaData.add(alt);
} else {
int equals = str.indexOf('=');
Expand Down
98 changes: 91 additions & 7 deletions src/main/java/htsjdk/variant/vcf/VCFCompoundHeaderLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
Expand All @@ -55,6 +57,8 @@ public enum SupportedHeaderLineType {
private VCFHeaderLineCount countType;
private String description;
private VCFHeaderLineType type;
private String source;
private String version;

// access methods
@Override
Expand All @@ -69,6 +73,14 @@ public int getCount() {
return count;
}

public String getSource() {
return source;
}

public String getVersion() {
return version;
}

/**
* Get the number of values expected for this header field, given the properties of VariantContext vc
*
Expand Down Expand Up @@ -119,13 +131,43 @@ public void setNumberToUnbounded() {
* @param lineType the header line type
*/
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
this(name, count, type, description, lineType, null, null);
}

/**
* create a VCF format header line
*
* @param name the name for this header line
* @param count the count type for this header line
* @param type the type for this header line
* @param description the description for this header line
* @param lineType the header line type
*/
protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
this(name, count, type, description, lineType, null, null);
}

/**
* create a VCF format header line
*
* @param name the name for this header line
* @param count the count for this header line
* @param type the type for this header line
* @param description the description for this header line
* @param lineType the header line type
* @param source annotation source (case-insensitive, e.g. "dbsnp")
* @param version exact version (e.g. "138")
*/
protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) {
super(lineType.toString(), "");
this.name = name;
this.countType = VCFHeaderLineCount.INTEGER;
this.count = count;
this.type = type;
this.description = description;
this.lineType = lineType;
this.source = source;
this.version = version;
validate();
}

Expand All @@ -137,14 +179,18 @@ protected VCFCompoundHeaderLine(String name, int count, VCFHeaderLineType type,
* @param type the type for this header line
* @param description the description for this header line
* @param lineType the header line type
* @param source annotation source (case-insensitive, e.g. "dbsnp")
* @param version exact version (e.g. "138")
*/
protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType) {
protected VCFCompoundHeaderLine(String name, VCFHeaderLineCount count, VCFHeaderLineType type, String description, SupportedHeaderLineType lineType, String source, String version) {
super(lineType.toString(), "");
this.name = name;
this.countType = count;
this.type = type;
this.description = description;
this.lineType = lineType;
this.source = source;
this.version = version;
validate();
}

Expand All @@ -160,9 +206,13 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported
super(lineType.toString(), "");

final ArrayList<String> expectedTags = new ArrayList(Arrays.asList("ID", "Number", "Type", "Description"));
if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2))
expectedTags.add("Version");
final Map<String, String> mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags);
final List<String> recommendedTags;
if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) {
recommendedTags = Arrays.asList("Source", "Version");
} else {
recommendedTags = Collections.emptyList();
}
final Map<String, String> mapping = VCFHeaderLineTranslator.parseLine(version, line, expectedTags, recommendedTags);
name = mapping.get("ID");
count = -1;
final String numberStr = mapping.get("Number");
Expand All @@ -173,7 +223,7 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported
} else if (numberStr.equals(VCFConstants.PER_GENOTYPE_COUNT)) {
countType = VCFHeaderLineCount.G;
} else if ((version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v4)) ||
(!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) {
(!version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_0) && numberStr.equals(VCFConstants.UNBOUNDED_ENCODING_v3))) {
countType = VCFHeaderLineCount.UNBOUNDED;
} else {
countType = VCFHeaderLineCount.INTEGER;
Expand All @@ -198,16 +248,21 @@ protected VCFCompoundHeaderLine(String line, VCFHeaderVersion version, Supported

this.lineType = lineType;

if (version.isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_2)) {
this.source = mapping.get("Source");
this.version = mapping.get("Version");
}

validate();
}

private void validate() {
if (type != VCFHeaderLineType.Flag && countType == VCFHeaderLineCount.INTEGER && count <= 0)
throw new IllegalArgumentException(String.format("Invalid count number, with fixed count the number should be 1 or higher: key=%s name=%s type=%s desc=%s lineType=%s count=%s",
getKey(), name, type, description, lineType, count));
getKey(), name, type, description, lineType, count));
if (name == null || type == null || description == null || lineType == null)
throw new IllegalArgumentException(String.format("Invalid VCFCompoundHeaderLine: key=%s name=%s type=%s desc=%s lineType=%s",
getKey(), name, type, description, lineType));
getKey(), name, type, description, lineType));
if (name.contains("<") || name.contains(">"))
throw new IllegalArgumentException("VCFHeaderLine: ID cannot contain angle brackets");
if (name.contains("="))
Expand Down Expand Up @@ -250,6 +305,12 @@ protected String toStringEncoding() {
map.put("Number", number);
map.put("Type", type);
map.put("Description", description);
if (source != null) {
map.put("Source", source);
}
if (version != null) {
map.put("Version", version);
}
return lineType.toString() + "=" + VCFHeaderLine.toStringEncoding(map);
}

Expand Down Expand Up @@ -281,6 +342,8 @@ public int hashCode() {
result = 31 * result + description.hashCode();
result = 31 * result + type.hashCode();
result = 31 * result + lineType.hashCode();
result = 31 * result + (source != null ? source.hashCode() : 0);
result = 31 * result + (version != null ? version.hashCode() : 0);
return result;
}

Expand All @@ -303,4 +366,25 @@ public boolean sameLineTypeAndName(VCFCompoundHeaderLine other) {
*/
abstract boolean allowFlagValues();

/**
* Specify annotation source
* <p>
* This value is optional starting with VCFv4.2.
*
* @param source annotation source (case-insensitive, e.g. "dbsnp")
*/
public void setSource(final String source) {
this.source = source;
}

/**
* Specify annotation version
* <p>
* This value is optional starting with VCFv4.2.
*
* @param version exact version (e.g. "138")
*/
public void setVersion(final String version) {
this.version = version;
}
}
5 changes: 3 additions & 2 deletions src/main/java/htsjdk/variant/vcf/VCFContigHeaderLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.tribble.TribbleException;

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

Expand All @@ -49,7 +50,7 @@ public class VCFContigHeaderLine extends VCFSimpleHeaderLine {
* @param key the key for this header line
*/
public VCFContigHeaderLine(final String line, final VCFHeaderVersion version, final String key, final int contigIndex) {
super(line, version, key, null);
super(line, version, key, null, Collections.emptyList());
if (contigIndex < 0) throw new TribbleException("The contig index is less than zero.");
this.contigIndex = contigIndex;
}
Expand Down Expand Up @@ -115,4 +116,4 @@ public int compareTo(final Object other) {
return super.compareTo(other);
}
}
}
}
3 changes: 2 additions & 1 deletion src/main/java/htsjdk/variant/vcf/VCFFilterHeaderLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
package htsjdk.variant.vcf;

import java.util.Arrays;
import java.util.Collections;

/**
* @author ebanks
Expand Down Expand Up @@ -61,7 +62,7 @@ public VCFFilterHeaderLine(final String name) {
* @param version the vcf header version
*/
public VCFFilterHeaderLine(final String line, final VCFHeaderVersion version) {
super(line, version, "FILTER", Arrays.asList("ID", "Description"));
super(line, version, "FILTER", Arrays.asList("ID", "Description"), Collections.emptyList());
}

@Override
Expand Down
6 changes: 4 additions & 2 deletions src/main/java/htsjdk/variant/vcf/VCFHeaderLine.java
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,9 @@ public static String toStringEncoding(Map<String, ? extends Object> keyValues) {
builder.append('=');
builder.append(entry.getValue().toString().contains(",") ||
entry.getValue().toString().contains(" ") ||
entry.getKey().equals("Description") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue());
entry.getKey().equals("Description") ||
entry.getKey().equals("Source") || // As per VCFv4.2, Source and Version should be surrounded by double quotes
entry.getKey().equals("Version") ? "\""+ escapeQuotes(entry.getValue().toString()) + "\"" : entry.getValue());
}
builder.append('>');
return builder.toString();
Expand All @@ -172,4 +174,4 @@ private static String escapeQuotes(final String value) {
// with: the thing that wasn't a backslash ($1), followed by a backslash, followed by a double quote
return value.replaceAll("([^\\\\])\"", "$1\\\\\"");
}
}
}
71 changes: 57 additions & 14 deletions src/main/java/htsjdk/variant/vcf/VCFHeaderLineTranslator.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

import htsjdk.tribble.TribbleException;

import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
Expand All @@ -48,27 +49,54 @@ public class VCFHeaderLineTranslator {
}

public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder) {
return mapping.get(version).parseLine(valueLine,expectedTagOrder);
return parseLine(version, valueLine, expectedTagOrder, Collections.emptyList());
}

public static Map<String,String> parseLine(VCFHeaderVersion version, String valueLine, List<String> expectedTagOrder, List<String> recommendedTags) {
return mapping.get(version).parseLine(valueLine, expectedTagOrder, recommendedTags);
}
}


interface VCFLineParser {
public Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder);
/**
* parse a VCF line
*
* @see #parseLine(String, List, List) VCFv4.2+ recommended tags support
*
* @param valueLine the line
* @param expectedTagOrder List of expected tags
* @return a mapping of the tags parsed out
*/
default Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder) {
return parseLine(valueLine, expectedTagOrder, Collections.emptyList());
}

/**
* parse a VCF line
*
* The recommended tags were introduced in VCFv4.2.
* Older implementations may throw an exception when the recommendedTags field is not empty.
*
* We use a list to represent tags as we assume there will be a very small amount of them,
* so using a {@code Set} is overhead.
*
* @param valueLine the line
* @param expectedTagOrder List of expected tags
* @param recommendedTags List of tags that may or may not be present. Use an empty list instead of NULL for none.
* @return a mapping of the tags parsed out
*/
Map<String,String> parseLine(String valueLine, List<String> expectedTagOrder, List<String> recommendedTags);
}


/**
* a class that handles the to and from disk for VCF 4 lines
*/
class VCF4Parser implements VCFLineParser {
/**
* parse a VCF4 line
* @param valueLine the line
* @return a mapping of the tags parsed out
*/

@Override
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder, List<String> recommendedTags) {
// our return map
Map<String, String> ret = new LinkedHashMap<String, String>();

Expand Down Expand Up @@ -132,11 +160,22 @@ public Map<String, String> parseLine(String valueLine, List<String> expectedTagO
// validate the tags against the expected list
index = 0;
if ( expectedTagOrder != null ) {
if ( ret.size() > expectedTagOrder.size() )
throw new TribbleException.InvalidHeader("unexpected tag count " + ret.size() + " in line " + valueLine);
if (ret.keySet().isEmpty() && !expectedTagOrder.isEmpty()) {
throw new TribbleException.InvalidHeader("Header with no tags is not supported when there are expected tags in line " + valueLine);
}
for ( String str : ret.keySet() ) {
if ( !expectedTagOrder.get(index).equals(str) )
throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine);
if (index < expectedTagOrder.size()) {
if (!expectedTagOrder.get(index).equals(str)) {
if (expectedTagOrder.contains(str)) {
throw new TribbleException.InvalidHeader("Tag " + str + " in wrong order (was #" + (index+1) + ", expected #" + (expectedTagOrder.indexOf(str)+1) + ") in line " + valueLine);
} else if (recommendedTags.contains(str)) {
throw new TribbleException.InvalidHeader("Recommended tag " + str + " must be listed after all expected tags in line " + valueLine);
}
else {
throw new TribbleException.InvalidHeader("Unexpected tag " + str + " in line " + valueLine);
}
}
}
index++;
}
}
Expand All @@ -147,7 +186,11 @@ public Map<String, String> parseLine(String valueLine, List<String> expectedTagO
class VCF3Parser implements VCFLineParser {

@Override
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder) {
public Map<String, String> parseLine(String valueLine, List<String> expectedTagOrder, List<String> recommendedTags) {
if (!recommendedTags.isEmpty()) {
throw new TribbleException.InternalCodecException("Recommended tags are not allowed in VCFv3.x");
}

// our return map
Map<String, String> ret = new LinkedHashMap<String, String>();

Expand Down Expand Up @@ -182,4 +225,4 @@ public Map<String, String> parseLine(String valueLine, List<String> expectedTagO
}
return ret;
}
}
}
Loading

0 comments on commit 93250d5

Please sign in to comment.