Skip to content

Commit

Permalink
disallowing bad characters in SamRecord names (#1238)
Browse files Browse the repository at this point in the history
* implementing the spec change in samtools/hts-specs#333
* this disallows a number of characters from reference sequence names, these are characters like ',' which do not appear in standard references and which cause parsing issues if they are allowed
  • Loading branch information
Yossi Farjoun authored and lbergelson committed Dec 10, 2018
1 parent 6ac7a60 commit 8cc1e37
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 17 deletions.
29 changes: 16 additions & 13 deletions src/main/java/htsjdk/samtools/SAMSequenceRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
package htsjdk.samtools;


import htsjdk.variant.variantcontext.VariantContext;

import java.math.BigInteger;
import java.net.URI;
import java.net.URISyntaxException;
Expand Down Expand Up @@ -57,23 +59,27 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea


/**
* This is not a valid sequence name, because it is reserved in the MRNM field of SAM text format
* This is not a valid sequence name, because it is reserved in the RNEXT field of SAM text format
* to mean "same reference as RNAME field."
*/
public static final String RESERVED_MRNM_SEQUENCE_NAME = "=";

public static final String RESERVED_RNEXT_SEQUENCE_NAME = "=";

/* use RESERVED_RNEXT_SEQUENCE_NAME instead. */
@Deprecated
public static final String RESERVED_MRNM_SEQUENCE_NAME = RESERVED_RNEXT_SEQUENCE_NAME;

/**
* The standard tags are stored in text header without type information, because the type of these tags is known.
*/
public static final Set<String> STANDARD_TAGS =
new HashSet<String>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG,
SPECIES_TAG));
new HashSet<>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, SPECIES_TAG));

// Split on any whitespace
private static final Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s");
// These are the chars matched by \\s.
private static final char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab

private static final Pattern LEGAL_RNAME_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*");

/**
* @deprecated Use {@link #SAMSequenceRecord(String, int)} instead.
* sequenceLength is required for the object to be considered valid.
Expand All @@ -85,9 +91,6 @@ public SAMSequenceRecord(final String name) {

public SAMSequenceRecord(final String name, final int sequenceLength) {
if (name != null) {
if (SEQUENCE_NAME_SPLITTER.matcher(name).find()) {
throw new SAMException("Sequence name contains invalid character: " + name);
}
validateSequenceName(name);
mSequenceName = name.intern();
} else {
Expand Down Expand Up @@ -188,8 +191,8 @@ public final SAMSequenceRecord clone() {
public static String truncateSequenceName(final String sequenceName) {
/*
* Instead of using regex split, do it manually for better performance.
return SEQUENCE_NAME_SPLITTER.split(sequenceName, 2)[0];
*/
*/

int truncateAt = sequenceName.length();
for (final char c : WHITESPACE_CHARS) {
int index = sequenceName.indexOf(c);
Expand All @@ -204,8 +207,8 @@ public static String truncateSequenceName(final String sequenceName) {
* Throw an exception if the sequence name is not valid.
*/
public static void validateSequenceName(final String name) {
if (RESERVED_MRNM_SEQUENCE_NAME.equals(name)) {
throw new SAMException("'" + RESERVED_MRNM_SEQUENCE_NAME + "' is not a valid sequence name");
if (!LEGAL_RNAME_PATTERN.matcher(name).useAnchoringBounds(true).matches()) {
throw new SAMException(String.format("Sequence name '%s' doesn't match regex: '%s' ", name, LEGAL_RNAME_PATTERN));
}
}

Expand Down
23 changes: 22 additions & 1 deletion src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,32 @@ public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceR
}

@Test
public void testSetAndCheckDescription(){
public void testSetAndCheckDescription() {
final SAMSequenceRecord record = new SAMSequenceRecord("Test", 1000);
Assert.assertNull(record.getDescription());
final String description = "A description.";
record.setDescription(description);
Assert.assertEquals(record.getDescription(), description);
}

@DataProvider
public Object[][] illegalSequenceNames(){
return new Object[][]{
{"space "},
{"comma,"},
{"lbrace["},
{"rbrace]"},
{"slash\\"},
{"smaller<"},
{"bigger<"},
{"lparen("},
{"rparen)"},
{"lbracket{"},
{"rbracket}"}};
}

@Test(dataProvider = "illegalSequenceNames", expectedExceptions = SAMException.class)
public void testIllegalSequenceNames(final String sequenceName){
new SAMSequenceRecord(sequenceName,100);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public class SequenceNameTruncationAndValidationTest extends HtsjdkTest {
@Test(expectedExceptions = {SAMException.class}, dataProvider = "badSequenceNames")
public void testSequenceRecordThrowsWhenInvalid(final String sequenceName) {
new SAMSequenceRecord(sequenceName, 123);
Assert.fail("Should not reach here.");
Assert.fail("Should not reach here. Sequence " + sequenceName + " should have failed.");
}

@DataProvider(name = "badSequenceNames")
Expand All @@ -53,7 +53,13 @@ public Object[][] badSequenceNames() {
{"\t"},
{"\n"},
{"="},
{"Hi, Mom!"}
{"Hi: Mom!"},
{"=Hi:Mom!"},
{"Hi:'Mom!"},
{"Hi:\"Mom!"},
{"Hi:)Mom!"},
{"Hi:(Mom!"},
{"Hi,@Mom!"}
};
}

Expand All @@ -65,7 +71,8 @@ public void testSequenceRecordPositiveTest(final String sequenceName) {
@DataProvider(name = "goodSequenceNames")
public Object[][] goodSequenceNames() {
return new Object[][]{
{"Hi,@Mom!"}
{"Hi:@Mom!"},
{"Hi:=Mom!"}
};
}

Expand Down

0 comments on commit 8cc1e37

Please sign in to comment.