From 8cc1e37ccb480fdbf9cbbae3e6a90aa0078de309 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 10 Dec 2018 15:50:44 -0500 Subject: [PATCH] disallowing bad characters in SamRecord names (#1238) * implementing the spec change in https://github.com/samtools/hts-specs/pull/333 * this disallows a number of characters from reference sequence names, these are characters like ',' which do not appear in standard references and which cause parsing issues if they are allowed --- .../htsjdk/samtools/SAMSequenceRecord.java | 29 ++++++++++--------- .../samtools/SAMSequenceRecordTest.java | 23 ++++++++++++++- ...quenceNameTruncationAndValidationTest.java | 13 +++++++-- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMSequenceRecord.java b/src/main/java/htsjdk/samtools/SAMSequenceRecord.java index 3db0c497f3..358bbf502f 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceRecord.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceRecord.java @@ -24,6 +24,8 @@ package htsjdk.samtools; +import htsjdk.variant.variantcontext.VariantContext; + import java.math.BigInteger; import java.net.URI; import java.net.URISyntaxException; @@ -57,23 +59,27 @@ public class SAMSequenceRecord extends AbstractSAMHeaderRecord implements Clonea /** - * This is not a valid sequence name, because it is reserved in the MRNM field of SAM text format + * This is not a valid sequence name, because it is reserved in the RNEXT field of SAM text format * to mean "same reference as RNAME field." */ - public static final String RESERVED_MRNM_SEQUENCE_NAME = "="; + + public static final String RESERVED_RNEXT_SEQUENCE_NAME = "="; + + /* use RESERVED_RNEXT_SEQUENCE_NAME instead. */ + @Deprecated + public static final String RESERVED_MRNM_SEQUENCE_NAME = RESERVED_RNEXT_SEQUENCE_NAME; /** * The standard tags are stored in text header without type information, because the type of these tags is known. */ public static final Set STANDARD_TAGS = - new HashSet(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, - SPECIES_TAG)); + new HashSet<>(Arrays.asList(SEQUENCE_NAME_TAG, SEQUENCE_LENGTH_TAG, ASSEMBLY_TAG, MD5_TAG, URI_TAG, SPECIES_TAG)); - // Split on any whitespace - private static final Pattern SEQUENCE_NAME_SPLITTER = Pattern.compile("\\s"); // These are the chars matched by \\s. private static final char[] WHITESPACE_CHARS = {' ', '\t', '\n', '\013', '\f', '\r'}; // \013 is vertical tab + private static final Pattern LEGAL_RNAME_PATTERN = Pattern.compile("[0-9A-Za-z!#$%&+./:;?@^_|~-][0-9A-Za-z!#$%&*+./:;=?@^_|~-]*"); + /** * @deprecated Use {@link #SAMSequenceRecord(String, int)} instead. * sequenceLength is required for the object to be considered valid. @@ -85,9 +91,6 @@ public SAMSequenceRecord(final String name) { public SAMSequenceRecord(final String name, final int sequenceLength) { if (name != null) { - if (SEQUENCE_NAME_SPLITTER.matcher(name).find()) { - throw new SAMException("Sequence name contains invalid character: " + name); - } validateSequenceName(name); mSequenceName = name.intern(); } else { @@ -188,8 +191,8 @@ public final SAMSequenceRecord clone() { public static String truncateSequenceName(final String sequenceName) { /* * Instead of using regex split, do it manually for better performance. - return SEQUENCE_NAME_SPLITTER.split(sequenceName, 2)[0]; - */ + */ + int truncateAt = sequenceName.length(); for (final char c : WHITESPACE_CHARS) { int index = sequenceName.indexOf(c); @@ -204,8 +207,8 @@ public static String truncateSequenceName(final String sequenceName) { * Throw an exception if the sequence name is not valid. */ public static void validateSequenceName(final String name) { - if (RESERVED_MRNM_SEQUENCE_NAME.equals(name)) { - throw new SAMException("'" + RESERVED_MRNM_SEQUENCE_NAME + "' is not a valid sequence name"); + if (!LEGAL_RNAME_PATTERN.matcher(name).useAnchoringBounds(true).matches()) { + throw new SAMException(String.format("Sequence name '%s' doesn't match regex: '%s' ", name, LEGAL_RNAME_PATTERN)); } } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java index 01d77d42fa..f9a10bcc91 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceRecordTest.java @@ -85,11 +85,32 @@ public void testIsSameSequence(final SAMSequenceRecord rec1 , final SAMSequenceR } @Test - public void testSetAndCheckDescription(){ + public void testSetAndCheckDescription() { final SAMSequenceRecord record = new SAMSequenceRecord("Test", 1000); Assert.assertNull(record.getDescription()); final String description = "A description."; record.setDescription(description); Assert.assertEquals(record.getDescription(), description); } + + @DataProvider + public Object[][] illegalSequenceNames(){ + return new Object[][]{ + {"space "}, + {"comma,"}, + {"lbrace["}, + {"rbrace]"}, + {"slash\\"}, + {"smaller<"}, + {"bigger<"}, + {"lparen("}, + {"rparen)"}, + {"lbracket{"}, + {"rbracket}"}}; + } + + @Test(dataProvider = "illegalSequenceNames", expectedExceptions = SAMException.class) + public void testIllegalSequenceNames(final String sequenceName){ + new SAMSequenceRecord(sequenceName,100); + } } diff --git a/src/test/java/htsjdk/samtools/SequenceNameTruncationAndValidationTest.java b/src/test/java/htsjdk/samtools/SequenceNameTruncationAndValidationTest.java index 01999c481c..80f9997ce3 100644 --- a/src/test/java/htsjdk/samtools/SequenceNameTruncationAndValidationTest.java +++ b/src/test/java/htsjdk/samtools/SequenceNameTruncationAndValidationTest.java @@ -43,7 +43,7 @@ public class SequenceNameTruncationAndValidationTest extends HtsjdkTest { @Test(expectedExceptions = {SAMException.class}, dataProvider = "badSequenceNames") public void testSequenceRecordThrowsWhenInvalid(final String sequenceName) { new SAMSequenceRecord(sequenceName, 123); - Assert.fail("Should not reach here."); + Assert.fail("Should not reach here. Sequence " + sequenceName + " should have failed."); } @DataProvider(name = "badSequenceNames") @@ -53,7 +53,13 @@ public Object[][] badSequenceNames() { {"\t"}, {"\n"}, {"="}, - {"Hi, Mom!"} + {"Hi: Mom!"}, + {"=Hi:Mom!"}, + {"Hi:'Mom!"}, + {"Hi:\"Mom!"}, + {"Hi:)Mom!"}, + {"Hi:(Mom!"}, + {"Hi,@Mom!"} }; } @@ -65,7 +71,8 @@ public void testSequenceRecordPositiveTest(final String sequenceName) { @DataProvider(name = "goodSequenceNames") public Object[][] goodSequenceNames() { return new Object[][]{ - {"Hi,@Mom!"} + {"Hi:@Mom!"}, + {"Hi:=Mom!"} }; }