From a8962fdd53006659268849c5b0cd81e7f92aa859 Mon Sep 17 00:00:00 2001 From: Michael L Heuer Date: Thu, 5 Jul 2018 15:11:40 -0500 Subject: [PATCH] Expand Illumina metadata regex to cover interleaved index sequences. --- .../adam/io/InterleavedFastqInputFormat.java | 2 +- ..._names_with_index_sequences_interleaved.fq | 32 +++++++++++++++++++ .../adam/rdd/fragment/FragmentRDDSuite.scala | 24 +++++++++++++- 3 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 adam-core/src/test/resources/read_names_with_index_sequences_interleaved.fq diff --git a/adam-core/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java b/adam-core/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java index 6b33c8433b..c5ab4e2b72 100755 --- a/adam-core/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java +++ b/adam-core/src/main/java/org/bdgenomics/adam/io/InterleavedFastqInputFormat.java @@ -58,7 +58,7 @@ public final class InterleavedFastqInputFormat extends FastqInputFormat { */ private static class InterleavedFastqRecordReader extends FastqRecordReader { - private final String firstReadSuffix = ".+([/ +_]1| 1:[YN]:[02468]+:[0-9]+)$"; + private final String firstReadSuffix = ".+([/ +_]1| 1:[YN]:[02468]+:[0-9ACTG]+)$"; private Pattern firstReadRegex; InterleavedFastqRecordReader(final Configuration conf, diff --git a/adam-core/src/test/resources/read_names_with_index_sequences_interleaved.fq b/adam-core/src/test/resources/read_names_with_index_sequences_interleaved.fq new file mode 100644 index 0000000000..65b99fe4b4 --- /dev/null +++ b/adam-core/src/test/resources/read_names_with_index_sequences_interleaved.fq @@ -0,0 +1,32 @@ +@HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 1:N:0:ATCACG +TCTGTGTAAATTACCCAGCCTCACGTATTCCTTTAGAGCAATGCAAAACAGACTAGACAAAAGGCTTTTAAAAGTCTAATCTGAGATTCCTGACCAAATGT ++ +CCCFFFFFHHHHHJJJJJJJJJJJJHIJJJJJJJJJIJJJJJJJJJJJJJJJJJJJHIJGHJIJJIJJJJJHHHHHHHFFFFFFFEDDEEEEDDDDDDDDD +@HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 
2:N:0:ATCACG +NTAATAATGAGTGCACAATAGTTTTTCTCCTGAAACATAATTATTCTCTCAATCATCCCCATCCCCACCAAAGTCAATCACGGGAAGATCAATCAGCCTGC ++ +#1=DFFFFHHHFHIJJIIIJJJIJJJIJJJJJJJIJJJJJJIJJJJJJJJJJJJJIJJJJJJJJJJIJIJIJJHHHHHHHFFFDDDDDDDDDDDDDDDDDD +@HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 1:N:0:ATCACG +AAGCTGACTTGTGTTGGGAGCTCATCTGTTCCCTTGACTTCTCTTTTTCCAGTTCTTCGTCAAGGCCACAGGTGCTGCGGGAAAATCAGTAACTAATGAAC ++ +@C@FFFDFFFDFBHGDHH;EHHGHE?EBHCHIGGI>BFFECGGIIIIGIHEBBGCHHIG);;FEEADGH@CCDCCC@CCC +@HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 2:N:0:ATCACG +TCCTCCCACTTCTGTCTCCCTCAGCAGCCTCTCATATTGCTGCTGTCTGCCTGGCCTATAGGCTTCTGAGTTATGACACTGGTGTGAAGAGAAAAGGCTTN ++ +1?@DABDDDDF+AE?EBFHIIII>G>?;8?3?EEFB;5<5(: +@HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 1:N:0:ATCACG +AGCTGACATGAGAAAAGCCTGGTAAATCCGGGGCAAGTGACTGAAATGAAAGAATCCAATCAGATTCCAGCTCCAAGGGCCGCTAATTGTAGTAACTGGCT ++ +CCCFDFFFHFFFFGIIGGHJEB@DDEDE@>?? +@HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 2:N:0:ATCACG +ATATTAAGCCACTTGCAGCAAGACAGCCTGAAACTTCGTGACTCCCTGGAGCTTTTGGTGGTGGACGAAGCTGACCTTCTTTTTTCCTTTGGCTTTGANNN ++ +C@CFDFFFHGHGHIJJJIGHHHIJJJIIEHIJJJJJIHDFGBFGIIJGGHIGIJJGEHCFH@EHEEEDFCDCDDDDDDDDDDDDDDDDDDCDDCDDCDDCC +@HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 1:N:0:ATCACG +TCATTCCACATCTCAATCTCTCCTAGGAAGTTTTCCGGCCTTGTTGACAGGTTTAATTGAAAGGAGAAGCCAAATGTTGAGTAAACAGATTGCAAAAACTG ++ +CCCFFFFFHHFHHJJJJJIJIIJHJJJJJJIIJIJJGIIIIJIJJJJJJJJBFIIJJJJJJJJJJJJJJHHHHHFF?DFFFEEEEEEDDCDDDDDDDDDDC +@HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 2:N:0:ATCACG +NNNNNNNNNNNNNNNNNNNNNGGATAANNANCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN ++ +#####################22@@??##1#0############.############################################++8::<====== diff --git a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala index 312ad6ae40..91f0522130 100644 --- a/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala +++ 
b/adam-core/src/test/scala/org/bdgenomics/adam/rdd/fragment/FragmentRDDSuite.scala @@ -577,7 +577,7 @@ class FragmentRDDSuite extends ADAMFunSuite { checkSave(variantContexts) } - sparkTest("Paired read names with index sequences in read names can grouped into fragments") { + sparkTest("paired read names with index sequences in read names can group into fragments") { val path1 = testFile("read_names_with_index_sequences_pair1.fq") val path2 = testFile("read_names_with_index_sequences_pair2.fq") val fragments = sc.loadPairedFastq(path1, path2).toFragments() @@ -588,4 +588,26 @@ class FragmentRDDSuite extends ADAMFunSuite { assert(fragment.getAlignments.size() == 2) }) } + + sparkTest("interleaved paired read names with index sequences in read names can group into fragments") { /* Reads the interleaved FASTQ fixture with loadInterleavedFastq, converts the result with toFragments(), and expects 4 fragments holding exactly 2 alignments each. */ + val path = testFile("read_names_with_index_sequences_interleaved.fq") + val fragments = sc.loadInterleavedFastq(path).toFragments() + + assert(fragments.rdd.count() == 4) + + fragments.rdd.collect().foreach(fragment => { + assert(fragment.getAlignments.size() == 2) + }) + } + + sparkTest("interleaved paired read names with index sequences in read names as fragments") { /* Reads the same interleaved FASTQ fixture directly with loadInterleavedFastqAsFragments and expects 4 fragments holding exactly 2 alignments each. */ + val path = testFile("read_names_with_index_sequences_interleaved.fq") + val fragments = sc.loadInterleavedFastqAsFragments(path) + + assert(fragments.rdd.count() == 4) + + fragments.rdd.collect().foreach(fragment => { + assert(fragment.getAlignments.size() == 2) + }) + } }