Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ADAM-1697] Expand Illumina metadata regex to cover interleaved index sequences. #2010

Merged
merged 1 commit into from
Jul 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public final class InterleavedFastqInputFormat extends FastqInputFormat {
*/
private static class InterleavedFastqRecordReader extends FastqRecordReader {

private final String firstReadSuffix = ".+([/ +_]1| 1:[YN]:[02468]+:[0-9]+)$";
/**
 * Regex source for recognizing a first-of-pair read name. A name matches when it
 * ends in "/1", " 1", "+1" or "_1", or in Illumina-style metadata of the form
 * " 1:[YN]:&lt;one or more of 0,2,4,6,8&gt;:&lt;one or more digits or A/C/T/G&gt;".
 *
 * NOTE(review): the final field presumably carries either a sample number or an
 * index sequence; an index containing 'N' or a '+'-joined dual index would not
 * match this character class -- confirm whether that is intended.
 */
private final String firstReadSuffix = ".+([/ +_]1| 1:[YN]:[02468]+:[0-9ACTG]+)$";
private Pattern firstReadRegex;

InterleavedFastqRecordReader(final Configuration conf,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
@HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 1:N:0:ATCACG
TCTGTGTAAATTACCCAGCCTCACGTATTCCTTTAGAGCAATGCAAAACAGACTAGACAAAAGGCTTTTAAAAGTCTAATCTGAGATTCCTGACCAAATGT
+
CCCFFFFFHHHHHJJJJJJJJJJJJHIJJJJJJJJJIJJJJJJJJJJJJJJJJJJJHIJGHJIJJIJJJJJHHHHHHHFFFFFFFEDDEEEEDDDDDDDDD
@HISEQ_HU01:89:H7YRLADXX:1:1101:1116:2123 2:N:0:ATCACG
NTAATAATGAGTGCACAATAGTTTTTCTCCTGAAACATAATTATTCTCTCAATCATCCCCATCCCCACCAAAGTCAATCACGGGAAGATCAATCAGCCTGC
+
#1=DFFFFHHHFHIJJIIIJJJIJJJIJJJJJJJIJJJJJJIJJJJJJJJJJJJJIJJJJJJJJJJIJIJIJJHHHHHHHFFFDDDDDDDDDDDDDDDDDD
@HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 1:N:0:ATCACG
AAGCTGACTTGTGTTGGGAGCTCATCTGTTCCCTTGACTTCTCTTTTTCCAGTTCTTCGTCAAGGCCACAGGTGCTGCGGGAAAATCAGTAACTAATGAAC
+
@C@FFFDFFFDFBHGDHH;EHHGHE?EBHCHIGGI>BFFECGGIIIIGIHEBBGCHHIG);;FEEADGH<ACAEDDBCCCBBCCCBCCC>@CCDCCC@CCC
@HISEQ_HU01:89:H7YRLADXX:1:1101:1129:2182 2:N:0:ATCACG
TCCTCCCACTTCTGTCTCCCTCAGCAGCCTCTCATATTGCTGCTGTCTGCCTGGCCTATAGGCTTCTGAGTTATGACACTGGTGTGAAGAGAAAAGGCTTN
+
1?@DABDDDDF+AE?EBFHIIII>G>?;8?3?EEF<FF9@<DFGDD9?D93?BD889=)=<C3=C)=7DAD77=?7?A:EB2?96;@DB@?=;>B;5<5(:
@HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 1:N:0:ATCACG
AGCTGACATGAGAAAAGCCTGGTAAATCCGGGGCAAGTGACTGAAATGAAAGAATCCAATCAGATTCCAGCTCCAAGGGCCGCTAATTGTAGTAACTGGCT
+
CCCFDFFFHFFFFGIIGGHJEB<CEFIJJIIIHGECG9BFDHGGIGHJFHIGEHHIIAHHGGEEFDDDFDBCEDECCBB@5=BDDDDEC:>@DDEDE@>??
@HISEQ_HU01:89:H7YRLADXX:1:1101:1158:2217 2:N:0:ATCACG
ATATTAAGCCACTTGCAGCAAGACAGCCTGAAACTTCGTGACTCCCTGGAGCTTTTGGTGGTGGACGAAGCTGACCTTCTTTTTTCCTTTGGCTTTGANNN
+
C@CFDFFFHGHGHIJJJIGHHHIJJJIIEHIJJJJJIHDFGBFGIIJGGHIGIJJGEHCFH@EHEEEDFCDCDDDDDDDDDDDDDDDDDDCDDCDDCDDCC
@HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 1:N:0:ATCACG
TCATTCCACATCTCAATCTCTCCTAGGAAGTTTTCCGGCCTTGTTGACAGGTTTAATTGAAAGGAGAAGCCAAATGTTGAGTAAACAGATTGCAAAAACTG
+
CCCFFFFFHHFHHJJJJJIJIIJHJJJJJJIIJIJJGIIIIJIJJJJJJJJBFIIJJJJJJJJJJJJJJHHHHHFF?DFFFEEEEEEDDCDDDDDDDDDDC
@HISEQ_HU01:89:H7YRLADXX:1:1101:1499:2087 2:N:0:ATCACG
NNNNNNNNNNNNNNNNNNNNNGGATAANNANCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
+
#####################22@@??##1#0############.############################################++8::<======
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ class FragmentRDDSuite extends ADAMFunSuite {
checkSave(variantContexts)
}

sparkTest("Paired read names with index sequences in read names can grouped into fragments") {
sparkTest("paired read names with index sequences in read names can group into fragments") {
val path1 = testFile("read_names_with_index_sequences_pair1.fq")
val path2 = testFile("read_names_with_index_sequences_pair2.fq")
val fragments = sc.loadPairedFastq(path1, path2).toFragments()
Expand All @@ -588,4 +588,26 @@ class FragmentRDDSuite extends ADAMFunSuite {
assert(fragment.getAlignments.size() == 2)
})
}

sparkTest("interleaved paired read names with index sequences in read names can group into fragments") {
  // Load the interleaved FASTQ fixture as reads, then regroup the reads into fragments.
  val interleavedPath = testFile("read_names_with_index_sequences_interleaved.fq")
  val grouped = sc.loadInterleavedFastq(interleavedPath).toFragments()

  // The fixture is expected to yield exactly four fragments.
  assert(grouped.rdd.count() == 4)

  // Each fragment must hold both reads of its pair.
  for (fragment <- grouped.rdd.collect()) {
    assert(fragment.getAlignments.size() == 2)
  }
}

sparkTest("interleaved paired read names with index sequences in read names as fragments") {
  // Load the interleaved FASTQ fixture directly as fragments (no intermediate reads step).
  val interleavedPath = testFile("read_names_with_index_sequences_interleaved.fq")
  val loaded = sc.loadInterleavedFastqAsFragments(interleavedPath)

  // The fixture is expected to yield exactly four fragments.
  assert(loaded.rdd.count() == 4)

  // Each fragment must hold both reads of its pair.
  val collected = loaded.rdd.collect()
  collected.foreach { fragment =>
    assert(fragment.getAlignments.size() == 2)
  }
}
}