diff --git a/README.md b/README.md
index 28e89bf..36c6f82 100644
--- a/README.md
+++ b/README.md
@@ -12,33 +12,39 @@ Needletail's goal is to be as fast as the [readfq](https://github.com/lh3/readfq
 
 ```rust
 extern crate needletail;
+use needletail::{parse_sequences, Sequence};
 use std::env;
 use std::fs::File;
-use needletail::parse_sequences;
 
 fn main() {
-    let filename: String = env::args().nth(1).unwrap();
-
-    let mut n_bases = 0;
-    let mut n_valid_kmers = 0;
-    parse_sequences(File::open(filename).expect("missing file"), |_| {}, |seq| {
-        // seq.id is the name of the record
-        // seq.seq is the base sequence
-        // seq.qual is an optional quality score
-
-        // keep track of the total number of bases
-        n_bases += seq.seq.len();
-
-        // keep track of the number of AAAA (or TTTT via canonicalization) in the
-        // file (normalize makes sure ever base is capitalized for comparison)
-        for (_, kmer, _) in seq.normalize(false).kmers(4, true) {
-            if kmer == b"AAAA" {
-                n_valid_kmers += 1;
-            }
-        }
-    }).expect("parsing failed");
-    println!("There are {} bases in your file.", n_bases);
-    println!("There are {} AAAAs in your file.", n_valid_kmers);
+    let filename: String = env::args().nth(1).unwrap();
+
+    let mut n_bases = 0;
+    let mut n_valid_kmers = 0;
+    parse_sequences(
+        File::open(filename).expect("missing file"),
+        |_| {},
+        |seq| {
+            // seq.id is the name of the record
+            // seq.seq is the base sequence
+            // seq.qual is an optional quality score
+
+            // keep track of the total number of bases
+            n_bases += seq.seq.len();
+
+            // keep track of the number of AAAA (or TTTT via canonicalization) in the
+            // file (normalize makes sure every base is capitalized for comparison)
+            let rc = seq.reverse_complement();
+            for (_, kmer, _) in seq.normalize(false).canonical_kmers(4, &rc) {
+                if kmer == b"AAAA" {
+                    n_valid_kmers += 1;
+                }
+            }
+        },
+    )
+    .expect("parsing failed");
+    println!("There are {} bases in your file.", n_bases);
+    println!("There are {} AAAAs in your file.", n_valid_kmers);
 }
 ```
diff --git a/benches/benchmark.rs b/benches/benchmark.rs
index 3a02877..0ec83a8 100644
--- a/benches/benchmark.rs
+++ b/benches/benchmark.rs
@@ -4,6 +4,7 @@ extern crate needletail;
 use criterion::Criterion;
 use needletail::parse_sequences;
+use needletail::seq::Sequence;
 use std::fs::File;
 use std::io::{Cursor, Read};
 
@@ -28,8 +29,10 @@ fn bench_kmer_speed(c: &mut Criterion) {
     parse_sequences(
         fasta_data,
         |_| {},
-        |seq| {
-            for (_, _kmer, was_rc) in seq.normalize(true).kmers(ksize, true) {
+        |rec| {
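+            // The callback now receives a full record: normalize the bases and
+            // precompute the reverse complement once, since `canonical_kmers`
+            // (from the `Sequence` trait) takes it as an argument instead of
+            // building it internally.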
+            let seq = rec.seq.normalize(true);
+            let rc = seq.reverse_complement();
+            for (_, _kmer, was_rc) in seq.canonical_kmers(ksize, &rc) {
                 if !was_rc {
                     n_canonical += 1;
                 }
@@ -192,13 +195,11 @@ fn bench_fasta_file(c: &mut Criterion) {
     group.bench_function("Needletail (No Buffer)", |bench| {
         use needletail::formats::{FastaParser, RecParser};
-        use needletail::seq::Sequence;
 
         bench.iter(|| {
             let mut reader = FastaParser::from_buffer(&data, true);
             let mut n_bases = 0;
             for rec in reader.by_ref() {
-                let seq = Sequence::from(rec.unwrap());
-                n_bases += seq.seq.len();
+                n_bases += rec.unwrap().seq.strip_returns().len();
             }
             assert_eq!(738_580, n_bases);
         });
diff --git a/src/bitkmer.rs b/src/bitkmer.rs
index 971617b..4c56276 100644
--- a/src/bitkmer.rs
+++ b/src/bitkmer.rs
@@ -63,7 +63,7 @@ impl<'a> BitNuclKmer<'a> {
     pub fn new(slice: &'a [u8], k: u8, canonical: bool) -> BitNuclKmer<'a> {
         let mut kmer = (0u64, k);
         let mut start_pos = 0;
-        update_position(&mut start_pos, &mut kmer, slice, true);
+        update_position(&mut start_pos, &mut kmer, &slice, true);
 
         BitNuclKmer {
             start_pos,
@@ -78,7 +78,7 @@ impl<'a> Iterator for BitNuclKmer<'a> {
     type Item = (usize, BitKmer, bool);
 
     fn next(&mut self) -> Option<(usize, BitKmer, bool)> {
-        if !update_position(&mut self.start_pos, &mut self.cur_kmer, self.buffer, false) {
+        if !update_position(&mut self.start_pos, &mut self.cur_kmer, &self.buffer, false) {
             return None;
         }
         self.start_pos += 1;
diff --git a/src/formats/fasta.rs b/src/formats/fasta.rs
index 642df02..ae0738a 100644
--- a/src/formats/fasta.rs
+++ b/src/formats/fasta.rs
@@ -4,8 +4,8 @@ use std::io::Write;
 use memchr::memchr;
 
 use crate::formats::buffer::RecParser;
-use crate::seq::Sequence;
-use crate::util::{memchr_both_last, strip_whitespace, ParseError, ParseErrorType};
+use crate::seq::{Sequence, SequenceRecord};
+use crate::util::{memchr_both_last, ParseError, ParseErrorType};
 
 #[derive(Debug)]
 pub struct FastaRecord<'a> {
@@ -14,28 +14,25 @@ impl<'a> FastaRecord<'a> {
-    pub fn write(&self, writer: &mut dyn Write) -> Result<(), ParseError> {
+    pub fn write(&self, writer: &mut dyn Write, ending: &[u8]) -> Result<(), ParseError> {
         writer.write_all(b">")?;
         writer.write_all(&self.id)?;
-        writer.write_all(b"\n")?;
+        writer.write_all(ending)?;
         writer.write_all(&self.seq)?;
-        writer.write_all(b"\n")?;
+        writer.write_all(ending)?;
         Ok(())
     }
 }
 
-impl<'a> From<FastaRecord<'a>> for Sequence<'a> {
-    fn from(fasta: FastaRecord<'a>) -> Sequence<'a> {
-        Sequence::new(fasta.id, strip_whitespace(fasta.seq), None)
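+// Implementing `Sequence` for the borrowed record type replaces the old
+// `From<FastaRecord> for Sequence` conversion: the record itself now exposes
+// the trait helpers (`strip_returns`, `kmers`, `canonical_kmers`, ...).
+// A minimal usage sketch, assuming a record already parsed as `rec: FastaRecord`:
+//
+//     let seq = rec.strip_returns();   // drop any embedded newlines
+//     let n_kmers = seq.kmers(4).count();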
+impl<'a> Sequence<'a> for FastaRecord<'a> {
+    fn sequence(&self) -> &'a [u8] {
+        self.seq
     }
 }
 
-impl<'a> From<&'a Sequence<'a>> for FastaRecord<'a> {
-    fn from(seq: &'a Sequence<'a>) -> FastaRecord<'a> {
-        FastaRecord {
-            id: &seq.id,
-            seq: &seq.seq,
-        }
+impl<'a> From<FastaRecord<'a>> for SequenceRecord<'a> {
+    fn from(fasta: FastaRecord<'a>) -> SequenceRecord<'a> {
+        SequenceRecord::new(fasta.id.into(), fasta.seq.into(), None)
     }
 }
 
@@ -95,8 +92,11 @@ impl<'a> Iterator for FastaParser<'a> {
                 .context(context)));
         }
         let mut seq = &buf[id_end..seq_end];
-        if seq[seq.len() - 1] == b'\r' {
-            seq = &seq[..seq.len()];
+        if seq.len() > 0 && seq[seq.len() - 1] == b'\n' {
+            seq = &seq[..seq.len() - 1];
+        }
+        if seq.len() > 0 && seq[seq.len() - 1] == b'\r' {
+            seq = &seq[..seq.len() - 1];
         }
         self.pos += seq_end;
diff --git a/src/formats/fastq.rs b/src/formats/fastq.rs
index 7380f79..d32501a 100644
--- a/src/formats/fastq.rs
+++ b/src/formats/fastq.rs
@@ -1,4 +1,3 @@
-use std::borrow::Cow;
 use std::cmp::min;
 use std::io::Write;
 
@@ -6,7 +5,7 @@ use memchr::memchr;
 
 use crate::formats::buffer::RecParser;
 use crate::formats::fasta::check_end;
-use crate::seq::Sequence;
+use crate::seq::{Sequence, SequenceRecord};
 use crate::util::{memchr_both, ParseError, ParseErrorType};
 
 #[derive(Debug)]
@@ -18,45 +17,36 @@ pub struct FastqRecord<'a> {
 }
 
 impl<'a> FastqRecord<'a> {
-    pub fn write(&self, writer: &mut dyn Write) -> Result<(), ParseError> {
+    pub fn write(&self, writer: &mut dyn Write, ending: &[u8]) -> Result<(), ParseError> {
         writer.write_all(b"@")?;
         writer.write_all(&self.id)?;
-        writer.write_all(b"\n")?;
+        writer.write_all(ending)?;
         writer.write_all(&self.seq)?;
-        writer.write_all(b"\n+\n")?;
+        writer.write_all(ending)?;
+        writer.write_all(b"+")?;
+        writer.write_all(ending)?;
+        // this is kind of a hack, but we want to allow writing out sequences
+        // that don't have qualities, so this will mask to "good" if the quality
+        // slice is empty
         if self.seq.len() != self.qual.len() {
             writer.write_all(&vec![b'I'; self.seq.len()])?;
         } else {
             writer.write_all(&self.qual)?;
         }
-        writer.write_all(b"\n")?;
+        writer.write_all(ending)?;
         Ok(())
     }
 }
 
-impl<'a> From<FastqRecord<'a>> for Sequence<'a> {
-    fn from(fastq: FastqRecord<'a>) -> Sequence<'a> {
-        let qual = if fastq.seq.len() != fastq.qual.len() {
-            None
-        } else {
-            Some(fastq.qual)
-        };
-        Sequence::new(fastq.id, Cow::from(fastq.seq), qual)
+impl<'a> Sequence<'a> for FastqRecord<'a> {
+    fn sequence(&self) -> &'a [u8] {
+        self.seq
     }
 }
 
-impl<'a> From<&'a Sequence<'a>> for FastqRecord<'a> {
-    fn from(seq: &'a Sequence<'a>) -> FastqRecord<'a> {
-        let qual = match &seq.qual {
-            None => &b""[..],
-            Some(q) => &q,
-        };
-        FastqRecord {
-            id: &seq.id,
-            seq: &seq.seq,
-            id2: b"",
-            qual,
-        }
+impl<'a> From<FastqRecord<'a>> for SequenceRecord<'a> {
+    fn from(fastq: FastqRecord<'a>) -> SequenceRecord<'a> {
+        SequenceRecord::new(fastq.id.into(), fastq.seq.into(), Some(fastq.qual.into()))
     }
 }
diff --git a/src/formats/mod.rs b/src/formats/mod.rs
index 4dfe64a..e56786b 100644
--- a/src/formats/mod.rs
+++ b/src/formats/mod.rs
@@ -29,7 +29,7 @@ use xz2::read::XzDecoder;
 pub use crate::formats::buffer::{RecBuffer, RecParser};
 pub use crate::formats::fasta::{FastaParser, FastaRecord};
 pub use crate::formats::fastq::{FastqParser, FastqRecord};
-use crate::seq::Sequence;
+use crate::seq::SequenceRecord;
 use crate::util::{ParseError, ParseErrorType};
 
 #[macro_export]
@@ -71,7 +71,7 @@ fn seq_reader<F, R, T>(
     type_callback: &mut T,
 ) -> Result<(), ParseError>
 where
-    F: for<'a> FnMut(Sequence<'a>) -> (),
+    F: for<'a> FnMut(SequenceRecord<'a>) -> (),
     R: Read,
     T: ?Sized + FnMut(&'static str) -> (),
 {
@@ -91,10 +91,10 @@ where
 
     match file_type {
         "FASTA" => parse_stream!(reader, first, FastaParser, rec, {
-            callback(Sequence::from(rec))
+            callback(SequenceRecord::from(rec))
         }),
         "FASTQ" => parse_stream!(reader, first, FastqParser, rec, {
-            callback(Sequence::from(rec))
+            callback(SequenceRecord::from(rec))
         }),
         _ => panic!("A file type was inferred that could not be parsed"),
     };
@@ -108,7 +108,7 @@ pub fn parse_sequences<F, R, T>(
     callback: F,
 ) -> Result<(), ParseError>
 where
-    F: for<'a> FnMut(Sequence<'a>) -> (),
+    F: for<'a> FnMut(SequenceRecord<'a>) -> (),
     R: Read,
     T: FnMut(&'static str) -> (),
 {
@@ -124,7 +124,7 @@ pub fn parse_sequences<F, R, T>(
     callback: F,
 ) -> Result<(), ParseError>
 where
-    F: for<'a> FnMut(Sequence<'a>) -> (),
+    F: for<'a> FnMut(SequenceRecord<'a>) -> (),
     R: Read,
     T: FnMut(&'static str) -> (),
 {
diff --git a/src/kmer.rs b/src/kmer.rs
index 4d6162f..5f74ae1 100644
--- a/src/kmer.rs
+++ b/src/kmer.rs
@@ -101,75 +101,103 @@ pub fn is_good_base(chr: u8) -> bool {
     }
 }
 
-pub struct NuclKmer<'a> {
+pub struct Kmers<'a> {
     k: u8,
     start_pos: usize,
     buffer: &'a [u8],
-    rc_buffer: Option<&'a [u8]>,
 }
 
-fn update_position(start_pos: &mut usize, k: u8, buffer: &[u8], initial: bool) -> bool {
-    // check if we have enough "physical" space for one more kmer
-    if *start_pos + k as usize > buffer.len() {
-        return false;
+impl<'a> Kmers<'a> {
+    //! A kmer-izer for a nucleotide/amino acid sequence; returning slices to the original data
+    pub fn new(buffer: &'a [u8], k: u8) -> Self {
+        Kmers {
+            k,
+            start_pos: 0,
+            buffer,
+        }
     }
+}
 
-    let (mut kmer_len, stop_len) = if initial {
-        (0, (k - 1) as usize)
-    } else {
-        ((k - 1) as usize, k as usize)
-    };
+impl<'a> Iterator for Kmers<'a> {
+    type Item = &'a [u8];
 
-    while kmer_len < stop_len {
-        if is_good_base(buffer[*start_pos + kmer_len]) {
-            kmer_len += 1;
-        } else {
-            kmer_len = 0;
-            *start_pos += kmer_len + 1;
-            if *start_pos + k as usize > buffer.len() {
-                return false;
-            }
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.start_pos + self.k as usize > self.buffer.len() {
+            return None;
         }
+        let pos = self.start_pos;
+        self.start_pos += 1;
+        Some(&self.buffer[pos..pos + self.k as usize])
     }
-    true
 }
 
-impl<'a> NuclKmer<'a> {
-    //! A kmer-izer for a nucleotide/amino acid sequence; returning slices to the original data
-    pub fn new(buffer: &'a [u8], rc_buffer: Option<&'a [u8]>, k: u8) -> NuclKmer<'a> {
-        let mut start_pos = 0;
-        update_position(&mut start_pos, k, buffer, true);
-        NuclKmer {
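+// A minimal usage sketch (the caller supplies the precomputed reverse
+// complement; each step yields whichever of the forward or reverse-complement
+// kmer compares lexicographically smaller):
+//
+//     let seq = b"AGCT";
+//     let rc: Vec<u8> = seq.iter().rev().map(|n| complement(*n)).collect();
+//     for (_pos, kmer, _was_rc) in CanonicalKmers::new(seq, &rc, 2) { /* ... */ }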
+pub struct CanonicalKmers<'a> {
+    k: u8,
+    start_pos: usize,
+    buffer: &'a [u8],
+    rc_buffer: &'a [u8],
+}
+
+/// A kmer-izer for nucleotide sequences that returns canonical kmers.
+/// Each item is the position of the kmer, a slice into the original data, and
+/// a boolean indicating whether the returned kmer is the original or the
+/// reverse complement.
+impl<'a> CanonicalKmers<'a> {
+    pub fn new(buffer: &'a [u8], rc_buffer: &'a [u8], k: u8) -> Self {
+        let mut nucl_kmers = CanonicalKmers {
             k,
-            start_pos,
+            start_pos: 0,
             buffer,
             rc_buffer,
+        };
+        nucl_kmers.update_position(true);
+        nucl_kmers
+    }
+
+    fn update_position(&mut self, initial: bool) -> bool {
+        // check if we have enough "physical" space for one more kmer
+        if self.start_pos + self.k as usize > self.buffer.len() {
+            return false;
         }
+
+        let (mut kmer_len, stop_len) = if initial {
+            (0, (self.k - 1) as usize)
+        } else {
+            ((self.k - 1) as usize, self.k as usize)
+        };
+
+        while kmer_len < stop_len {
+            if is_good_base(self.buffer[self.start_pos + kmer_len]) {
+                kmer_len += 1;
+            } else {
+                kmer_len = 0;
+                self.start_pos += kmer_len + 1;
+                if self.start_pos + self.k as usize > self.buffer.len() {
+                    return false;
+                }
+            }
+        }
+        true
     }
 }
 
-impl<'a> Iterator for NuclKmer<'a> {
+impl<'a> Iterator for CanonicalKmers<'a> {
     type Item = (usize, &'a [u8], bool);
 
     fn next(&mut self) -> Option<(usize, &'a [u8], bool)> {
-        if !update_position(&mut self.start_pos, self.k, self.buffer, false) {
+        if !self.update_position(false) {
             return None;
         }
         let pos = self.start_pos;
         self.start_pos += 1;
 
         let result = &self.buffer[pos..pos + self.k as usize];
-        match self.rc_buffer {
-            None => Some((pos, result, false)),
-            Some(rc_buffer) => {
-                let rc_result =
-                    &rc_buffer[rc_buffer.len() - pos - self.k as usize..rc_buffer.len() - pos];
-                if result < rc_result {
-                    Some((pos, result, false))
-                } else {
-                    Some((pos, rc_result, true))
-                }
-            }
+        let rc_buffer = self.rc_buffer;
+        let rc_result = &rc_buffer[rc_buffer.len() - pos - self.k as usize..rc_buffer.len() - pos];
+        if result < rc_result {
+            Some((pos, result, false))
+        } else {
+            Some((pos, rc_result, true))
         }
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 146243d..460da28 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,4 +6,5 @@ pub mod seq;
 mod util;
 
 pub use formats::parse_sequences;
+pub use seq::{Sequence, SequenceRecord};
 pub use util::{ParseError, ParseErrorType};
diff --git a/src/seq.rs b/src/seq.rs
index 7a79e72..807083a 100644
--- a/src/seq.rs
+++ b/src/seq.rs
@@ -1,9 +1,9 @@
 use std::borrow::Cow;
 
-use memchr::memchr;
+use memchr::{memchr, memchr2};
 
 use crate::bitkmer::BitNuclKmer;
-use crate::kmer::{complement, NuclKmer};
+use crate::kmer::{complement, CanonicalKmers, Kmers};
 
 pub fn normalize(seq: &[u8], allow_iupac: bool) -> Option<Vec<u8>> {
     //! Transform a FASTX sequence into it's "normalized" form.
@@ -110,131 +110,194 @@ pub fn mask_header_utf8(id: &[u8]) -> Option<Vec<u8>> {
     }
 }
 
-/// A generic FASTX record that also abstracts over several logical operations
-/// that can be performed on nucleic acid sequences.
-#[derive(Clone, Debug)]
-pub struct Sequence<'a> {
+pub struct SequenceRecord<'a> {
     pub id: Cow<'a, [u8]>,
     pub seq: Cow<'a, [u8]>,
     pub qual: Option<Cow<'a, [u8]>>,
-    rev_seq: Option<Vec<u8>>,
 }
 
-impl<'a> Sequence<'a> {
-    pub fn new(id: &'a [u8], seq: Cow<'a, [u8]>, qual: Option<&'a [u8]>) -> Self {
-        Sequence {
-            id: id.into(),
-            seq,
-            qual: qual.map(Cow::Borrowed),
-            rev_seq: None,
+impl<'a> SequenceRecord<'a> {
+    pub fn new(id: Cow<'a, [u8]>, seq: Cow<'a, [u8]>, qual: Option<Cow<'a, [u8]>>) -> Self {
+        // there has to be a better way to do this?
+        let cleaned_seq = match seq.strip_returns() {
+            Cow::Owned(s) => Cow::Owned(s),
+            Cow::Borrowed(_) => seq,
+        };
+        SequenceRecord {
+            id,
+            seq: cleaned_seq,
+            qual,
         }
     }
+}
+
+impl<'a> From<&'a [u8]> for SequenceRecord<'a> {
+    fn from(slice: &'a [u8]) -> Self {
+        SequenceRecord::new(Cow::from(&b""[..]), slice.into(), None)
+    }
+}
+
+/// A generic FASTX record that also abstracts over several logical operations
+/// that can be performed on nucleic acid sequences.
+pub trait Sequence<'a> {
+    fn sequence(&'a self) -> &'a [u8];
+
+    /// Remove newlines from within FASTX records; currently the rate-limiting step
+    /// in FASTX parsing (in general; readfq also exhibits this behavior).
+    fn strip_returns(&'a self) -> Cow<'a, [u8]> {
+        let seq = self.sequence();
+
+        // first part is a fast check to see if we need to do any allocations
+        let mut i;
+        match memchr2(b'\r', b'\n', &seq) {
+            Some(break_loc) => i = break_loc,
+            None => return seq.into(),
+        }
+        // we found a newline; create a new buffer and strip out newlines
+        // while writing into it
+        let mut new_buf = Vec::with_capacity(seq.len() - 1);
+        new_buf.extend_from_slice(&seq[..i]);
+        while i < seq.len() {
+            match memchr2(b'\r', b'\n', &seq[i..]) {
+                None => {
+                    new_buf.extend_from_slice(&seq[i..]);
+                    break;
+                }
+                Some(match_pos) => {
+                    new_buf.extend_from_slice(&seq[i..i + match_pos]);
+                    i += match_pos + 1;
+                }
+            }
+        }
+        new_buf.into()
+    }
+
+    fn reverse_complement(&'a self) -> Vec<u8> {
+        self.sequence()
+            .iter()
+            .rev()
+            .map(|n| complement(*n))
+            .collect()
+    }
 
-    pub fn from_bytes(seq: &'a [u8]) -> Self {
-        Sequence {
-            id: b""[..].into(),
-            seq: seq.into(),
-            qual: None,
-            rev_seq: None,
+    fn normalize(&'a self, iupac: bool) -> Cow<'a, [u8]> {
+        if let Some(s) = normalize(&self.sequence(), iupac) {
+            s.into()
+        } else {
+            self.sequence().into()
         }
     }
 
+    fn canonical_kmers(&'a self, k: u8, reverse_complement: &'a [u8]) -> CanonicalKmers<'a> {
+        CanonicalKmers::new(self.sequence().as_ref(), reverse_complement, k)
+    }
+
+    fn kmers(&'a self, k: u8) -> Kmers<'a> {
+        Kmers::new(self.sequence().as_ref(), k)
+    }
+
+    /// Return an iterator that returns valid kmers in 4-bit form
+    fn bit_kmers(&'a self, k: u8, canonical: bool) -> BitNuclKmer<'a> {
+        BitNuclKmer::new(self.sequence(), k, canonical)
+    }
+}
+
+impl<'a> Sequence<'a> for &'a [u8] {
+    fn sequence(&'a self) -> &'a [u8] {
+        &self
+    }
+}
+
+impl<'a> Sequence<'a> for [u8] {
+    fn sequence(&'a self) -> &'a [u8] {
+        &self
+    }
+}
+
+impl<'a> Sequence<'a> for Cow<'a, [u8]> {
+    fn sequence(&'a self) -> &'a [u8] {
+        &self
+    }
+}
+
+impl<'a> Sequence<'a> for SequenceRecord<'a> {
+    fn sequence(&'a self) -> &'a [u8] {
+        self.seq.as_ref()
+    }
+}
+
+pub trait QualitySequence<'a>: Sequence<'a> {
+    fn quality(&'a self) -> &'a [u8];
+
     /// Given a SeqRecord and a quality cutoff, mask out low-quality bases with
     /// `N` characters.
     ///
     /// Experimental.
-    pub fn quality_mask(self, score: u8) -> Self {
-        if self.qual == None {
-            return self;
-        }
-        let qual = self.qual.unwrap().into_owned();
+    fn quality_mask(&'a self, score: u8) -> Cow<'a, [u8]> {
+        let qual = self.quality();
         // could maybe speed this up by doing a copy of base and then
         // iterating though qual and masking?
-        let seq = self
-            .seq
+        let seq: Vec<u8> = self
+            .sequence()
             .iter()
             .zip(qual.iter())
             .map(|(base, qual)| if *qual < score { b'N' } else { *base })
             .collect();
 
-        Sequence {
-            id: self.id,
-            seq,
-            qual: Some(Cow::Owned(qual)),
-            rev_seq: None,
-        }
-    }
-
-    /// Capitalize everything and mask unknown bases to N
-    pub fn normalize(mut self, iupac: bool) -> Self {
-        if let Some(seq) = normalize(&self.seq, iupac) {
-            self.seq = seq.into();
-        }
-        self
+        seq.into()
     }
+}
 
-    /// Fixes up potential problems with sequence headers including tabs being
-    /// present (may break downstream analyses with headers in TSVs) and with
-    /// non-UTF8 characters being present, e.g. non-breaking spaces on Windows
-    /// encodings (0x0A) breaks some tools.
-    pub fn mask_header(mut self) -> Self {
-        if let Some(id) = mask_header_tabs(&self.id) {
-            self.id = id.into();
-        }
-        if let Some(id) = mask_header_utf8(&self.id) {
-            self.id = id.into();
-        }
-        self
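+// The (sequence, quality) tuple impls below make a bare pair of byte slices
+// usable as a `QualitySequence`. A minimal sketch (this mirrors the
+// `test_quality_mask` test further down: bases whose quality byte falls below
+// the cutoff are masked to `N`):
+//
+//     let masked = (&b"AGCT"[..], &b"AAA0"[..]).quality_mask(b'5');
+//     assert_eq!(&masked[..], &b"AGCN"[..]);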
+impl<'a> Sequence<'a> for (&'a [u8], &'a [u8]) {
+    fn sequence(&'a self) -> &'a [u8] {
+        &self.0
     }
+}
 
-    /// Return an iterator the returns valid kmers
-    pub fn kmers<'b, 'c>(&'b mut self, k: u8, canonical: bool) -> NuclKmer<'c>
-    where
-        'b: 'c,
-    {
-        if canonical {
-            self.rev_seq = Some(self.seq.iter().rev().map(|n| complement(*n)).collect());
-        }
-        match self.rev_seq {
-            Some(ref rev_seq) => NuclKmer::new(&self.seq, Some(&rev_seq), k),
-            None => NuclKmer::new(&self.seq, None, k),
-        }
+impl<'a> QualitySequence<'a> for (&'a [u8], &'a [u8]) {
+    fn quality(&'a self) -> &'a [u8] {
+        &self.1
     }
+}
 
-    /// Return an iterator the returns valid kmers in 4-bit form
-    pub fn bit_kmers(&self, k: u8, canonical: bool) -> BitNuclKmer {
-        BitNuclKmer::new(&self.seq, k, canonical)
-    }
+static EMPTY_VEC: &[u8] = b"";
 
-    /// Construct an owned version of `self` to, e.g. pass across threads
-    /// (it's not clear why this can't be the `impl for Clone`, but the
-    /// 'static lifetime doesn't work there for some reason)
-    pub fn into_owned(self) -> Sequence<'static> {
-        Sequence {
-            id: Cow::Owned(self.id.clone().into_owned()),
-            seq: Cow::Owned(self.seq.clone().into_owned()),
-            qual: self.qual.clone().map(Cow::into_owned).map(Cow::Owned),
-            rev_seq: self.rev_seq.clone(),
+impl<'a> QualitySequence<'a> for SequenceRecord<'a> {
+    fn quality(&'a self) -> &'a [u8] {
+        if let Some(q) = self.qual.as_ref() {
+            q.as_ref()
+        } else {
+            &EMPTY_VEC
         }
+        // fake high quality scores? vec![b'I'; self.sequence().len()]
     }
 }
 
+//
+// /// Fixes up potential problems with sequence headers including tabs being
+// /// present (may break downstream analyses with headers in TSVs) and with
+// /// non-UTF8 characters being present, e.g. non-breaking spaces on Windows
+// /// encodings (0x0A) breaks some tools.
+// pub fn mask_header(mut self) -> Self {
+//     if let Some(id) = mask_header_tabs(&self.id) {
+//         self.id = id.into();
+//     }
+//     if let Some(id) = mask_header_utf8(&self.id) {
+//         self.id = id.into();
+//     }
+//     self
+// }
+
 #[test]
 fn test_quality_mask() {
-    let seq_rec = Sequence {
-        id: b""[..].into(),
-        // seq: Cow::Borrowed(&b"AGCT"[..]),
-        seq: b"AGCT"[..].into(),
-        qual: Some(b"AAA0"[..].into()),
-        rev_seq: None,
-    };
+    let seq_rec = (&b"AGCT"[..], &b"AAA0"[..]);
     let filtered_rec = seq_rec.quality_mask(b'5');
-    assert_eq!(&filtered_rec.seq[..], &b"AGCN"[..]);
+    assert_eq!(&filtered_rec[..], &b"AGCN"[..]);
 }
 
 #[test]
 fn can_kmerize() {
     // test general function
-    for (i, (_, k, _)) in Sequence::from_bytes(b"AGCT").kmers(1, false).enumerate() {
+    for (i, k) in b"AGCT".kmers(1).enumerate() {
         match i {
             0 => assert_eq!(k, &b"A"[..]),
             1 => assert_eq!(k, &b"G"[..]),
@@ -244,28 +307,19 @@ fn can_kmerize() {
         }
     }
 
-    // test that we skip over N's
-    for (i, (_, k, _)) in Sequence::from_bytes(b"ACNGT").kmers(2, false).enumerate() {
+    // test that we handle length 2 (and don't drop Ns)
+    for (i, k) in b"ACNGT".kmers(2).enumerate() {
         match i {
             0 => assert_eq!(k, &b"AC"[..]),
-            1 => assert_eq!(k, &b"GT"[..]),
-            _ => unreachable!("Too many kmers"),
-        }
-    }
-
-    // test that we skip over N's and handle short kmers
-    for (i, (ix, k, _)) in Sequence::from_bytes(b"ACNG").kmers(2, false).enumerate() {
-        match i {
-            0 => {
-                assert_eq!(ix, 0);
-                assert_eq!(k, &b"AC"[..]);
-            }
+            1 => assert_eq!(k, &b"CN"[..]),
+            2 => assert_eq!(k, &b"NG"[..]),
+            3 => assert_eq!(k, &b"GT"[..]),
             _ => unreachable!("Too many kmers"),
         }
     }
 
     // test that the minimum length works
-    for (_, k, _) in Sequence::from_bytes(b"AC").kmers(2, false) {
+    for k in b"AC".kmers(2) {
         assert_eq!(k, &b"AC"[..]);
     }
 }
@@ -273,7 +327,11 @@ fn can_kmerize() {
 #[test]
 fn can_canonicalize() {
     // test general function
-    for (i, (_, k, is_c)) in Sequence::from_bytes(b"AGCT").kmers(1, true).enumerate() {
+    let seq = b"AGCT";
+    for (i, (_, k, is_c)) in seq
+        .canonical_kmers(1, &seq.reverse_complement())
+        .enumerate()
+    {
         match i {
             0 => {
                 assert_eq!(k, &b"A"[..]);
@@ -295,7 +353,11 @@ fn can_canonicalize() {
         }
     }
 
-    for (i, (_, k, _)) in Sequence::from_bytes(b"AGCTA").kmers(2, true).enumerate() {
+    let seq = b"AGCTA";
+    for (i, (_, k, _)) in seq
+        .canonical_kmers(2, &seq.reverse_complement())
+        .enumerate()
+    {
         match i {
             0 => assert_eq!(k, &b"AG"[..]),
             1 => assert_eq!(k, &b"GC"[..]),
@@ -305,7 +367,11 @@ fn can_canonicalize() {
         }
     }
 
-    for (i, (ix, k, _)) in Sequence::from_bytes(b"AGNTA").kmers(2, true).enumerate() {
+    let seq = b"AGNTA";
+    for (i, (ix, k, _)) in seq
+        .canonical_kmers(2, &seq.reverse_complement())
+        .enumerate()
+    {
         match i {
             0 => {
                 assert_eq!(ix, 0);
diff --git a/src/util.rs b/src/util.rs
index 2ce026f..10bab8f 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,10 +1,9 @@
-use std::borrow::Cow;
 use std::error;
 use std::fmt;
 use std::io;
 use std::str;
 
-use memchr::{memchr2, memchr_iter};
+use memchr::memchr_iter;
 
 #[derive(Clone, Debug, PartialEq)]
 pub enum ParseErrorType {
@@ -87,27 +86,6 @@ impl From<io::Error> for ParseError {
     }
 }
 
-/// remove newlines from within FASTX records; currently the rate limiting step
-/// in FASTX parsing (in general; readfq also exhibits this behavior)
-#[inline]
-pub fn strip_whitespace(seq: &[u8]) -> Cow<[u8]> {
-    let mut new_buf = Vec::with_capacity(seq.len());
-    let mut i = 0;
-    while i < seq.len() {
-        match memchr2(b'\r', b'\n', &seq[i..]) {
-            None => {
-                new_buf.extend_from_slice(&seq[i..]);
-                break;
-            }
-            Some(match_pos) => {
-                new_buf.extend_from_slice(&seq[i..i + match_pos]);
-                i += match_pos + 1;
-            }
-        }
-    }
-    Cow::Owned(new_buf)
-}
-
 /// Like memchr, but handles a two-byte sequence (unlike memchr::memchr2, this
 /// looks for the bytes in sequence not either/or).
 ///