diff --git a/src/bitkmer.rs b/src/bitkmer.rs index b289a23..9ef88cd 100644 --- a/src/bitkmer.rs +++ b/src/bitkmer.rs @@ -3,19 +3,19 @@ pub type BitKmer = (BitKmerSeq, u8); /// Takes a BitKmer and adds a new base on the end, optionally loping off the /// first base if the resulting kmer is too long. -fn extend_kmer(kmer: &mut BitKmer, new_char: &u8) -> bool { +fn extend_kmer(kmer: &mut BitKmer, new_char: u8) -> bool { let new_char_int; match new_char { - &b'A' | &b'a' => new_char_int = 0 as BitKmerSeq, - &b'C' | &b'c' => new_char_int = 1 as BitKmerSeq, - &b'G' | &b'g' => new_char_int = 2 as BitKmerSeq, - &b'T' | &b't' => new_char_int = 3 as BitKmerSeq, + b'A' | b'a' => new_char_int = 0 as BitKmerSeq, + b'C' | b'c' => new_char_int = 1 as BitKmerSeq, + b'G' | b'g' => new_char_int = 2 as BitKmerSeq, + b'T' | b't' => new_char_int = 3 as BitKmerSeq, _ => return false, }; let new_kmer = (kmer.0 << 2) + new_char_int; // mask out any overflowed bits - kmer.0 = new_kmer & (BitKmerSeq::pow(2, (2 * kmer.1) as u32) - 1) as BitKmerSeq; + kmer.0 = new_kmer & (BitKmerSeq::pow(2, u32::from(2 * kmer.1)) - 1) as BitKmerSeq; true } @@ -30,16 +30,15 @@ fn update_position( return false; } - let mut kmer_len = (kmer.1 - 1) as usize; - let mut stop_len = kmer.1 as usize; - if initial { - kmer_len = 0; - stop_len = (kmer.1 - 1) as usize; - } + let (mut kmer_len, stop_len) = if initial { + (0, (kmer.1 - 1) as usize) + } else { + ((kmer.1 - 1) as usize, kmer.1 as usize) + }; let mut cur_kmer = kmer; while kmer_len < stop_len { - if extend_kmer(&mut cur_kmer, &buffer[*start_pos + kmer_len]) { + if extend_kmer(&mut cur_kmer, buffer[*start_pos + kmer_len]) { kmer_len += 1; } else { kmer_len = 0; @@ -67,10 +66,10 @@ impl<'a> BitNuclKmer<'a> { update_position(&mut start_pos, &mut kmer, slice, true); BitNuclKmer { - start_pos: start_pos, + start_pos, cur_kmer: kmer, buffer: slice, - canonical: canonical, + canonical, } } } @@ -159,15 +158,15 @@ pub fn reverse_complement(kmer: BitKmer) -> BitKmer { // inspired from https://www.biostars.org/p/113640/ let mut new_kmer = kmer.0; // reverse it - new_kmer = (new_kmer >> 2 & 0x3333333333333333) | (new_kmer & 0x3333333333333333) << 2; - new_kmer = (new_kmer >> 4 & 0x0F0F0F0F0F0F0F0F) | (new_kmer & 0x0F0F0F0F0F0F0F0F) << 4; - new_kmer = (new_kmer >> 8 & 0x00FF00FF00FF00FF) | (new_kmer & 0x00FF00FF00FF00FF) << 8; - new_kmer = (new_kmer >> 16 & 0x0000FFFF0000FFFF) | (new_kmer & 0x0000FFFF0000FFFF) << 16; - new_kmer = (new_kmer >> 32 & 0x00000000FFFFFFFF) | (new_kmer & 0x00000000FFFFFFFF) << 32; + new_kmer = (new_kmer >> 2 & 0x3333_3333_3333_3333) | (new_kmer & 0x3333_3333_3333_3333) << 2; + new_kmer = (new_kmer >> 4 & 0x0F0F_0F0F_0F0F_0F0F) | (new_kmer & 0x0F0F_0F0F_0F0F_0F0F) << 4; + new_kmer = (new_kmer >> 8 & 0x00FF_00FF_00FF_00FF) | (new_kmer & 0x00FF_00FF_00FF_00FF) << 8; + new_kmer = (new_kmer >> 16 & 0x0000_FFFF_0000_FFFF) | (new_kmer & 0x0000_FFFF_0000_FFFF) << 16; + new_kmer = (new_kmer >> 32 & 0x0000_0000_FFFF_FFFF) | (new_kmer & 0x0000_0000_FFFF_FFFF) << 32; // complement it - new_kmer ^= 0xFFFFFFFFFFFFFFFF; + new_kmer ^= 0xFFFF_FFFF_FFFF_FFFF; // shift it to the right size - new_kmer = new_kmer >> (2 * (32 - kmer.1)); + new_kmer >>= 2 * (32 - kmer.1); (new_kmer, kmer.1) } @@ -194,8 +193,8 @@ pub fn canonical(kmer: BitKmer) -> (BitKmer, bool) { pub fn minimizer(kmer: BitKmer, minmer_size: u8) -> BitKmer { let mut new_kmer = kmer.0; let mut lowest = !(0 as BitKmerSeq); - let bitmask = (BitKmerSeq::pow(2, (2 * minmer_size) as u32) - 1) as BitKmerSeq; - for _ in 0..(kmer.1 - minmer_size + 1) { + let bitmask = (BitKmerSeq::pow(2, u32::from(2 * minmer_size)) - 1) as BitKmerSeq; + for _ in 0..=(kmer.1 - minmer_size) { let cur = bitmask & new_kmer; if cur < lowest { lowest = cur; @@ -225,7 +224,7 @@ pub fn bitmer_to_bytes(kmer: BitKmer) -> Vec { // of the working buffer as we read them off "left to right") let offset = (kmer.1 - 1) * 2; let bitmask = - BitKmerSeq::pow(2, (2 * kmer.1 - 1) as u32) + BitKmerSeq::pow(2, (2 * kmer.1 - 2) as u32); + BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 1)) + BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 2)); for _ in 0..kmer.1 { let new_char = (new_kmer & bitmask) >> offset; @@ -253,7 +252,7 @@ pub fn bytes_to_bitmer(kmer: &[u8]) -> BitKmer { let mut bit_kmer = (0u64, k); for i in 0..k { - extend_kmer(&mut bit_kmer, &kmer[i as usize]); + extend_kmer(&mut bit_kmer, kmer[i as usize]); } bit_kmer } diff --git a/src/buffer.rs b/src/buffer.rs index 73a7c5a..2f6a02d 100644 --- a/src/buffer.rs +++ b/src/buffer.rs @@ -35,9 +35,9 @@ impl<'a> RecReader<'a> { } Ok(RecReader { - file: file, + file, last: false, - buf: buf, + buf, }) } @@ -63,7 +63,7 @@ impl<'a> RecReader<'a> { Ok(false) } - pub fn get_buffer<'b, T>(&'b self, record_count: usize) -> RecBuffer<'b, T> { + pub fn get_buffer(&self, record_count: usize) -> RecBuffer { RecBuffer { buf: &self.buf, pos: 0, @@ -104,11 +104,6 @@ fn test_from_bytes() { assert_eq!(rb.buf, b"test"); } -pub trait FindRecord { - fn move_to_next(&mut self); - fn is_finished(&self) -> bool; -} - // pub fn parse(reader: &'s mut io::Read, header: &[u8], ref mut callback: F) -> Result<(), E> where // E: From, // F: FnMut(T) -> Result<(), E>, diff --git a/src/fastx.rs b/src/fastx.rs index 1263461..a790991 100644 --- a/src/fastx.rs +++ b/src/fastx.rs @@ -21,7 +21,7 @@ use std::str; use memchr::memchr; -use crate::buffer::{FindRecord, RecBuffer, RecReader}; +use crate::buffer::{RecBuffer, RecReader}; use crate::seq::SeqRecord; use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; @@ -44,6 +44,7 @@ struct FASTA<'a> { struct FASTQ<'a> { id: &'a str, seq: &'a [u8], + id2: &'a [u8], qual: &'a [u8], } @@ -52,7 +53,7 @@ impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { fn next(&mut self) -> Option { let buf = &self.buf[self.pos..]; - if buf.len() == 0 { + if buf.is_empty() { return None; } @@ -62,7 +63,7 @@ impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { None => return None, }; let mut raw_id = &buf[1..id_end - 1]; - if raw_id.len() > 0 && raw_id[raw_id.len() - 1] == b'\r' { + if !raw_id.is_empty() && raw_id[raw_id.len() - 1] == b'\r' { raw_id = &raw_id[..raw_id.len() - 1]; } let id; @@ -92,7 +93,7 @@ impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { self.pos += seq_end; self.count += 1; - Some(Ok(FASTA { id: id, seq: seq })) + Some(Ok(FASTA { id, seq })) } } @@ -138,6 +139,7 @@ impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { Some(i) => id2_end = seq_end + i + 1, None => return None, }; + let id2 = &buf[seq_end..id2_end - 1]; // we know the qual scores must be the same length as the sequence // so we can just do some arithmatic instead of memchr'ing @@ -150,7 +152,7 @@ impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { } // now do some math to figure out if the file doesn't end with a newline let windows_ending = if seq.last() == Some(&b'\r') { 1 } else { 0 }; - if !(qual_end == buf.len() + 1 + windows_ending) { + if qual_end != buf.len() + 1 + windows_ending { return None; } buffer_used -= 1 + windows_ending; @@ -159,12 +161,12 @@ impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { let mut qual = &buf[id2_end..qual_end - 1]; // clean up any extra '\r' from the id and seq - if raw_id.len() > 0 && raw_id[raw_id.len() - 1] == b'\r' { + if !raw_id.is_empty() && raw_id[raw_id.len() - 1] == b'\r' { raw_id = &raw_id[..raw_id.len() - 1]; seq = &seq[..seq.len() - 1]; } // we do qual separately in case this is the end of the file - if qual.len() > 0 && qual[qual.len() - 1] == b'\r' { + if !qual.is_empty() && qual[qual.len() - 1] == b'\r' { qual = &qual[..qual.len() - 1]; } @@ -180,11 +182,7 @@ impl<'a> Iterator for RecBuffer<'a, FASTQ<'a>> { } self.pos += buffer_used; self.count += 1; - Some(Ok(FASTQ { - id: id, - seq: seq, - qual: qual, - })) + Some(Ok(FASTQ { id, seq, id2, qual })) } } @@ -203,29 +201,9 @@ fn is_finished(rb: &RecBuffer) -> bool { true } -impl<'a> FindRecord for RecBuffer<'a, FASTA<'a>> { - fn move_to_next(&mut self) { - unimplemented!(""); - } - - fn is_finished(&self) -> bool { - is_finished(&self) - } -} - -impl<'a> FindRecord for RecBuffer<'a, FASTQ<'a>> { - fn move_to_next(&mut self) { - unimplemented!(""); - } - - fn is_finished(&self) -> bool { - is_finished(&self) - } -} - impl<'a> From> for SeqRecord<'a> { fn from(fasta: FASTA<'a>) -> SeqRecord<'a> { - SeqRecord::new(fasta.id, Cow::from(strip_whitespace(fasta.seq)), None) + SeqRecord::new(fasta.id, strip_whitespace(fasta.seq), None) } } @@ -239,7 +217,7 @@ impl<'a> From> for SeqRecord<'a> { fn fastx_reader( reader: &mut R, first_byte: Option, - ref mut callback: F, + mut callback: F, type_callback: Option<&mut T>, ) -> Result<(), ParseError> where @@ -251,58 +229,50 @@ where match first_byte { Some(b) => first[0] = b, None => { - reader.read(&mut first)?; + reader.read_exact(&mut first)?; }, } + if let Some(f) = type_callback { + match first[0] { + b'>' => f("FASTA"), + b'@' => f("FASTQ"), + _ => (), + } + } let mut rec_reader = RecReader::new(reader, 10_000_000, &first)?; let mut record_count = 0; - match first[0] { - b'>' => { - if let Some(f) = type_callback { - f("FASTA"); - } - loop { - let used = { - let mut rec_buffer = rec_reader.get_buffer::(record_count); - for s in rec_buffer.by_ref() { - callback(SeqRecord::from(s?)); - } - record_count += rec_buffer.count; - rec_buffer.pos - }; - if rec_reader.refill(used)? { - break; + loop { + let used = match first[0] { + b'>' => { + let mut rec_buffer = rec_reader.get_buffer::(record_count); + for s in rec_buffer.by_ref() { + callback(SeqRecord::from(s?)); } - } - }, - b'@' => { - if let Some(f) = type_callback { - f("FASTQ"); - } - loop { - let used = { - let mut rec_buffer = rec_reader.get_buffer::(record_count); - for s in rec_buffer.by_ref() { - callback(SeqRecord::from(s?)); - } - record_count += rec_buffer.count; - rec_buffer.pos - }; - if rec_reader.refill(used)? { - break; + record_count += rec_buffer.count; + rec_buffer.pos + }, + b'@' => { + let mut rec_buffer = rec_reader.get_buffer::(record_count); + for s in rec_buffer.by_ref() { + callback(SeqRecord::from(s?)); } - } - }, - _ => { - return Err(ParseError::new( - "Bad starting byte", - ParseErrorType::InvalidHeader, - )) - }, - }; + record_count += rec_buffer.count; + rec_buffer.pos + }, + _ => { + return Err(ParseError::new( + "Bad starting byte", + ParseErrorType::InvalidHeader, + )) + }, + }; + if rec_reader.refill(used)? { + break; + } + } // check if there's anything left stuff in the buffer (besides returns) let rec_buffer = rec_reader.get_buffer::(record_count); - if !rec_buffer.is_finished() { + if !is_finished(&rec_buffer) { return Err(ParseError::new( "File ended abruptly", ParseErrorType::PrematureEOF, @@ -312,7 +282,7 @@ where } /// Parse a array of bytes into FASTX records and calls `callback` on each. -pub fn fastx_bytes<'b, F>(bytes: &'b [u8], ref mut callback: F) -> Result<(), ParseError> +pub fn fastx_bytes<'b, F>(bytes: &'b [u8], callback: F) -> Result<(), ParseError> where F: for<'a> FnMut(SeqRecord<'a>) -> (), { @@ -328,8 +298,8 @@ where #[cfg(feature = "compression")] pub fn fastx_stream( mut reader: R, - ref mut type_callback: T, - ref mut callback: F, + mut type_callback: T, + callback: F, ) -> Result<(), ParseError> where F: for<'a> FnMut(SeqRecord<'a>) -> (), @@ -340,10 +310,10 @@ where //! that gets called as soon as we determine if the records are FASTA or FASTQ. //! If a file starts with a gzip or other header, transparently decompress it. let mut first = vec![0]; - reader.read(&mut first)?; + reader.read_exact(&mut first)?; if first[0] == 0x1F { // gz files - reader.read(&mut first)?; + reader.read_exact(&mut first)?; if first[0] != 0x8B { return Err(ParseError::new( "Bad gz header", @@ -352,10 +322,10 @@ where } let _ = reader.seek(SeekFrom::Start(0)); let mut gz_reader = MultiGzDecoder::new(reader); - fastx_reader(&mut gz_reader, None, callback, Some(type_callback)) + fastx_reader(&mut gz_reader, None, callback, Some(&mut type_callback)) } else if first[0] == 0x42 { // bz files - reader.read(&mut first)?; + reader.read_exact(&mut first)?; if first[0] != 0x5A { return Err(ParseError::new( "Bad bz header", @@ -364,10 +334,10 @@ where } let _ = reader.seek(SeekFrom::Start(0)); let mut bz_reader = BzDecoder::new(reader); - fastx_reader(&mut bz_reader, None, callback, Some(type_callback)) + fastx_reader(&mut bz_reader, None, callback, Some(&mut type_callback)) } else if first[0] == 0xFD { // xz files - reader.read(&mut first)?; + reader.read_exact(&mut first)?; if first[0] != 0x37 { return Err(ParseError::new( "Bad xz header", @@ -376,10 +346,10 @@ where } let _ = reader.seek(SeekFrom::Start(0)); let mut xz_reader = XzDecoder::new(reader); - fastx_reader(&mut xz_reader, None, callback, Some(type_callback)) + fastx_reader(&mut xz_reader, None, callback, Some(&mut type_callback)) } else if first[0] == 0x50 { // zip files - reader.read(&mut first)?; + reader.read_exact(&mut first)?; if first[0] != 0x4b { return Err(ParseError::new( "Bad zip header", @@ -397,17 +367,17 @@ where )); } let mut zip_reader = zip_archive.by_index(0)?; - fastx_reader(&mut zip_reader, None, callback, Some(type_callback)) + fastx_reader(&mut zip_reader, None, callback, Some(&mut type_callback)) } else { - fastx_reader(&mut reader, Some(first[0]), callback, Some(type_callback)) + fastx_reader(&mut reader, Some(first[0]), callback, Some(&mut type_callback)) } } #[cfg(feature = "compression")] pub fn fastx_cli( filename: &str, - ref mut type_callback: T, - ref mut callback: F, + mut type_callback: T, + callback: F, ) -> Result<(), ParseError> where F: for<'a> FnMut(SeqRecord<'a>) -> (), @@ -419,7 +389,7 @@ where if filename == "-" { let sin = stdin(); let mut lock = sin.lock(); - return fastx_reader(&mut lock, None, callback, Some(type_callback)); + return fastx_reader(&mut lock, None, callback, Some(&mut type_callback)); } let f = File::open(&Path::new(filename))?; @@ -791,7 +761,7 @@ fn test_fastq_across_buffer() { let used = { let mut rec_buffer = rec_reader.get_buffer::(0); - for s in rec_buffer.by_ref() { + for _s in rec_buffer.by_ref() { // record is incomplete panic!("No initial record should be parsed") } diff --git a/src/kmer.rs b/src/kmer.rs index 8a3beff..e299272 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -5,123 +5,57 @@ use std::borrow::Cow; #[inline] -pub fn complement(n: &u8) -> u8 { +pub fn complement(n: u8) -> u8 { //! Returns the complementary base for a given IUPAC base code. //! //! Does not work for RNA sequences (maybe we should raise an error or something?) - match *n as char { - 'a' => 't' as u8, - 'A' => 'T' as u8, - 'c' => 'g' as u8, - 'C' => 'G' as u8, - 'g' => 'c' as u8, - 'G' => 'C' as u8, - 't' => 'a' as u8, - 'T' => 'A' as u8, + match n { + b'a' => b't', + b'A' => b'T', + b'c' => b'g', + b'C' => b'G', + b'g' => b'c', + b'G' => b'C', + b't' => b'a', + b'T' => b'A', // IUPAC codes - 'r' => 'y' as u8, - 'y' => 'r' as u8, - 'k' => 'm' as u8, - 'm' => 'k' as u8, - 'b' => 'v' as u8, - 'v' => 'b' as u8, - 'd' => 'h' as u8, - 'h' => 'd' as u8, - 's' => 's' as u8, - 'w' => 'w' as u8, - 'R' => 'Y' as u8, - 'Y' => 'R' as u8, - 'K' => 'M' as u8, - 'M' => 'K' as u8, - 'B' => 'V' as u8, - 'V' => 'B' as u8, - 'D' => 'H' as u8, - 'H' => 'D' as u8, - 'S' => 'S' as u8, - 'W' => 'W' as u8, + b'r' => b'y', + b'y' => b'r', + b'k' => b'm', + b'm' => b'k', + b'b' => b'v', + b'v' => b'b', + b'd' => b'h', + b'h' => b'd', + b's' => b's', + b'w' => b'w', + b'R' => b'Y', + b'Y' => b'R', + b'K' => b'M', + b'M' => b'K', + b'B' => b'V', + b'V' => b'B', + b'D' => b'H', + b'H' => b'D', + b'S' => b'S', + b'W' => b'W', // anything else just pass through // 'u' | 'U' => panic!("Does not support complements of U"), - x => x as u8, + x => x, } } #[test] fn test_complement() { - assert_eq!(complement(&b'a'), b't'); - assert_eq!(complement(&b'c'), b'g'); - assert_eq!(complement(&b'g'), b'c'); - assert_eq!(complement(&b'n'), b'n'); + assert_eq!(complement(b'a'), b't'); + assert_eq!(complement(b'c'), b'g'); + assert_eq!(complement(b'g'), b'c'); + assert_eq!(complement(b'n'), b'n'); } -pub fn normalize<'a>(seq: &'a [u8], iupac: bool) -> Vec { - //! Transform a FASTX sequence into it's "normalized" form. - //! - //! The normalized form is: - //! - only AGCTN and possibly . (for gaps) - //! - lowercase versions of these are uppercased - //! - U is converted to T (make everything a DNA sequence) - //! - some other punctuation is converted to gaps - //! - IUPAC bases may be converted to N's depending on the parameter passed in - //! - everything else is considered a N - let mut buf: Vec = Vec::with_capacity(seq.len()); - - for n in seq.iter() { - buf.push(match (*n as char, iupac) { - c @ ('A', _) - | c @ ('C', _) - | c @ ('G', _) - | c @ ('T', _) - | c @ ('N', _) - | c @ ('.', _) => c.0 as u8, - ('a', _) => 'A' as u8, - ('c', _) => 'C' as u8, - ('g', _) => 'G' as u8, - // normalize uridine to thymine - ('t', _) | ('u', _) | ('U', _) => 'T' as u8, - ('-', _) | ('~', _) | (' ', _) => '.' as u8, - // logic for IUPAC bases (a little messy) - c @ ('B', true) - | c @ ('D', true) - | c @ ('H', true) - | c @ ('V', true) - | c @ ('R', true) - | c @ ('Y', true) - | c @ ('S', true) - | c @ ('W', true) - | c @ ('K', true) - | c @ ('M', true) => c.0 as u8, - ('b', true) => 'B' as u8, - ('d', true) => 'D' as u8, - ('h', true) => 'H' as u8, - ('v', true) => 'V' as u8, - ('r', true) => 'R' as u8, - ('y', true) => 'Y' as u8, - ('s', true) => 'S' as u8, - ('w', true) => 'W' as u8, - ('k', true) => 'K' as u8, - ('m', true) => 'M' as u8, - _ => 'N' as u8, - }); - } - buf -} - -#[test] -fn test_normalize() { - assert_eq!(normalize(b"ACGTU", false), b"ACGTT"); - assert_eq!(normalize(b"acgtu", false), b"ACGTT"); - - assert_eq!(normalize(b"N.N-N~N N", false), b"N.N.N.N.N"); - - assert_eq!(normalize(b"BDHVRYSWKM", true), b"BDHVRYSWKM"); - assert_eq!(normalize(b"bdhvryswkm", true), b"BDHVRYSWKM"); - assert_eq!(normalize(b"BDHVRYSWKM", false), b"NNNNNNNNNN"); - assert_eq!(normalize(b"bdhvryswkm", false), b"NNNNNNNNNN"); -} - -pub fn canonical<'a>(seq: &'a [u8]) -> Cow<'a, [u8]> { +pub fn canonical(seq: &[u8]) -> Cow<[u8]> { //! Taking in a sequence string, return the canonical form of the sequence //! (e.g. the lexigraphically lowest of either the original sequence or its //! reverse complement) @@ -131,7 +65,7 @@ pub fn canonical<'a>(seq: &'a [u8]) -> Cow<'a, [u8]> { let mut original_was_canonical = false; // loop through the kmer and its reverse complement simultaneously - for (rn, n) in seq.iter().rev().map(|n| complement(n)).zip(seq.iter()) { + for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) { buf.push(rn); if !enough && n < &rn { original_was_canonical = true; @@ -143,10 +77,10 @@ pub fn canonical<'a>(seq: &'a [u8]) -> Cow<'a, [u8]> { } match (original_was_canonical, enough) { (true, true) => panic!("Bug: should never set original_was_canonical if enough == true"), - (true, false) => Cow::Borrowed(seq), - (false, true) => Cow::Owned(buf), + (true, false) => seq.into(), + (false, true) => buf.into(), // the sequences were completely equal, return the ref - (false, false) => Cow::Borrowed(seq), + (false, false) => seq.into(), } } @@ -162,16 +96,16 @@ fn can_canonicalize() { /// Find the lexigraphically smallest substring of `seq` of length `length` /// /// There's probably a faster algorithm for this somewhere... -pub fn minimizer<'a>(seq: &'a [u8], length: usize) -> Cow<'a, [u8]> { - let reverse_complement: Vec = seq.iter().rev().map(|n| complement(n)).collect(); +pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> { + let reverse_complement: Vec = seq.iter().rev().map(|n| complement(*n)).collect(); let mut minmer = Cow::Borrowed(&seq[..length]); for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) { if kmer < &minmer[..] { - minmer = Cow::Borrowed(kmer); + minmer = kmer.into(); } if rc_kmer < &minmer[..] { - minmer = Cow::Owned(rc_kmer.to_vec()); + minmer = rc_kmer.to_vec().into(); } } minmer @@ -183,11 +117,6 @@ fn can_minimize() { assert_eq!(&minmer[..], b"AAA"); } -// TODO -// pub fn skip_n<'a, T>(iter: T) -> T where T: Iterator { -// iter.filter(|kmer| kmer.contains(&('N' as u8)) || kmer.contains(&('n' as u8))) -// } - pub fn is_good_base(chr: u8) -> bool { match chr as char { 'a' | 'c' | 'g' | 't' | 'A' | 'C' | 'G' | 'T' => true, @@ -195,14 +124,75 @@ pub fn is_good_base(chr: u8) -> bool { } } -pub fn has_no_n<'a>(seq: &'a [u8]) -> bool { - //! Determines if a sequence has any non-primary four bases - //! characters in it - seq.iter().all(|n| is_good_base(*n)) +pub struct NuclKmer<'a> { + k: u8, + start_pos: usize, + buffer: &'a [u8], + rc_buffer: Option<&'a [u8]>, } -#[test] -fn can_detect_no_n() { - assert!(has_no_n(b"AAGT")); - assert!(!has_no_n(b"NAGT")); +fn update_position(start_pos: &mut usize, k: u8, buffer: &[u8], initial: bool) -> bool { + // check if we have enough "physical" space for one more kmer + if *start_pos + k as usize > buffer.len() { + return false; + } + + let (mut kmer_len, stop_len) = if initial { + (0, (k - 1) as usize) + } else { + ((k - 1) as usize, k as usize) + }; + + while kmer_len < stop_len { + if is_good_base(buffer[*start_pos + kmer_len]) { + kmer_len += 1; + } else { + kmer_len = 0; + *start_pos += kmer_len + 1; + if *start_pos + k as usize > buffer.len() { + return false; + } + } + } + true +} + +impl<'a> NuclKmer<'a> { + //! A kmer-izer for a nucleotide/amino acid sequence; returning slices to the original data + pub fn new(buffer: &'a [u8], rc_buffer: Option<&'a [u8]>, k: u8) -> NuclKmer<'a> { + let mut start_pos = 0; + update_position(&mut start_pos, k, buffer, true); + NuclKmer { + k, + start_pos, + buffer, + rc_buffer, + } + } +} + +impl<'a> Iterator for NuclKmer<'a> { + type Item = (usize, &'a [u8], bool); + + fn next(&mut self) -> Option<(usize, &'a [u8], bool)> { + if !update_position(&mut self.start_pos, self.k, self.buffer, false) { + return None; + } + let pos = self.start_pos; + self.start_pos += 1; + + let result = &self.buffer[pos..pos + self.k as usize]; + match self.rc_buffer { + None => Some((pos, result, false)), + Some(rc_buffer) => { + let rc_result = + &rc_buffer[rc_buffer.len() - pos - self.k as usize..rc_buffer.len() - pos]; + if result < rc_result { + Some((pos, result, false)) + } else { + Some((pos, rc_result, true)) + } + }, + } + } } diff --git a/src/seq.rs b/src/seq.rs index 5d784a2..2d47c94 100644 --- a/src/seq.rs +++ b/src/seq.rs @@ -1,11 +1,98 @@ use std::borrow::Cow; +use memchr::memchr; + use crate::bitkmer::BitNuclKmer; -use crate::kmer::{complement, is_good_base, normalize}; +use crate::kmer::{complement, NuclKmer}; + +pub fn normalize(seq: &[u8], iupac: bool) -> (Vec, bool) { + //! Transform a FASTX sequence into it's "normalized" form. + //! + //! The normalized form is: + //! - only AGCTN and possibly . (for gaps) + //! - lowercase versions of these are uppercased + //! - U is converted to T (make everything a DNA sequence) + //! - some other punctuation is converted to gaps + //! - IUPAC bases may be converted to N's depending on the parameter passed in + //! - everything else is considered a N + let mut buf: Vec = Vec::with_capacity(seq.len()); + let mut changed: bool = false; + + for n in seq.iter() { + let (new_char, char_changed) = match (*n, iupac) { + c @ (b'A', _) + | c @ (b'C', _) + | c @ (b'G', _) + | c @ (b'T', _) + | c @ (b'N', _) + | c @ (b'.', _) => (c.0, false), + (b'a', _) => (b'A', true), + (b'c', _) => (b'C', true), + (b'g', _) => (b'G', true), + // normalize uridine to thymine + (b't', _) | (b'u', _) | (b'U', _) => (b'T', true), + // normalize gaps + (b'-', _) | (b'~', _) | (b' ', _) => (b'.', true), + // logic for IUPAC bases (a little messy) + c @ (b'B', true) + | c @ (b'D', true) + | c @ (b'H', true) + | c @ (b'V', true) + | c @ (b'R', true) + | c @ (b'Y', true) + | c @ (b'S', true) + | c @ (b'W', true) + | c @ (b'K', true) + | c @ (b'M', true) => (c.0, false), + (b'b', true) => (b'B', true), + (b'd', true) => (b'D', true), + (b'h', true) => (b'H', true), + (b'v', true) => (b'V', true), + (b'r', true) => (b'R', true), + (b'y', true) => (b'Y', true), + (b's', true) => (b'S', true), + (b'w', true) => (b'W', true), + (b'k', true) => (b'K', true), + (b'm', true) => (b'M', true), + _ => (b'N', true), + }; + changed = changed || char_changed; + buf.push(new_char); + } + (buf, changed) +} + +#[test] +fn test_normalize() { + assert_eq!(normalize(b"ACGTU", false), (b"ACGTT".to_vec(), true)); + assert_eq!(normalize(b"acgtu", false), (b"ACGTT".to_vec(), true)); + + assert_eq!( + normalize(b"N.N-N~N N", false), + (b"N.N.N.N.N".to_vec(), true) + ); + + assert_eq!( + normalize(b"BDHVRYSWKM", true), + (b"BDHVRYSWKM".to_vec(), false) + ); + assert_eq!( + normalize(b"bdhvryswkm", true), + (b"BDHVRYSWKM".to_vec(), true) + ); + assert_eq!( + normalize(b"BDHVRYSWKM", false), + (b"NNNNNNNNNN".to_vec(), true) + ); + assert_eq!( + normalize(b"bdhvryswkm", false), + (b"NNNNNNNNNN".to_vec(), true) + ); +} /// A generic FASTX record that also abstracts over several logical operations /// that can be performed on nucleic acid sequences. -#[derive(Debug)] +#[derive(Clone, Debug)] pub struct SeqRecord<'a> { pub id: Cow<'a, str>, pub seq: Cow<'a, [u8]>, @@ -16,8 +103,8 @@ pub struct SeqRecord<'a> { impl<'a> SeqRecord<'a> { pub fn new(id: &'a str, seq: Cow<'a, [u8]>, qual: Option<&'a [u8]>) -> Self { SeqRecord { - id: Cow::Borrowed(id), - seq: seq, + id: id.into(), + seq, qual: qual.map(Cow::Borrowed), rev_seq: None, } @@ -25,8 +112,8 @@ impl<'a> SeqRecord<'a> { pub fn from_bytes(seq: &'a [u8]) -> Self { SeqRecord { - id: Cow::Borrowed(""), - seq: Cow::Borrowed(seq), + id: "".into(), + seq: seq.into(), qual: None, rev_seq: None, } @@ -36,7 +123,7 @@ impl<'a> SeqRecord<'a> { /// `N` characters. /// /// Experimental. - pub fn quality_mask(self, ref score: u8) -> Self { + pub fn quality_mask(self, score: u8) -> Self { if self.qual == None { return self; } @@ -49,30 +136,40 @@ impl<'a> SeqRecord<'a> { .zip(qual.iter()) .map( |(base, qual)| { - if qual < score { + if *qual < score { b'N' } else { - base.clone() + *base } }, ) .collect(); SeqRecord { id: self.id, - seq: seq, + seq, qual: Some(Cow::Owned(qual)), rev_seq: None, } } /// Capitalize everything and mask unknown bases to N - pub fn normalize(self, iupac: bool) -> Self { - let seq = normalize(&self.seq, iupac); - SeqRecord { - id: self.id, - seq: Cow::Owned(seq), - qual: self.qual, - rev_seq: None, + pub fn normalize(&'a mut self, iupac: bool) -> bool { + let (seq, changed) = normalize(&self.seq, iupac); + if changed { + self.seq = seq.into(); + } + changed + } + + /// Mask tabs in header lines to `|`s + /// + /// Returns `true` if the header was masked + pub fn mask_header(&mut self) -> bool { + if memchr(b'\t', self.id.as_ref().as_bytes()).is_some() { + self.id = self.id.as_ref().replace("\t", "|").into(); + true + } else { + false } } @@ -82,7 +179,7 @@ impl<'a> SeqRecord<'a> { 'b: 'c, { if canonical { - self.rev_seq = Some(self.seq.iter().rev().map(|n| complement(n)).collect()); + self.rev_seq = Some(self.seq.iter().rev().map(|n| complement(*n)).collect()); } match self.rev_seq { Some(ref rev_seq) => NuclKmer::new(&self.seq, Some(&rev_seq), k), @@ -91,7 +188,7 @@ impl<'a> SeqRecord<'a> { } /// Return an iterator the returns valid kmers in 4-bit form - pub fn bit_kmers<'b>(&'b self, k: u8, canonical: bool) -> BitNuclKmer<'b> { + pub fn bit_kmers(&self, k: u8, canonical: bool) -> BitNuclKmer { BitNuclKmer::new(&self.seq, k, canonical) } @@ -108,86 +205,13 @@ impl<'a> SeqRecord<'a> { } } -pub struct NuclKmer<'a> { - k: u8, - start_pos: usize, - buffer: &'a [u8], - rc_buffer: Option<&'a [u8]>, -} - -fn update_position(start_pos: &mut usize, k: u8, buffer: &[u8], initial: bool) -> bool { - // check if we have enough "physical" space for one more kmer - if *start_pos + k as usize > buffer.len() { - return false; - } - - let mut kmer_len = (k - 1) as usize; - let mut stop_len = k as usize; - if initial { - kmer_len = 0; - stop_len = (k - 1) as usize; - } - - while kmer_len < stop_len { - if is_good_base(buffer[*start_pos + kmer_len]) { - kmer_len += 1; - } else { - kmer_len = 0; - *start_pos += kmer_len + 1; - if *start_pos + k as usize > buffer.len() { - return false; - } - } - } - true -} - -impl<'a> NuclKmer<'a> { - //! A kmer-izer for a nucleotide/amino acid sequence; returning slices to the original data - pub fn new(buffer: &'a [u8], rc_buffer: Option<&'a [u8]>, k: u8) -> NuclKmer<'a> { - let mut start_pos = 0; - update_position(&mut start_pos, k, buffer, true); - NuclKmer { - k: k, - start_pos: start_pos, - buffer: buffer, - rc_buffer: rc_buffer, - } - } -} - -impl<'a> Iterator for NuclKmer<'a> { - type Item = (usize, &'a [u8], bool); - - fn next(&mut self) -> Option<(usize, &'a [u8], bool)> { - if !update_position(&mut self.start_pos, self.k, self.buffer, false) { - return None; - } - let pos = self.start_pos; - self.start_pos += 1; - - let result = &self.buffer[pos..pos + self.k as usize]; - match self.rc_buffer { - None => Some((pos, result, false)), - Some(rc_buffer) => { - let rc_result = - &rc_buffer[rc_buffer.len() - pos - self.k as usize..rc_buffer.len() - pos]; - if result < rc_result { - Some((pos, result, false)) - } else { - Some((pos, rc_result, true)) - } - }, - } - } -} - #[test] fn test_quality_mask() { let seq_rec = SeqRecord { - id: Cow::Borrowed(""), - seq: Cow::Borrowed(&b"AGCT"[..]), - qual: Some(Cow::Borrowed(&b"AAA0"[..])), + id: "".into(), + // seq: Cow::Borrowed(&b"AGCT"[..]), + seq: b"AGCT"[..].into(), + qual: Some(b"AAA0"[..].into()), rev_seq: None, }; let filtered_rec = seq_rec.quality_mask('5' as u8); diff --git a/src/util.rs b/src/util.rs index 80fc491..8a71320 100644 --- a/src/util.rs +++ b/src/util.rs @@ -100,7 +100,7 @@ impl From for ParseError { /// remove newlines from within FASTX records; currently the rate limiting step /// in FASTX parsing (in general; readfq also exhibits this behavior) #[inline] -pub fn strip_whitespace<'a>(seq: &'a [u8]) -> Cow<'a, [u8]> { +pub fn strip_whitespace(seq: &[u8]) -> Cow<[u8]> { let mut new_buf = Vec::with_capacity(seq.len()); let mut i = 0; while i < seq.len() {