From 0587cb3e6fb3b08ed10161895edfa42994e83f68 Mon Sep 17 00:00:00 2001 From: Roderick Bovee Date: Thu, 22 Aug 2019 11:35:20 -0700 Subject: [PATCH] Unify FASTX parsing functions & allow streaming gz/bz/etc --- Cargo.toml | 4 +- src/bitkmer.rs | 4 +- src/buffer.rs | 4 +- src/fastx.rs | 860 ++++++++++++++++---------------- src/kmer.rs | 8 +- src/lib.rs | 13 +- src/seq.rs | 20 +- src/util.rs | 12 +- {benches => tests}/benchmark.rs | 66 +-- tests/data/test.fa.zip | Bin 183 -> 0 bytes 10 files changed, 490 insertions(+), 501 deletions(-) rename {benches => tests}/benchmark.rs (69%) delete mode 100644 tests/data/test.fa.zip diff --git a/Cargo.toml b/Cargo.toml index cabd433..31d355b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,18 +12,18 @@ edition = "2018" [features] default = ["compression"] -compression = ["bzip2", "flate2", "xz2", "zip"] +compression = ["bzip2", "flate2", "xz2"] [dependencies] flate2 = { version="1.0.6", optional=true } bzip2 = { version="0.3.3", optional=true } xz2 = { version="0.1.6", optional=true } -zip = { version="0.5.0", optional=true } memchr = "2.2.0" [dev-dependencies] bencher = "0.1.5" [[bench]] +path = "tests/benchmark.rs" name = "benchmark" harness = false diff --git a/src/bitkmer.rs b/src/bitkmer.rs index 9ef88cd..00411b4 100644 --- a/src/bitkmer.rs +++ b/src/bitkmer.rs @@ -223,8 +223,8 @@ pub fn bitmer_to_bytes(kmer: BitKmer) -> Vec { // math to figure out where they start (this helps us just pop the bases on the end // of the working buffer as we read them off "left to right") let offset = (kmer.1 - 1) * 2; - let bitmask = - BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 1)) + BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 2)); + let bitmask = BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 1)) + + BitKmerSeq::pow(2, u32::from(2 * kmer.1 - 2)); for _ in 0..kmer.1 { let new_char = (new_kmer & bitmask) >> offset; diff --git a/src/buffer.rs b/src/buffer.rs index 2f6a02d..9439764 100644 --- a/src/buffer.rs +++ b/src/buffer.rs @@ -4,7 +4,7 @@ use std::marker::PhantomData; use crate::util::ParseError; pub struct RecReader<'a> { - file: &'a mut io::Read, + file: &'a mut dyn io::Read, last: bool, pub buf: Vec, } @@ -20,7 +20,7 @@ impl<'a> RecReader<'a> { /// Under some very rare circumstances (setting a `buf_size` larger than 2 Gb /// on Mac OS X) a panic can occur. Please use a smaller buffer in this case. pub fn new( - file: &'a mut io::Read, + file: &'a mut dyn io::Read, buf_size: usize, header: &[u8], ) -> Result, ParseError> { diff --git a/src/fastx.rs b/src/fastx.rs index a790991..3d88b62 100644 --- a/src/fastx.rs +++ b/src/fastx.rs @@ -14,25 +14,21 @@ use std::borrow::Cow; use std::cmp::min; -use std::fs::File; -use std::io::{stdin, Cursor, Read, Seek, SeekFrom}; -use std::path::Path; +use std::io::{Cursor, Read}; use std::str; use memchr::memchr; -use crate::buffer::{RecBuffer, RecReader}; -use crate::seq::SeqRecord; -use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; - #[cfg(feature = "compression")] use bzip2::read::BzDecoder; #[cfg(feature = "compression")] use flate2::read::MultiGzDecoder; #[cfg(feature = "compression")] use xz2::read::XzDecoder; -#[cfg(feature = "compression")] -use zip::read::ZipArchive; + +use crate::buffer::{RecBuffer, RecReader}; +use crate::seq::SeqRecord; +use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; #[derive(Debug)] struct FASTA<'a> { @@ -84,7 +80,10 @@ impl<'a> Iterator for RecBuffer<'a, FASTA<'static>> { (None, false) => return None, }; if id_end == seq_end { - return Some(Err(ParseError::PrematureEOF)) + return Some(Err(ParseError::new( + "Sequence completely empty", + ParseErrorType::PrematureEOF, + ))); } let mut seq = &buf[id_end..seq_end]; if seq[seq.len() - 1] == b'\r' { @@ -216,9 +215,8 @@ impl<'a> From> for SeqRecord<'a> { /// Internal function abstracting over byte and file FASTX parsing fn fastx_reader( reader: &mut R, - first_byte: Option, mut callback: F, - type_callback: Option<&mut T>, + type_callback: &mut T, ) -> Result<(), ParseError> where F: for<'a> FnMut(SeqRecord<'a>) -> (), @@ -226,18 +224,11 @@ where T: ?Sized + FnMut(&'static str) -> (), { let mut first = vec![0]; - match first_byte { - Some(b) => first[0] = b, - None => { - reader.read_exact(&mut first)?; - }, - } - if let Some(f) = type_callback { - match first[0] { - b'>' => f("FASTA"), - b'@' => f("FASTQ"), - _ => (), - } + reader.read_exact(&mut first)?; + match first[0] { + b'>' => type_callback("FASTA"), + b'@' => type_callback("FASTQ"), + _ => (), } let mut rec_reader = RecReader::new(reader, 10_000_000, &first)?; let mut record_count = 0; @@ -281,29 +272,31 @@ where Ok(()) } -/// Parse a array of bytes into FASTX records and calls `callback` on each. -pub fn fastx_bytes<'b, F>(bytes: &'b [u8], callback: F) -> Result<(), ParseError> +#[cfg(not(feature = "compression"))] +pub fn parse_sequences( + mut reader: R, + mut type_callback: T, + callback: F, +) -> Result<(), ParseError> where F: for<'a> FnMut(SeqRecord<'a>) -> (), + R: Read, + T: FnMut(&'static str) -> (), { - let mut cursor = Cursor::new(bytes); - fastx_reader( - &mut cursor, - None, - callback, - None::<&mut FnMut(&'static str) -> ()>, - ) + //! Opens a `Read` stream and parses the FASTX records out. Also takes a "type_callback" + //! that gets called as soon as we determine if the records are FASTA or FASTQ. + fastx_reader(&mut reader, callback, &mut type_callback) } #[cfg(feature = "compression")] -pub fn fastx_stream( +pub fn parse_sequences( mut reader: R, mut type_callback: T, callback: F, ) -> Result<(), ParseError> where F: for<'a> FnMut(SeqRecord<'a>) -> (), - R: Read + Seek, + R: Read, T: FnMut(&'static str) -> (), { //! Opens a `Read` stream and parses the FASTX records out. Also takes a "type_callback" @@ -320,9 +313,10 @@ where ParseErrorType::BadCompression, )); } - let _ = reader.seek(SeekFrom::Start(0)); - let mut gz_reader = MultiGzDecoder::new(reader); - fastx_reader(&mut gz_reader, None, callback, Some(&mut type_callback)) + let cursor = Cursor::new(vec![0x1F, 0x8B]); + let mut gz_reader = MultiGzDecoder::new(cursor.chain(reader)); + + fastx_reader(&mut gz_reader, callback, &mut type_callback) } else if first[0] == 0x42 { // bz files reader.read_exact(&mut first)?; @@ -332,9 +326,10 @@ where ParseErrorType::BadCompression, )); } - let _ = reader.seek(SeekFrom::Start(0)); - let mut bz_reader = BzDecoder::new(reader); - fastx_reader(&mut bz_reader, None, callback, Some(&mut type_callback)) + let cursor = Cursor::new(vec![0x42, 0x5A]); + let mut bz_reader = BzDecoder::new(cursor.chain(reader)); + + fastx_reader(&mut bz_reader, callback, &mut type_callback) } else if first[0] == 0xFD { // xz files reader.read_exact(&mut first)?; @@ -344,134 +339,61 @@ where ParseErrorType::BadCompression, )); } - let _ = reader.seek(SeekFrom::Start(0)); - let mut xz_reader = XzDecoder::new(reader); - fastx_reader(&mut xz_reader, None, callback, Some(&mut type_callback)) - } else if first[0] == 0x50 { - // zip files - reader.read_exact(&mut first)?; - if first[0] != 0x4b { - return Err(ParseError::new( - "Bad zip header", - ParseErrorType::BadCompression, - )); - } - let _ = reader.seek(SeekFrom::Start(0)); + let cursor = Cursor::new(vec![0xFD, 0x37]); + let mut xz_reader = XzDecoder::new(cursor.chain(reader)); - let mut zip_archive = ZipArchive::new(reader) - .map_err(|err| ParseError::new(err.to_string(), ParseErrorType::BadCompression))?; - if zip_archive.len() != 1 { - return Err(ParseError::new( - "Zip archives has more than one file", - ParseErrorType::BadCompression, - )); - } - let mut zip_reader = zip_archive.by_index(0)?; - fastx_reader(&mut zip_reader, None, callback, Some(&mut type_callback)) + fastx_reader(&mut xz_reader, callback, &mut type_callback) } else { - fastx_reader(&mut reader, Some(first[0]), callback, Some(&mut type_callback)) + let cursor = Cursor::new(first); + let mut reader = cursor.chain(reader); + fastx_reader(&mut reader, callback, &mut type_callback) } } -#[cfg(feature = "compression")] -pub fn fastx_cli( - filename: &str, - mut type_callback: T, - callback: F, -) -> Result<(), ParseError> -where - F: for<'a> FnMut(SeqRecord<'a>) -> (), - T: FnMut(&'static str) -> (), -{ - //! Opens files (or stdin, if a dash is provided instead) and reads FASTX records out. Also - //! takes a "type_callback" that gets called as soon as we determine if the records are FASTA - //! or FASTQ. If a file starts with a gzip or other header, transparently decompress it. - if filename == "-" { - let sin = stdin(); - let mut lock = sin.lock(); - return fastx_reader(&mut lock, None, callback, Some(&mut type_callback)); - } +#[cfg(test)] +mod test { + use std::fs::File; + use std::io::Cursor; + use std::path::Path; - let f = File::open(&Path::new(filename))?; - fastx_stream(f, type_callback, callback) -} + use crate::buffer::{RecBuffer, RecReader}; + use crate::fastx::{parse_sequences, FASTA, FASTQ}; + use crate::util::ParseErrorType; -#[test] -fn test_callback() { - let mut i = 0; - let res = fastx_bytes(&b">test\nAGCT\n>test2\nGATC"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(seq.qual, None); - }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"GATC"[..]); - assert_eq!(seq.qual, None); - }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(res, Ok(())); - assert_eq!(i, 2); - - i = 0; - let res = fastx_cli( - "./tests/data/test.fa", - |filetype| { - assert_eq!(filetype, "FASTA"); - }, - |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], b"AGCTGATCGA"); - assert_eq!(seq.qual, None); - }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], b"TAGC"); - assert_eq!(seq.qual, None); - }, - _ => unreachable!("Too many records"), - } - i += 1; - }, - ); - assert_eq!(res, Ok(())); - assert_eq!(i, 2); - - let res = fastx_cli( - "./tests/data/bad_test.fa", - |_| { - unreachable!("This is not a valid file type"); - }, - |_| { - unreachable!("No valid records in this file to parse"); - }, - ); - let e = res.unwrap_err(); - assert_eq!(e.error_type, ParseErrorType::InvalidHeader); - assert_eq!(e.msg, String::from("Bad starting byte")); -} + fn seq(s: &[u8]) -> Cursor<&[u8]> { + Cursor::new(&s[..]) + } -#[cfg(feature = "compression")] -#[test] -fn test_compressed() { - let test_files = [ - "./tests/data/test.fa.gz", - "./tests/data/test.fa.bz2", - "./tests/data/test.fa.xz", - "./tests/data/test.fa.zip", - ]; - - for test_file in test_files.iter() { + #[test] + fn test_callback() { let mut i = 0; - let res = fastx_cli( - test_file, + let res = parse_sequences( + seq(b">test\nAGCT\n>test2\nGATC"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(seq.qual, None); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], &b"GATC"[..]); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; + }, + ); + assert_eq!(res, Ok(())); + assert_eq!(i, 2); + + i = 0; + let file = File::open(&Path::new("./tests/data/test.fa")).unwrap(); + let res = parse_sequences( + file, |filetype| { assert_eq!(filetype, "FASTA"); }, @@ -494,295 +416,387 @@ fn test_compressed() { ); assert_eq!(res, Ok(())); assert_eq!(i, 2); + + let file = File::open(&Path::new("./tests/data/bad_test.fa")).unwrap(); + let res = parse_sequences( + file, + |_| { + unreachable!("This is not a valid file type"); + }, + |_| { + unreachable!("No valid records in this file to parse"); + }, + ); + let e = res.unwrap_err(); + assert_eq!(e.error_type, ParseErrorType::InvalidHeader); + assert_eq!(e.msg, String::from("Bad starting byte")); } -} -#[test] -fn test_fastq() { - let mut i = 0; - let res = fastx_bytes( - &b"@test\nAGCT\n+test\n~~a!\n@test2\nTGCA\n+test\nWUI9"[..], - |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + #[cfg(feature = "compression")] + #[test] + fn test_compressed() { + let test_files = [ + "./tests/data/test.fa.gz", + "./tests/data/test.fa.bz2", + "./tests/data/test.fa.xz", + ]; + + for test_file in test_files.iter() { + let mut i = 0; + let file = File::open(&Path::new(test_file)).unwrap(); + let res = parse_sequences( + file, + |filetype| { + assert_eq!(filetype, "FASTA"); }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"TGCA"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"WUI9"[..]); + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], b"AGCTGATCGA"); + assert_eq!(seq.qual, None); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], b"TAGC"); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }, - ); - assert_eq!(i, 2); - assert_eq!(res, Ok(())); - - let mut i = 0; - let res = fastx_bytes( - &b"@test\r\nAGCT\r\n+test\r\n~~a!\r\n@test2\r\nTGCA\r\n+test\r\nWUI9"[..], - |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); - }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"TGCA"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"WUI9"[..]); - }, - _ => unreachable!("Too many records"), - } - i += 1; - }, - ); - assert_eq!(res, Ok(())); - assert_eq!(i, 2); -} + ); + assert_eq!(res, Ok(())); + assert_eq!(i, 2); + } + } -#[test] -fn test_wrapped_fasta() { - let mut i = 0; - let res = fastx_bytes(&b">test\nAGCT\nTCG\n>test2\nG"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCTTCG"[..]); - assert_eq!(seq.qual, None); - }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"G"[..]); - assert_eq!(seq.qual, None); + #[test] + fn test_fastq() { + let mut i = 0; + let res = parse_sequences( + seq(b"@test\nAGCT\n+test\n~~a!\n@test2\nTGCA\n+test\nWUI9"), + |_| (), + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], &b"TGCA"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"WUI9"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(res, Ok(())); - assert_eq!(i, 2); - - let mut i = 0; - let res = fastx_bytes(&b">test\r\nAGCT\r\nTCG\r\n>test2\r\nG"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCTTCG"[..]); - assert_eq!(seq.qual, None); + ); + assert_eq!(i, 2); + assert_eq!(res, Ok(())); + + let mut i = 0; + let res = parse_sequences( + seq(b"@test\r\nAGCT\r\n+test\r\n~~a!\r\n@test2\r\nTGCA\r\n+test\r\nWUI9"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], &b"TGCA"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"WUI9"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"G"[..]); - assert_eq!(seq.qual, None); + ); + assert_eq!(res, Ok(())); + assert_eq!(i, 2); + } + + #[test] + fn test_wrapped_fasta() { + let mut i = 0; + let res = parse_sequences( + seq(b">test\nAGCT\nTCG\n>test2\nG"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCTTCG"[..]); + assert_eq!(seq.qual, None); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], &b"G"[..]); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(res, Ok(())); - assert_eq!(i, 2); -} + ); + assert_eq!(res, Ok(())); + assert_eq!(i, 2); -#[test] -fn test_premature_endings() { - let mut i = 0; - let res = fastx_bytes(&b">test\nAGCT\n>test2"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(seq.qual, None); + let mut i = 0; + let res = parse_sequences( + seq(b">test\r\nAGCT\r\nTCG\r\n>test2\r\nG"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCTTCG"[..]); + assert_eq!(seq.qual, None); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], &b"G"[..]); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(i, 1); - let e = res.unwrap_err(); - assert_eq!(e.error_type, ParseErrorType::PrematureEOF); - - let mut i = 0; - let res = fastx_bytes(&b"@test\nAGCT\n+test\n~~a!\n@test2\nTGCA"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + ); + assert_eq!(res, Ok(())); + assert_eq!(i, 2); + } + + #[test] + fn test_premature_endings() { + let mut i = 0; + let res = parse_sequences( + seq(b">test\nAGCT\n>test2"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(i, 1); - let e = res.unwrap_err(); - assert_eq!(e.error_type, ParseErrorType::PrematureEOF); - - // we allow a few extra newlines at the ends of FASTQs - let mut i = 0; - let res = fastx_bytes(&b"@test\nAGCT\n+test\n~~a!\n\n"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + ); + assert_eq!(i, 1); + let e = res.unwrap_err(); + assert_eq!(e.error_type, ParseErrorType::PrematureEOF); + + let mut i = 0; + let res = parse_sequences( + seq(b"@test\nAGCT\n+test\n~~a!\n@test2\nTGCA"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(i, 1); - assert_eq!(res, Ok(())); - - // but if there's additional data past the newlines it's an error - let mut i = 0; - let res = fastx_bytes( - &b"@test\nAGCT\n+test\n~~a!\n\n@TEST\nA\n+TEST\n~"[..], - |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"AGCT"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); - }, - _ => unreachable!("Too many records"), - } - i += 1; - }, - ); - assert_eq!(i, 1); - let e = res.unwrap_err(); - assert_eq!(e.error_type, ParseErrorType::PrematureEOF); - - // test that an abrupt stop in a FASTA triggers an error - let mut i = 0; - let res = fastx_bytes( - &b">test\nACGT\n>test2\n"[..], - |seq| { - match i { - 0 => { - assert_eq!(seq.id, "test"); - assert_eq!(&seq.seq[..], &b"ACGT"[..]); - }, - _ => unreachable!("Too many records"), - } - i += 1; - }, - ); - assert_eq!(i, 1); - assert_eq!(res, Err(ParseError::PrematureEOF)); -} + ); + assert_eq!(i, 1); + let e = res.unwrap_err(); + assert_eq!(e.error_type, ParseErrorType::PrematureEOF); -#[test] -fn test_empty_records() { - let mut i = 0; - let res = fastx_bytes(&b"@\n\n+\n\n@test2\nTGCA\n+test2\n~~~~\n"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, ""); - assert_eq!(&seq.seq[..], &b""[..]); - assert_eq!(&seq.qual.unwrap()[..], &b""[..]); + // we allow a few extra newlines at the ends of FASTQs + let mut i = 0; + let res = parse_sequences( + seq(b"@test\nAGCT\n+test\n~~a!\n\n"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - 1 => { - assert_eq!(seq.id, "test2"); - assert_eq!(&seq.seq[..], &b"TGCA"[..]); - assert_eq!(&seq.qual.unwrap()[..], &b"~~~~"[..]); + ); + assert_eq!(i, 1); + assert_eq!(res, Ok(())); + + // but if there's additional data past the newlines it's an error + let mut i = 0; + let res = parse_sequences( + seq(b"@test\nAGCT\n+test\n~~a!\n\n@TEST\nA\n+TEST\n~"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"AGCT"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"~~a!"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(i, 2); - assert_eq!(res, Ok(())); - - let mut i = 0; - let res = fastx_bytes(&b">\n\n>shine\nAGGAGGU"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, ""); - assert_eq!(&seq.seq[..], &b""[..]); - assert_eq!(seq.qual, None); + ); + assert_eq!(i, 1); + let e = res.unwrap_err(); + assert_eq!(e.error_type, ParseErrorType::PrematureEOF); + + // test that an abrupt stop in a FASTA triggers an error + let mut i = 0; + let res = parse_sequences( + seq(b">test\nACGT\n>test2\n"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, "test"); + assert_eq!(&seq.seq[..], &b"ACGT"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - 1 => { - assert_eq!(seq.id, "shine"); - assert_eq!(&seq.seq[..], &b"AGGAGGU"[..]); - assert_eq!(seq.qual, None); + ); + assert_eq!(i, 1); + let e = res.unwrap_err(); + assert_eq!(e.error_type, ParseErrorType::PrematureEOF); + } + + #[test] + fn test_empty_records() { + let mut i = 0; + let res = parse_sequences( + seq(b"@\n\n+\n\n@test2\nTGCA\n+test2\n~~~~\n"), + |stype| { + assert_eq!(stype, "FASTQ"); }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(i, 2); - assert_eq!(res, Ok(())); - - let mut i = 0; - let res = fastx_bytes(&b">\r\n\r\n>shine\r\nAGGAGGU"[..], |seq| { - match i { - 0 => { - assert_eq!(seq.id, ""); - assert_eq!(&seq.seq[..], &b""[..]); - assert_eq!(seq.qual, None); + |seq| { + match i { + 0 => { + assert_eq!(seq.id, ""); + assert_eq!(&seq.seq[..], &b""[..]); + assert_eq!(&seq.qual.unwrap()[..], &b""[..]); + }, + 1 => { + assert_eq!(seq.id, "test2"); + assert_eq!(&seq.seq[..], &b"TGCA"[..]); + assert_eq!(&seq.qual.unwrap()[..], &b"~~~~"[..]); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - 1 => { - assert_eq!(seq.id, "shine"); - assert_eq!(&seq.seq[..], &b"AGGAGGU"[..]); - assert_eq!(seq.qual, None); + ); + assert_eq!(res, Ok(())); + assert_eq!(i, 2); + + let mut i = 0; + let res = parse_sequences( + seq(b">\n\n>shine\nAGGAGGU"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, ""); + assert_eq!(&seq.seq[..], &b""[..]); + assert_eq!(seq.qual, None); + }, + 1 => { + assert_eq!(seq.id, "shine"); + assert_eq!(&seq.seq[..], &b"AGGAGGU"[..]); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; }, - _ => unreachable!("Too many records"), - } - i += 1; - }); - assert_eq!(i, 2); - assert_eq!(res, Ok(())); -} + ); + assert_eq!(i, 2); + assert_eq!(res, Ok(())); -#[test] -fn test_buffer() { - let mut buf: RecBuffer = RecBuffer::from_bytes(b">test\nACGT"); - let rec = buf.next().unwrap().unwrap(); - assert_eq!(rec.id, "test", "Record has the right ID"); - assert_eq!(rec.seq, b"ACGT", "Record has the right sequence"); + let mut i = 0; + let res = parse_sequences( + seq(b">\r\n\r\n>shine\r\nAGGAGGU"), + |_| {}, + |seq| { + match i { + 0 => { + assert_eq!(seq.id, ""); + assert_eq!(&seq.seq[..], &b""[..]); + assert_eq!(seq.qual, None); + }, + 1 => { + assert_eq!(seq.id, "shine"); + assert_eq!(&seq.seq[..], &b"AGGAGGU"[..]); + assert_eq!(seq.qual, None); + }, + _ => unreachable!("Too many records"), + } + i += 1; + }, + ); + assert_eq!(i, 2); + assert_eq!(res, Ok(())); + } - let mut buf: RecBuffer = RecBuffer::from_bytes(b">test"); - assert!(buf.next().is_none(), "Incomplete record returns None"); -} + #[test] + fn test_buffer() { + let mut buf: RecBuffer = RecBuffer::from_bytes(b">test\nACGT"); + let rec = buf.next().unwrap().unwrap(); + assert_eq!(rec.id, "test", "Record has the right ID"); + assert_eq!(rec.seq, b"ACGT", "Record has the right sequence"); -#[test] -fn test_fastq_across_buffer() { - let test_seq = b"@A\nA\n+A\nA\n@B\nA\n+B\n!"; - let mut cursor = Cursor::new(test_seq); - // the buffer is aligned to the first record - let mut rec_reader = RecReader::new(&mut cursor, 9, b"").unwrap(); + let mut buf: RecBuffer = RecBuffer::from_bytes(b">test"); + assert!(buf.next().is_none(), "Incomplete record returns None"); + } - let used = { - let mut rec_buffer = rec_reader.get_buffer::(0); - for _s in rec_buffer.by_ref() { - // record is incomplete - panic!("No initial record should be parsed") - } - rec_buffer.pos - }; + #[test] + fn test_fastq_across_buffer() { + let test_seq = b"@A\nA\n+A\nA\n@B\nA\n+B\n!"; + let mut cursor = Cursor::new(test_seq); + // the buffer is aligned to the first record + let mut rec_reader = RecReader::new(&mut cursor, 9, b"").unwrap(); + + let used = { + let mut rec_buffer = rec_reader.get_buffer::(0); + for _s in rec_buffer.by_ref() { + // record is incomplete + panic!("No initial record should be parsed") + } + rec_buffer.pos + }; - // refill the buffer, but we're not done quite yet - assert_eq!(rec_reader.refill(used).unwrap(), false); + // refill the buffer, but we're not done quite yet + assert_eq!(rec_reader.refill(used).unwrap(), false); - // now we should see both records - let mut rec_buffer = rec_reader.get_buffer::(0); + // now we should see both records + let mut rec_buffer = rec_reader.get_buffer::(0); - // there should be a record assuming the parser - // handled the buffer boundary - let iterated_seq = rec_buffer.by_ref().next(); - let seq = iterated_seq.unwrap(); - assert_eq!(seq.unwrap().id, "A"); + // there should be a record assuming the parser + // handled the buffer boundary + let iterated_seq = rec_buffer.by_ref().next(); + let seq = iterated_seq.unwrap(); + assert_eq!(seq.unwrap().id, "A"); - // but not another because the buffer's too short - let iterated_seq = rec_buffer.by_ref().next(); - assert!(iterated_seq.is_none()); + // but not another because the buffer's too short + let iterated_seq = rec_buffer.by_ref().next(); + assert!(iterated_seq.is_none()); - // TODO: refill and check for the last record + // TODO: refill and check for the last record + } } diff --git a/src/kmer.rs b/src/kmer.rs index e299272..48b31cf 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -67,10 +67,10 @@ pub fn canonical(seq: &[u8]) -> Cow<[u8]> { // loop through the kmer and its reverse complement simultaneously for (rn, n) in seq.iter().rev().map(|n| complement(*n)).zip(seq.iter()) { buf.push(rn); - if !enough && n < &rn { + if !enough && (*n < rn) { original_was_canonical = true; break; - } else if !enough && &rn < n { + } else if !enough && (rn < *n) { enough = true; } // unstated if branch: if rn == n, keep comparing @@ -101,10 +101,10 @@ pub fn minimizer(seq: &[u8], length: usize) -> Cow<[u8]> { let mut minmer = Cow::Borrowed(&seq[..length]); for (kmer, rc_kmer) in seq.windows(length).zip(reverse_complement.windows(length)) { - if kmer < &minmer[..] { + if *kmer < minmer[..] { minmer = kmer.into(); } - if rc_kmer < &minmer[..] { + if *rc_kmer < minmer[..] { minmer = rc_kmer.to_vec().into(); } } diff --git a/src/lib.rs b/src/lib.rs index 6c33bc3..1b85e51 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,18 +1,9 @@ #![crate_name = "needletail"] -extern crate memchr; - -#[cfg(feature = "compression")] -extern crate bzip2; -#[cfg(feature = "compression")] -extern crate flate2; -#[cfg(feature = "compression")] -extern crate xz2; -#[cfg(feature = "compression")] -extern crate zip; - pub mod bitkmer; mod buffer; pub mod fastx; pub mod kmer; pub mod seq; mod util; + +pub use fastx::parse_sequences; diff --git a/src/seq.rs b/src/seq.rs index 2d47c94..1946453 100644 --- a/src/seq.rs +++ b/src/seq.rs @@ -134,15 +134,7 @@ impl<'a> SeqRecord<'a> { .seq .iter() .zip(qual.iter()) - .map( - |(base, qual)| { - if *qual < score { - b'N' - } else { - *base - } - }, - ) + .map(|(base, qual)| if *qual < score { b'N' } else { *base }) .collect(); SeqRecord { id: self.id, @@ -153,24 +145,22 @@ impl<'a> SeqRecord<'a> { } /// Capitalize everything and mask unknown bases to N - pub fn normalize(&'a mut self, iupac: bool) -> bool { + pub fn normalize(mut self, iupac: bool) -> Self { let (seq, changed) = normalize(&self.seq, iupac); if changed { self.seq = seq.into(); } - changed + self } /// Mask tabs in header lines to `|`s /// /// Returns `true` if the header was masked - pub fn mask_header(&mut self) -> bool { + pub fn mask_header(mut self) -> Self { if memchr(b'\t', self.id.as_ref().as_bytes()).is_some() { self.id = self.id.as_ref().replace("\t", "|").into(); - true - } else { - false } + self } /// Return an iterator the returns valid kmers diff --git a/src/util.rs b/src/util.rs index 8a71320..a38b297 100644 --- a/src/util.rs +++ b/src/util.rs @@ -6,9 +6,6 @@ use std::str; use memchr::{memchr, memchr2}; -#[cfg(feature = "compression")] -use zip::result::ZipError; - #[derive(Clone, Debug, PartialEq)] pub enum ParseErrorType { BadCompression, @@ -73,7 +70,7 @@ impl error::Error for ParseError { "ParseError" } - fn cause(&self) -> Option<&error::Error> { + fn cause(&self) -> Option<&dyn error::Error> { None } } @@ -90,13 +87,6 @@ impl From for ParseError { } } -#[cfg(feature = "compression")] -impl From for ParseError { - fn from(err: ZipError) -> ParseError { - ParseError::new(err.to_string(), ParseErrorType::BadCompression) - } -} - /// remove newlines from within FASTX records; currently the rate limiting step /// in FASTX parsing (in general; readfq also exhibits this behavior) #[inline] diff --git a/benches/benchmark.rs b/tests/benchmark.rs similarity index 69% rename from benches/benchmark.rs rename to tests/benchmark.rs index 9197e43..4d6ba5d 100644 --- a/benches/benchmark.rs +++ b/tests/benchmark.rs @@ -3,25 +3,26 @@ extern crate bencher; extern crate needletail; use bencher::Bencher; -use needletail::{bitkmer, fastx, kmer}; +use needletail::fastx; use std::fs::File; -use std::io::Read; +use std::io::{Cursor, Read}; // from Bio.SeqIO import parse // n_total = sum([len([k for k in slid_win(i.seq, 31) if set(k).issubset({'A', 'C', 'G', 'T'})]) for i in SeqIO.parse('./tests/data/28S.fasta', 'fasta')]) fn bench_kmer_speed(bench: &mut Bencher) { - let filename = String::from("tests/data/28S.fasta"); + let filename = "tests/data/28S.fasta"; let ksize = 31; bench.iter(|| { let mut n_total = 0; let mut n_canonical = 0; - fastx::fastx_cli( - &filename[..], + let file = File::open(filename).unwrap(); + fastx::parse_sequences( + file, |_| {}, |seq| { - for (_, kmer, was_rc) in seq.normalize(true).kmers(ksize, true) { + for (_, _kmer, was_rc) in seq.normalize(true).kmers(ksize, true) { if !was_rc { n_canonical += 1; } @@ -36,17 +37,18 @@ fn bench_kmer_speed(bench: &mut Bencher) { } fn bench_bitkmer_speed(bench: &mut Bencher) { - let filename = String::from("tests/data/28S.fasta"); + let filename = "tests/data/28S.fasta"; let ksize = 31; bench.iter(|| { let mut n_total = 0; let mut n_canonical = 0; - fastx::fastx_cli( - &filename[..], + let file = File::open(filename).unwrap(); + fastx::parse_sequences( + file, |_| {}, |seq| { - for (_, k, was_rc) in seq.bit_kmers(ksize, true) { + for (_, _kmer, was_rc) in seq.bit_kmers(ksize, true) { if !was_rc { n_canonical += 1; } @@ -63,33 +65,33 @@ fn bench_bitkmer_speed(bench: &mut Bencher) { benchmark_group!(kmers, bench_kmer_speed, bench_bitkmer_speed); fn bench_fastq_bytes(bench: &mut Bencher) { - let filename = String::from("tests/data/PRJNA271013_head.fq"); - let ksize = 31; - let mut data: Vec = vec![]; - let mut f = File::open(filename).unwrap(); - f.read_to_end(&mut data); + let mut f = File::open("tests/data/PRJNA271013_head.fq").unwrap(); + let _ = f.read_to_end(&mut data); bench.iter(|| { let mut n_bases = 0; - fastx::fastx_bytes(&data, |seq| { - n_bases += seq.seq.len(); - }) + fastx::parse_sequences( + Cursor::new(&data), + |_| {}, + |seq| { + n_bases += seq.seq.len(); + }, + ) .unwrap(); assert_eq!(250000, n_bases); }) } fn bench_fastq_file(bench: &mut Bencher) { - let filename = String::from("tests/data/PRJNA271013_head.fq"); - let ksize = 31; + let filename = "tests/data/PRJNA271013_head.fq"; // warming up the cache doesn't seem to make the timings more repeatable? // fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap(); bench.iter(|| { let mut n_bases = 0; - fastx::fastx_cli( - &filename[..], + fastx::parse_sequences( + File::open(filename).unwrap(), |_| {}, |seq| { n_bases += seq.seq.len(); @@ -102,32 +104,34 @@ fn bench_fastq_file(bench: &mut Bencher) { fn bench_fasta_bytes(bench: &mut Bencher) { let filename = String::from("tests/data/28S.fasta"); - let ksize = 31; let mut data: Vec = vec![]; let mut f = File::open(filename).unwrap(); - f.read_to_end(&mut data); + let _ = f.read_to_end(&mut data); bench.iter(|| { let mut n_bases = 0; - fastx::fastx_bytes(&data, |seq| { - n_bases += seq.seq.len(); - }) + fastx::parse_sequences( + Cursor::new(&data), + |_| {}, + |seq| { + n_bases += seq.seq.len(); + }, + ) .unwrap(); assert_eq!(738580, n_bases); }) } fn bench_fasta_file(bench: &mut Bencher) { - let filename = String::from("tests/data/28S.fasta"); - let ksize = 31; + let filename = "tests/data/28S.fasta"; // warming up the cache doesn't seem to make the timings more repeatable? // fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap(); bench.iter(|| { let mut n_bases = 0; - fastx::fastx_cli( - &filename[..], + fastx::parse_sequences( + File::open(filename).unwrap(), |_| {}, |seq| { n_bases += seq.seq.len(); diff --git a/tests/data/test.fa.zip b/tests/data/test.fa.zip deleted file mode 100644 index 33409264ed515bb82594e65c0c33e6025350e4e4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 183 zcmWIWW@Zs#-~dAQB2ixkB*4xfz)+G}T%wnj7#hLDpxRgyEeph78ATX2YkTVIJuE6I z=PNAdDJ(00vWb~fqOg^rbq7O$H#>)g>}@q^ph>bo9N^8!B+87i0a=cnfd^)v!;(f2 Z6K)>|+`a&BRyL3nBM>?Q=>!mm0RY|?A;