From 1f5ac3a24849dd8fd119609ab571a9ba5c4345d6 Mon Sep 17 00:00:00 2001 From: Roderick Bovee Date: Fri, 6 Sep 2019 16:52:59 -0700 Subject: [PATCH] Add test against Specimens.jl, improve buffer perf & error msging --- .circleci/config.yml | 124 ++++++++++++++++++++++++++++++++++--- Cargo.toml | 9 +++ benches/benchmark.rs | 72 +++++---------------- rustfmt.toml | 3 - src/bitkmer.rs | 26 ++++---- src/formats/buffer.rs | 95 +++++++++++----------------- src/formats/fasta.rs | 56 +++++++++-------- src/formats/fastq.rs | 113 +++++++++++++++++++++++++-------- src/formats/mod.rs | 8 +-- src/kmer.rs | 2 +- src/lib.rs | 2 +- src/seq.rs | 42 +++++-------- src/util.rs | 4 +- tests/format_specimens.rs | 127 ++++++++++++++++++++++++++++++++++++++ 14 files changed, 454 insertions(+), 229 deletions(-) delete mode 100644 rustfmt.toml create mode 100644 tests/format_specimens.rs diff --git a/.circleci/config.yml b/.circleci/config.yml index 211e956..587db5e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,10 +1,32 @@ -version: 2 -jobs: - test: - docker: - - image: circleci/rust:1.37-stretch +version: 2.1 + +executors: + needletail: + machine: + image: ubuntu-1604:201903-01 + +commands: + checkout_and_setup: + description: "Checkout code and set up rust" steps: - checkout + - restore_cache: + name: restore rust install + keys: + - rust-stable + - run: + name: set up rust nightly + command: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --no-modify-path --default-toolchain stable + echo 'export PATH=$HOME/.cargo/bin:$PATH' >> $BASH_ENV + source $HOME/.cargo/env + rustup show + - save_cache: + name: cache rust install + paths: + - ~/.rustup/ + - ~/.cargo/ + key: rust-stable - run: name: Version information command: rustc --version; cargo --version; rustup --version @@ -12,24 +34,106 @@ jobs: name: Calculate dependencies command: cargo generate-lockfile - restore_cache: + name: restore project build artifacts keys: - v4-cargo-cache-{{ arch }}-{{ checksum "Cargo.lock" }} - run: name: Build all targets - command: cargo build --all --all-targets + command: cargo build --all --all-targets --all-features - save_cache: + name: save project build artifacts paths: - - /usr/local/cargo/registry + - ~/.cargo/registry - target/debug/.fingerprint - target/debug/build - target/debug/deps key: v4-cargo-cache-{{ arch }}-{{ checksum "Cargo.lock" }} + +jobs: + build: + executor: needletail + steps: + - checkout_and_setup + test: + executor: needletail + steps: + - checkout_and_setup - run: name: Run all tests - command: cargo test --all + command: cargo test --all --all-features + - run: + name: Run slow tests + command: cargo test -- --ignored + lint: + executor: needletail + steps: + - checkout_and_setup + - run: + name: Format + command: | + rustup component add rustfmt + cargo fmt --all -- --check + - run: + name: Clippy + command: | + rustup component add clippy + cargo clippy --all-features -- -D warnings + coverage: + executor: needletail + steps: + - checkout_and_setup + - restore_cache: + keys: + - cargo-tarpaulin-0.8.6 + - run: + name: install cargo tarpaulin + command: cargo install cargo-tarpaulin --version 0.8.6 || echo "cargo-tarpaulin already installed" + environment: + RUSTFLAGS: --cfg procmacro2_semver_exempt + - save_cache: + paths: + - ~/.cargo/bin/cargo-tarpaulin + key: cargo-tarpaulin-0.8.6 + - run: + name: Generate coverage report + command: cargo tarpaulin --out Xml --all --all-features -t 600 + environment: + LZMA_API_STATIC: 1 + - run: + name: Export coverage 
to codecov + command: bash <(curl -s https://codecov.io/bash) || echo "Failed to upload coverage" + bench: + # TODO: probably need to do something useful here (use critcmp?) before turning this on + executor: needletail + steps: + - checkout_and_setup + - run: + name: Run benchmarks + command: | + cargo bench + fuzz: + # TODO: need to figure out how to install nightly here and probably cache the cargo-fuzz binary + executor: needletail + steps: + - checkout_and_setup + - run: + name: Run fuzz for 3 minutes each + command: | + cargo +nightly install cargo-fuzz + cargo +nightly fuzz run parse_fasta -- -max_total_time=180 + cargo +nightly fuzz run parse_fastq -- -max_total_time=180 workflows: version: 2 - tests: + ci-checks: jobs: - - test + - build + - coverage: + requires: + - build + - test: + requires: + - build + - lint: + requires: + - build diff --git a/Cargo.toml b/Cargo.toml index 8871409..becf607 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,12 +19,21 @@ flate2 = { version="1.0.6", optional=true } bzip2 = { version="0.3.3", optional=true } xz2 = { version="0.1.6", optional=true } memchr = "2.2.1" +safemem = "0.3.2" [dev-dependencies] criterion = "0.3" + +# for benchmark comparisons bio = "0.28" seq_io = "0.3" +# for testing with the FormatSpecimens.jl repo +reqwest = "0.9" +toml = "0.4" +serde = "1.0" +serde_derive = "1.0" + [[bench]] name = "benchmark" harness = false diff --git a/benches/benchmark.rs b/benches/benchmark.rs index a04d863..3a02877 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -38,8 +38,8 @@ fn bench_kmer_speed(c: &mut Criterion) { }, ) .unwrap(); - assert_eq!(718007, n_total); - assert_eq!(350983, n_canonical); + assert_eq!(718_007, n_total); + assert_eq!(350_983, n_canonical); }); }); @@ -61,8 +61,8 @@ fn bench_kmer_speed(c: &mut Criterion) { }, ) .unwrap(); - assert_eq!(718007, n_total); - assert_eq!(350983, n_canonical); + assert_eq!(718_007, n_total); + assert_eq!(350_983, n_canonical); }); }); } @@ -89,7 +89,7 @@ fn bench_fastq_file(c: &mut Criterion) { let record = record.unwrap(); n_bases += record.seq().len() } - assert_eq!(250000, n_bases); + assert_eq!(250_000, n_bases); }); }); @@ -103,7 +103,7 @@ fn bench_fastq_file(c: &mut Criterion) { let seqlen = record.seq().len(); n_bases += seqlen; } - assert_eq!(250000, n_bases); + assert_eq!(250_000, n_bases); }); }); @@ -119,38 +119,19 @@ fn bench_fastq_file(c: &mut Criterion) { }, ) .unwrap(); - assert_eq!(250000, n_bases); - }); - }); - - group.bench_function("Needletail (Macro)", |bench| { - use needletail::formats::FastqReader; - use needletail::{parse_stream, ParseError}; - #[inline] - fn get_n_bases(mut fastq_data: &mut dyn Read) -> Result<usize, ParseError> { - let mut n_bases = 0; - parse_stream!(&mut fastq_data, &b""[..], FastqReader, rec, { - n_bases += rec.seq.len(); - }); - Ok(n_bases) - } - - bench.iter(|| { - let mut fastq_data = Cursor::new(data.clone()); - let n_bases = get_n_bases(&mut fastq_data).unwrap(); - assert_eq!(250000, n_bases); + assert_eq!(250_000, n_bases); }); }); group.bench_function("Needletail (No Buffer)", |bench| { - use needletail::formats::{FastqReader, RecReader}; + use needletail::formats::{FastqParser, RecParser}; bench.iter(|| { - let mut reader = FastqReader::from_buffer(&data, true); + let mut reader = FastqParser::from_buffer(&data, true); let mut n_bases = 0; for seq in reader.by_ref() { n_bases += seq.unwrap().seq.len(); } - assert_eq!(250000, n_bases); + assert_eq!(250_000, n_bases); }); }); } @@ -174,7 +155,7 @@ fn bench_fasta_file(c: &mut Criterion) { let record
= record.unwrap(); n_bases += record.seq().len() } - assert_eq!(738580, n_bases); + assert_eq!(738_580, n_bases); }); }); @@ -189,7 +170,7 @@ fn bench_fasta_file(c: &mut Criterion) { n_bases += s.len(); } } - assert_eq!(738580, n_bases); + assert_eq!(738_580, n_bases); }); }); @@ -205,42 +186,21 @@ fn bench_fasta_file(c: &mut Criterion) { }, ) .unwrap(); - assert_eq!(738580, n_bases); - }); - }); - - group.bench_function("Needletail (Macro)", |bench| { - use needletail::formats::FastaReader; - use needletail::seq::Sequence; - use needletail::{parse_stream, ParseError}; - #[inline] - fn get_n_bases(mut fasta_data: &mut dyn Read) -> Result<usize, ParseError> { - let mut n_bases = 0; - parse_stream!(&mut fasta_data, &b""[..], FastaReader, rec, { - let seq = Sequence::from(rec); - n_bases += seq.seq.len(); - }); - Ok(n_bases) - } - - bench.iter(|| { - let mut fasta_data = Cursor::new(data.clone()); - let n_bases = get_n_bases(&mut fasta_data).unwrap(); - assert_eq!(738580, n_bases); + assert_eq!(738_580, n_bases); }); }); group.bench_function("Needletail (No Buffer)", |bench| { - use needletail::formats::{FastaReader, RecReader}; + use needletail::formats::{FastaParser, RecParser}; use needletail::seq::Sequence; bench.iter(|| { - let mut reader = FastaReader::from_buffer(&data, true); + let mut reader = FastaParser::from_buffer(&data, true); let mut n_bases = 0; for rec in reader.by_ref() { let seq = Sequence::from(rec.unwrap()); n_bases += seq.seq.len(); } - assert_eq!(738580, n_bases); + assert_eq!(738_580, n_bases); }); }); }
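For orientation, downstream code replaces the removed parse_stream! pattern with the callback API benchmarked above. A minimal, hypothetical sketch — the Cursor wrapper and the exact (reader, type-callback, record-callback) shape of parse_sequences are assumptions inferred from the tests elsewhere in this patch, not part of the diff:

    use needletail::parse_sequences;
    use std::io::Cursor;

    fn main() {
        let fasta = b">seq1\nACGT\n>seq2\nGGCC\n";
        let mut n_bases = 0;
        parse_sequences(
            Cursor::new(&fasta[..]), // assumed: any io::Read source works here
            |_| {},                  // called once with the detected file type
            |seq| {
                n_bases += seq.seq.len();
            },
        )
        .unwrap();
        assert_eq!(n_bases, 8);
    }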
diff --git a/rustfmt.toml b/rustfmt.toml deleted file mode 100644 index ec8ec17..0000000 --- a/rustfmt.toml +++ /dev/null @@ -1,3 +0,0 @@ -match_block_trailing_comma = true -newline_style = "Unix" -edition = "2018" diff --git a/src/bitkmer.rs b/src/bitkmer.rs index 00411b4..dda0146 100644 --- a/src/bitkmer.rs +++ b/src/bitkmer.rs @@ -140,14 +140,14 @@ fn can_kmerize() { #[test] fn test_iterator() { - let seq = "ACGTA".as_bytes(); + let seq = b"ACGTA"; let mut kmer_iter = BitNuclKmer::new(seq, 3, false); assert_eq!(kmer_iter.next(), Some((0, (6, 3), false))); assert_eq!(kmer_iter.next(), Some((1, (27, 3), false))); assert_eq!(kmer_iter.next(), Some((2, (44, 3), false))); assert_eq!(kmer_iter.next(), None); - let seq = "TA".as_bytes(); + let seq = b"TA"; let mut kmer_iter = BitNuclKmer::new(seq, 3, false); assert_eq!(kmer_iter.next(), None); } @@ -172,10 +172,10 @@ pub fn reverse_complement(kmer: BitKmer) -> BitKmer { #[test] fn test_reverse_complement() { - assert_eq!(reverse_complement((0b000000, 3)).0, 0b111111); - assert_eq!(reverse_complement((0b111111, 3)).0, 0b000000); - assert_eq!(reverse_complement((0b00000000, 4)).0, 0b11111111); - assert_eq!(reverse_complement((0b00011011, 4)).0, 0b00011011); + assert_eq!(reverse_complement((0b00_0000, 3)).0, 0b11_1111); + assert_eq!(reverse_complement((0b11_1111, 3)).0, 0b00_0000); + assert_eq!(reverse_complement((0b0000_0000, 4)).0, 0b1111_1111); + assert_eq!(reverse_complement((0b0001_1011, 4)).0, 0b0001_1011); } /// Return the lexicographically lowest of the BitKmer and its reverse complement and @@ -210,10 +210,10 @@ pub fn minimizer(kmer: BitKmer, minmer_size: u8) -> BitKmer { #[test] fn test_minimizer() { - assert_eq!(minimizer((0b001011, 3), 2).0, 0b0010); - assert_eq!(minimizer((0b001011, 3), 1).0, 0b00); - assert_eq!(minimizer((0b11000011, 4), 2).0, 0b0000); - assert_eq!(minimizer((0b110001, 3), 2).0, 0b0001); + assert_eq!(minimizer((0b00_1011, 3), 2).0, 0b0010); + assert_eq!(minimizer((0b00_1011, 3), 1).0, 0b00); + assert_eq!(minimizer((0b1100_0011, 4), 2).0, 0b0000); + assert_eq!(minimizer((0b11_0001, 3), 2).0, 0b0001); } pub fn bitmer_to_bytes(kmer: BitKmer) -> Vec<u8> { @@ -259,12 +259,12 @@ pub fn bytes_to_bitmer(kmer: &[u8]) -> BitKmer { #[test] fn test_bytes_to_bitkmer() { - let mut ikmer: BitKmer = bytes_to_bitmer("C".as_bytes()); + let mut ikmer: BitKmer = bytes_to_bitmer(b"C"); assert_eq!(ikmer.0, 1 as BitKmerSeq); - ikmer = bytes_to_bitmer("TTA".as_bytes()); + ikmer = bytes_to_bitmer(b"TTA"); assert_eq!(ikmer.0, 60 as BitKmerSeq); - ikmer = bytes_to_bitmer("AAA".as_bytes()); + ikmer = bytes_to_bitmer(b"AAA"); assert_eq!(ikmer.0, 0 as BitKmerSeq); } diff --git a/src/formats/buffer.rs b/src/formats/buffer.rs index 63d833f..cc1640f 100644 --- a/src/formats/buffer.rs +++ b/src/formats/buffer.rs @@ -1,29 +1,11 @@ use std::io; -use std::marker::PhantomData; -use crate::util::ParseError; +use safemem::copy_over; -#[inline] -fn fill_buffer( - file: &mut dyn io::Read, - data: &[u8], - buf_size: usize, -) -> Result<(Vec<u8>, bool), ParseError> { - let mut buf = Vec::with_capacity(buf_size + data.len()); - unsafe { - buf.set_len(buf_size + data.len()); - } - buf[..data.len()].copy_from_slice(data); - let amt_read = file.read(&mut buf[data.len()..])?; - unsafe { - buf.set_len(amt_read + data.len()); - } - Ok((buf, amt_read == 0)) -} +use crate::util::ParseError; -pub struct RecBuffer<'a, T> { - rec_type: PhantomData<T>, - file: Option<&'a mut dyn io::Read>, +pub struct RecBuffer<'a> { + file: &'a mut dyn io::Read, pub buf: Vec<u8>, pub last: bool, } @@ -31,10 +13,7 @@ pub struct RecBuffer<'a, T> { /// A buffer that wraps an object with the `Read` trait and allows extracting /// a set of slices to data. Acts as a lower-level primitive for our FASTX /// readers. -impl<'a, 'b, T> RecBuffer<'a, T> -where - T: RecParser<'b>, -{ +impl<'a> RecBuffer<'a> { /// Instantiate a new buffer. /// /// # Panics /// pub fn new( file: &'a mut dyn io::Read, buf_size: usize, header: &[u8], ) -> Result<RecBuffer<'a, T>, ParseError> { - let (buf, last) = fill_buffer(file, header, buf_size)?; - Ok(RecBuffer { - rec_type: PhantomData, - file: Some(file), - last, - buf, - }) - } + ) -> Result<RecBuffer<'a>, ParseError> { + let mut buf = Vec::with_capacity(buf_size + header.len()); + unsafe { + buf.set_len(buf_size + header.len()); + } + buf[..header.len()].copy_from_slice(header); + let amt_read = file.read(&mut buf[header.len()..])?; + unsafe { + buf.set_len(amt_read + header.len()); + } - pub fn new_chunked() -> Result<RecBuffer<'a, T>, ParseError> { Ok(RecBuffer { - rec_type: PhantomData, - file: None, - last: false, - buf: Vec::new(), + file, + last: amt_read == 0, + buf, }) } - pub fn fill(&mut self, data: &[u8], last: bool) -> Result<(), ParseError> { - let mut data = io::Cursor::new(data); - let (buf, _) = fill_buffer(&mut data, &self.buf, self.buf.capacity())?; - self.buf = buf; - self.last = last; - Ok(()) - } - /// Refill the buffer and increase its capacity if it's not big enough. /// Takes a tuple of the bytes used and how many records returned so far. #[inline] pub fn refill(&mut self, used: usize) -> Result<bool, ParseError> { if used == 0 && self.last { return Ok(true); } - let data = &self.buf[used..]; - let (buf, last) = if let Some(f) = &mut self.file { fill_buffer(f, &data, self.buf.capacity())?
- } else { - (data.to_vec(), self.last) - }; - self.buf = buf; - self.last = last; + let remaining = self.buf.len() - used; + if used == 0 { + // double the buffer size (I tried using buf.reserve, but that doesn't work _at all_) + let mut new_buf = Vec::with_capacity(2 * self.buf.capacity()); + unsafe { + new_buf.set_len(new_buf.capacity()); + } + new_buf[..self.buf.len()].copy_from_slice(&self.buf); + self.buf = new_buf; + } else if remaining != 0 { + copy_over(&mut self.buf, used, 0, remaining); + } + let amt_read = self.file.read(&mut self.buf[remaining..])?; + unsafe { + self.buf.set_len(remaining + amt_read); + } + self.last = amt_read == 0; Ok(false) } - - pub fn get_reader(&'b self) -> T { - T::from_buffer(&self.buf, self.last) - } } /// RecParser is an adaptor trait that allows new file format parsers to be diff --git a/src/formats/fasta.rs b/src/formats/fasta.rs index fbe7f37..0a2045b 100644 --- a/src/formats/fasta.rs +++ b/src/formats/fasta.rs @@ -1,3 +1,4 @@ +use std::cmp::min; use std::io::Write; use memchr::memchr; @@ -45,12 +46,17 @@ pub struct FastaParser<'a> { } impl<'a> FastaParser<'a> { - pub fn new(buf: &'a [u8]) -> Self { - FastaParser { - buf, - last: true, - pos: 0, + pub fn new(buf: &'a [u8], last: bool) -> Result<Self, ParseError> { + if buf[0] != b'>' { + let context = String::from_utf8_lossy(&buf[..min(32, buf.len())]); + return Err(ParseError::new( + "FASTA record must start with '>'", + ParseErrorType::InvalidHeader, + ) + .context(context)); } + + Ok(FastaParser { buf, last, pos: 0 }) } } @@ -119,8 +125,6 @@ impl<'a> RecParser<'a> for FastaParser<'a> { } pub fn check_end(buf: &[u8], last: bool) -> Result<(), ParseError> { - use std::cmp::min; - // check if there's anything left in the buffer (besides returns) if !last { return Err( } for c in &buf[..]
{ if c != &b'\r' && c != &b'\n' { - let end = min(16, buf.len()); + let end = min(32, buf.len()); let context = String::from_utf8_lossy(&buf[..end]); return Err(ParseError::new( "File had extra data past end of records", @@ -169,12 +173,12 @@ mod test { assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"GATC"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -196,12 +200,12 @@ assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCTGATCGA"); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TAGC"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -249,12 +253,12 @@ assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCTGATCGA"); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TAGC"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -277,12 +281,12 @@ assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCTTCG"); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"G"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -301,12 +305,12 @@ assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCTTCG"); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"G"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -328,7 +332,7 @@ assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -349,7 +353,7 @@ 0 => { assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"ACGT"); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -373,12 +377,12 @@ assert_eq!(&seq.id[..], b""); assert_eq!(&seq.seq[..], b""); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"shine"); assert_eq!(&seq.seq[..], b"AGGAGGU"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -397,12 +401,12 @@ assert_eq!(&seq.id[..], b""); assert_eq!(&seq.seq[..], b""); assert_eq!(seq.qual, None); - }, + } 1 => { assert_eq!(&seq.id[..], b"shine"); assert_eq!(&seq.seq[..], b"AGGAGGU"); assert_eq!(seq.qual, None); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -414,12 +418,12 @@ #[test] fn test_reader() { - let mut reader = FastaParser::new(b">test\nACGT"); + let mut reader = FastaParser::new(b">test\nACGT", true).unwrap(); let rec = reader.next().unwrap().unwrap(); assert_eq!(rec.id, b"test", "Record has the right ID"); assert_eq!(rec.seq, b"ACGT", "Record has the right sequence"); - let mut reader = FastaParser::new(b">test"); + let mut reader = FastaParser::new(b">test", true).unwrap(); assert!(reader.next().is_none(), "Incomplete record returns None"); } }
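The practical effect of the constructor change above: FastaParser::new now validates the leading '>' up front and returns a typed error (ParseErrorType::InvalidHeader, with the first bytes of the buffer as context) instead of yielding garbage records. A minimal sketch of the behavior, built only from calls shown in this patch (illustrative, not part of the diff):

    use needletail::formats::FastaParser;

    fn main() {
        // a buffer that doesn't start with '>' is now rejected up front
        assert!(FastaParser::new(b"not a fasta", true).is_err());

        // a well-formed buffer iterates as before
        let parser = FastaParser::new(b">id\nACGT\n", true).unwrap();
        for record in parser {
            let rec = record.unwrap();
            assert_eq!(rec.id, b"id");
            assert_eq!(rec.seq, b"ACGT");
        }
    }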
diff --git a/src/formats/fastq.rs b/src/formats/fastq.rs index e6042e8..9eb355a 100644 --- a/src/formats/fastq.rs +++ b/src/formats/fastq.rs @@ -66,27 +66,37 @@ pub struct FastqParser<'a> { pos: usize, } +impl<'a> FastqParser<'a> { + pub fn new(buf: &'a [u8], last: bool) -> Result<Self, ParseError> { + if buf[0] != b'@' { + // sometimes there are extra returns at the end of a file so we shouldn't blow up + if !(last && (buf[0] == b'\r' || buf[0] == b'\n')) { + let context = String::from_utf8_lossy(&buf[..min(32, buf.len())]); + let e = ParseError::new( + "FASTQ record must start with '@'", + ParseErrorType::InvalidHeader, + ) + .context(context); + return Err(e); + } + } + + Ok(FastqParser { buf, last, pos: 0 }) + } +} + impl<'a> Iterator for FastqParser<'a> { type Item = Result<Fastq<'a>, ParseError>; #[inline] fn next(&mut self) -> Option<Self::Item> { - if self.pos >= self.buf.len() { + let buf = &self.buf[self.pos..]; + if buf.is_empty() { return None; } - let buf = &self.buf[self.pos..]; - - if buf[0] != b'@' { - // sometimes there are extra returns at the end of a file so we shouldn't blow up - if buf[0] == b'\r' || buf[0] == b'\n' { - return None; - } else { - let context = String::from_utf8_lossy(&buf[..min(16, buf.len())]); - let e = - ParseError::new("Record must start with '@'", ParseErrorType::InvalidHeader) - .context(context); - return Some(Err(e)); - } + if buf[0] == b'\n' { + // sometimes the last "record" is just newlines + return None; } let id_end; @@ -129,6 +139,20 @@ } let mut qual = &buf[id2_end..qual_end - 1]; + if (qual_end + 1 < buf.len() + && buf[qual_end] != b'@' + && buf[qual_end] != b'\r' + && buf[qual_end] != b'\n') + || (qual_end < buf.len() && buf[qual_end - 1] != b'\n') + { + let context = String::from_utf8_lossy(id); + return Some(Err(ParseError::new( + "Sequence and quality lengths differed", + ParseErrorType::InvalidRecord, + ) + .context(context))); + } + // clean up any extra '\r' from the id and seq if !id.is_empty() && id[id.len() - 1] == b'\r' { id = &id[..id.len() - 1]; @@ -140,6 +164,16 @@ if !qual.is_empty() && qual[qual.len() - 1] == b'\r' { qual = &qual[..qual.len() - 1]; } + if !qual.is_empty() && qual[qual.len() - 1] == b'\n' { + // special case for FASTQs that are a single character short on the + // quality line, but still have a terminal newline + let context = String::from_utf8_lossy(id); + return Some(Err(ParseError::new( + "Quality length was shorter than expected", + ParseErrorType::PrematureEOF, + ) + .context(context))); + } self.pos += buffer_used; Some(Ok(Fastq { id, seq, id2, qual })) @@ -191,12 +225,12 @@ mod test { assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TGCA"); assert_eq!(&seq.qual.unwrap()[..], b"WUI9"); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -215,12 +249,12 @@ assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TGCA"); assert_eq!(&seq.qual.unwrap()[..], b"WUI9"); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -241,6 +275,13 @@ #[test] fn test_premature_endings() { + let test = b"@test\nACGT\n+\nIII\n"; + let mut fp = FastqParser::new(test, true).unwrap(); + let result = fp.next().unwrap(); + assert!(result.is_err()); + let e = result.unwrap_err(); + assert!(e.error_type == ParseErrorType::PrematureEOF); + let mut i = 0; let res = parse_sequences( seq(b"@test\nAGCT\n+test\n~~a!\n@test2\nTGCA"), |_| {}, |seq| { match i { 0 => { assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); - }, + } _ => unreachable!("Too many
records"), } i += 1; @@ -273,7 +314,7 @@ mod test { assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -283,6 +324,9 @@ mod test { assert_eq!(res, Ok(())); // but if there's additional data past the newlines it's an error + // note this is slightly easier to output than the "Sequence and + // quality lengths differed" error because the end of the file may + // normally have multiple newlines let mut i = 0; let res = parse_sequences( seq(b"@test\nAGCT\n+test\n~~a!\n\n@TEST\nA\n+TEST\n~"), @@ -293,7 +337,7 @@ mod test { assert_eq!(&seq.id[..], b"test"); assert_eq!(&seq.seq[..], b"AGCT"); assert_eq!(&seq.qual.unwrap()[..], b"~~a!"); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -319,12 +363,12 @@ mod test { assert_eq!(&seq.id[..], b""); assert_eq!(&seq.seq[..], b""); assert_eq!(&seq.qual.unwrap()[..], b""); - }, + } 1 => { assert_eq!(&seq.id[..], b"test2"); assert_eq!(&seq.seq[..], b"TGCA"); assert_eq!(&seq.qual.unwrap()[..], b"~~~~"); - }, + } _ => unreachable!("Too many records"), } i += 1; @@ -334,15 +378,32 @@ mod test { assert_eq!(i, 2); } + #[test] + fn test_mismatched_lengths() { + let mut fp = FastqParser::new(b"@test\nAGCT\n+\nIII\n@TEST\nA\n+\nI", true).unwrap(); + let result = fp.next().unwrap(); + assert!(result.is_err()); + let e = result.unwrap_err(); + assert!(e.error_type == ParseErrorType::InvalidRecord); + assert!(e.msg == "Sequence and quality lengths differed"); + + let mut fp = FastqParser::new(b"@test\nAGCT\n+\nIIIII\n@TEST\nA\n+\nI", true).unwrap(); + let result = fp.next().unwrap(); + assert!(result.is_err()); + let e = result.unwrap_err(); + assert!(e.error_type == ParseErrorType::InvalidRecord); + assert!(e.msg == "Sequence and quality lengths differed"); + } + #[test] fn test_fastq_across_buffer() { let test_seq = b"@A\nA\n+A\nA\n@B\nA\n+B\n!"; let mut cursor = Cursor::new(test_seq); // the buffer is aligned to the first record - let mut rec_reader = RecBuffer::::new(&mut cursor, 9, b"").unwrap(); + let mut rec_reader = RecBuffer::new(&mut cursor, 9, b"").unwrap(); let used = { - let mut rec_buffer = rec_reader.get_reader(); + let mut rec_buffer = FastqParser::from_buffer(&rec_reader.buf, rec_reader.last); for _s in rec_buffer.by_ref() { // record is incomplete panic!("No initial record should be parsed") @@ -354,7 +415,7 @@ mod test { assert_eq!(rec_reader.refill(used).unwrap(), false); // now we should see both records - let mut rec_buffer = rec_reader.get_reader(); + let mut rec_buffer = FastqParser::from_buffer(&rec_reader.buf, rec_reader.last); // there should be a record assuming the parser // handled the buffer boundary diff --git a/src/formats/mod.rs b/src/formats/mod.rs index 1efa428..9f2b20b 100644 --- a/src/formats/mod.rs +++ b/src/formats/mod.rs @@ -36,8 +36,8 @@ use crate::util::{ParseError, ParseErrorType}; macro_rules! parse_stream { ($reader:expr, $first:expr, $reader_type: ty, $rec: ident, $handler: block) => {{ use $crate::formats::{RecBuffer, RecParser}; - let mut buffer = RecBuffer::<$reader_type>::new($reader, 1_000_000, &$first)?; - let mut rec_reader = buffer.get_reader(); + let mut buffer = RecBuffer::new($reader, 500_000, &$first)?; + let mut rec_reader = <$reader_type>::from_buffer(&buffer.buf, buffer.last); // TODO: do something with the header? let mut record_count: usize = 0; rec_reader.header().map_err(|e| e.record(record_count))?; @@ -45,7 +45,7 @@ macro_rules! 
parse_stream { if !buffer.refill(used).map_err(|e| e.record(record_count))? { loop { let used = { - let mut rec_reader = buffer.get_reader(); + let mut rec_reader = <$reader_type>::from_buffer(&buffer.buf, buffer.last); for s in rec_reader.by_ref() { record_count += 1; let $rec = s.map_err(|e| e.record(record_count))?; @@ -58,7 +58,7 @@ macro_rules! parse_stream { } } } - let rec_reader = buffer.get_reader(); + let rec_reader = <$reader_type>::from_buffer(&buffer.buf, buffer.last); rec_reader.eof().map_err(|e| e.record(record_count + 1))?; }}; } diff --git a/src/kmer.rs b/src/kmer.rs index 48b31cf..567760f 100644 --- a/src/kmer.rs +++ b/src/kmer.rs @@ -192,7 +192,7 @@ impl<'a> Iterator for NuclKmer<'a> { } else { Some((pos, rc_result, true)) } - }, + } } } } diff --git a/src/lib.rs b/src/lib.rs index 23f25a5..146243d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,4 +6,4 @@ pub mod seq; mod util; pub use formats::parse_sequences; -pub use util::ParseError; +pub use util::{ParseError, ParseErrorType}; diff --git a/src/seq.rs b/src/seq.rs index 15af392..7a79e72 100644 --- a/src/seq.rs +++ b/src/seq.rs @@ -77,7 +77,7 @@ fn test_normalize() { assert_eq!(normalize(b"ACGTU", false), Some(b"ACGTT".to_vec())); assert_eq!(normalize(b"acgtu", false), Some(b"ACGTT".to_vec())); - assert_eq!(normalize(b"N.N-N~N N", false), Some(b"N.N.N.N.N".to_vec())); + assert_eq!(normalize(b"N.N-N~N N", false), Some(b"N-N-N-NN".to_vec())); assert_eq!(normalize(b"BDHVRYSWKM", true), None); assert_eq!(normalize(b"bdhvryswkm", true), Some(b"BDHVRYSWKM".to_vec())); @@ -227,15 +227,14 @@ fn test_quality_mask() { qual: Some(b"AAA0"[..].into()), rev_seq: None, }; - let filtered_rec = seq_rec.quality_mask('5' as u8); + let filtered_rec = seq_rec.quality_mask(b'5'); assert_eq!(&filtered_rec.seq[..], &b"AGCN"[..]); } #[test] fn can_kmerize() { // test general function - let mut i = 0; - for (_, k, _) in Sequence::from_bytes(b"AGCT").kmers(1, false) { + for (i, (_, k, _)) in Sequence::from_bytes(b"AGCT").kmers(1, false).enumerate() { match i { 0 => assert_eq!(k, &b"A"[..]), 1 => assert_eq!(k, &b"G"[..]), @@ -243,31 +242,26 @@ fn can_kmerize() { 3 => assert_eq!(k, &b"T"[..]), _ => unreachable!("Too many kmers"), } - i += 1; } // test that we skip over N's - i = 0; - for (_, k, _) in Sequence::from_bytes(b"ACNGT").kmers(2, false) { + for (i, (_, k, _)) in Sequence::from_bytes(b"ACNGT").kmers(2, false).enumerate() { match i { 0 => assert_eq!(k, &b"AC"[..]), 1 => assert_eq!(k, &b"GT"[..]), _ => unreachable!("Too many kmers"), } - i += 1; } // test that we skip over N's and handle short kmers - i = 0; - for (ix, k, _) in Sequence::from_bytes(b"ACNG").kmers(2, false) { + for (i, (ix, k, _)) in Sequence::from_bytes(b"ACNG").kmers(2, false).enumerate() { match i { 0 => { assert_eq!(ix, 0); assert_eq!(k, &b"AC"[..]); - }, + } _ => unreachable!("Too many kmers"), } - i += 1; } // test that the minimum length works @@ -279,32 +273,29 @@ fn can_kmerize() { #[test] fn can_canonicalize() { // test general function - let mut i = 0; - for (_, k, is_c) in Sequence::from_bytes(b"AGCT").kmers(1, true) { + for (i, (_, k, is_c)) in Sequence::from_bytes(b"AGCT").kmers(1, true).enumerate() { match i { 0 => { assert_eq!(k, &b"A"[..]); assert_eq!(is_c, false); - }, + } 1 => { assert_eq!(k, &b"C"[..]); assert_eq!(is_c, true); - }, + } 2 => { assert_eq!(k, &b"C"[..]); assert_eq!(is_c, false); - }, + } 3 => { assert_eq!(k, &b"A"[..]); assert_eq!(is_c, true); - }, + } _ => unreachable!("Too many kmers"), } - i += 1; } - let mut i = 0; - for (_, k, _) in 
Sequence::from_bytes(b"AGCTA").kmers(2, true).enumerate() { match i { 0 => assert_eq!(k, &b"AG"[..]), 1 => assert_eq!(k, &b"GC"[..]), 2 => assert_eq!(k, &b"GC"[..]), 3 => assert_eq!(k, &b"TA"[..]), _ => unreachable!("Too many kmers"), } - i += 1; } - let mut i = 0; - for (ix, k, _) in Sequence::from_bytes(b"AGNTA").kmers(2, true) { + for (i, (ix, k, _)) in Sequence::from_bytes(b"AGNTA").kmers(2, true).enumerate() { match i { 0 => { assert_eq!(ix, 0); assert_eq!(k, &b"AG"[..]); - }, + } 1 => { assert_eq!(ix, 3); assert_eq!(k, &b"TA"[..]); - }, + } _ => unreachable!("Too many kmers"), } - i += 1; } } diff --git a/src/util.rs b/src/util.rs index f7e7c9a..78267ee 100644 --- a/src/util.rs +++ b/src/util.rs @@ -98,11 +98,11 @@ pub fn strip_whitespace(seq: &[u8]) -> Cow<[u8]> { None => { new_buf.extend_from_slice(&seq[i..]); break; - }, + } Some(match_pos) => { new_buf.extend_from_slice(&seq[i..i + match_pos]); i += match_pos + 1; - }, + } } } Cow::Owned(new_buf) diff --git a/tests/format_specimens.rs b/tests/format_specimens.rs new file mode 100644 index 0000000..25aabdb --- /dev/null +++ b/tests/format_specimens.rs @@ -0,0 +1,127 @@ +use std::io::Read; + +use needletail::formats::{FastaParser, FastqParser, RecParser}; +use needletail::{ParseError, ParseErrorType}; +use reqwest::get; +use serde_derive::Deserialize; +use toml; + +#[derive(Deserialize)] +struct TestCase { + filename: String, + // origin: String, + tags: Option<Vec<String>>, + // comments: Option<Vec<String>>, +} + +#[derive(Deserialize)] +struct TestIndex { + valid: Vec<TestCase>, + invalid: Option<Vec<TestCase>>, +} + +fn test_fasta_file(reader: &mut dyn Read, filename: &str) -> Result<(), ParseError> { + let mut data: Vec<u8> = Vec::new(); + let _ = reader.read_to_end(&mut data)?; + + let parser = FastaParser::new(&data, true) + .unwrap_or_else(|_| panic!("Can not open test data: {}", filename)); + let record_number = 0; + for record in parser { + let _ = record.map_err(|e| e.record(record_number))?; + } + Ok(()) +} + +#[test] +#[ignore] +fn test_specimen_fasta() { + let base_path = "https://raw.githubusercontent.com/BioJulia/FormatSpecimens.jl/master/FASTA"; + let idx_path = format!("{}/index.toml", base_path); + let raw_index = get(&idx_path) + .expect("Could not retrieve index") + .text() + .expect("Could not decode index"); + + let index: TestIndex = toml::from_str(&raw_index).expect("Could not deserialize index"); + for test in index.valid { + // what kind of sicko puts comments in FASTAs? + if test + .tags + .unwrap_or_else(Vec::new) + .contains(&String::from("comments")) + { + continue; + } + + let test_path = format!("{}/{}", base_path, test.filename); + let mut test_reader = get(&test_path).expect("Could not retrieve test data"); + assert_eq!(test_fasta_file(&mut test_reader, &test.filename), Ok(())); + } +} + +fn test_fastq_file(reader: &mut dyn Read, filename: &str) -> Result<(), ParseError> { + let mut data: Vec<u8> = Vec::new(); + let _ = reader.read_to_end(&mut data)?; + + let mut parser = FastqParser::new(&data, true) + .unwrap_or_else(|_| panic!("Can not open test data: {}", filename)); + let record_number = 0; + for record in parser.by_ref() { + let rec = record.map_err(|e| e.record(record_number))?; + if !rec + .qual + .iter() + .all(|c| (*c >= b'!'
&& *c <= b'~') || *c == b'\n') + { + return Err(ParseError::new( + "FASTQ has bad quality scores", + ParseErrorType::Invalid, + )); + } + } + parser.eof()?; + Ok(()) +} + +#[test] +#[ignore] +fn test_specimen_fastq() { + let base_path = "https://raw.githubusercontent.com/BioJulia/FormatSpecimens.jl/master/FASTQ/"; + let idx_path = format!("{}/index.toml", base_path); + let raw_index = get(&idx_path) + .expect("Could not retrieve index") + .text() + .expect("Could not decode index"); + + let index: TestIndex = toml::from_str(&raw_index).expect("Could not deserialize index"); + + for test in index.valid { + if test.filename == "wrapping_original_sanger.fastq" { + // may god have mercy upon us if someone ever tries a file like this + // (sequences are one-line, but quality scores are line-wrapped) + continue; + } + let test_path = format!("{}/{}", base_path, test.filename); + let mut test_reader = get(&test_path).expect("Could not retrieve test data"); + assert_eq!( + test_fastq_file(&mut test_reader, &test.filename), + Ok(()), + "File {} is bad?", + test.filename + ); + } + + for test in index.invalid.unwrap_or_else(Vec::new) { + if test.filename == "error_diff_ids.fastq" { + // we don't care if the sequence ID doesn't match the quality id? + continue; + } + let test_path = format!("{}/{}", base_path, test.filename); + let mut test_reader = get(&test_path).expect("Could not retrieve test data"); + assert!( + test_fastq_file(&mut test_reader, &test.filename).is_err(), + format!("File {} is good?", test.filename) + ); + } +}
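To tie the FASTQ changes together, a minimal end-to-end sketch built from the same calls the specimen tests above use — FastqParser::new, the record iterator, and the RecParser eof() check. Illustrative only, not part of the diff:

    use needletail::formats::{FastqParser, RecParser};

    fn main() {
        let data = b"@read1\nACGT\n+\nIIII\n";
        // `true` marks the buffer as the entire input rather than one chunk
        let mut parser = FastqParser::new(data, true).unwrap();
        let mut n_bases = 0;
        for record in parser.by_ref() {
            n_bases += record.unwrap().seq.len();
        }
        // eof() errors if anything besides trailing newlines remains unparsed
        parser.eof().unwrap();
        assert_eq!(n_bases, 4);

        // a quality line shorter than the sequence is now a hard error
        assert!(FastqParser::new(b"@read1\nACGT\n+\nIII\n", true)
            .unwrap()
            .next()
            .unwrap()
            .is_err());
    }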