From 7b35ce8cbcf96d07649bf38a2db097ea12546f9b Mon Sep 17 00:00:00 2001 From: Roderick Bovee Date: Wed, 4 Sep 2019 10:33:33 -0700 Subject: [PATCH] Use criterion and otherwise cleanup/simplify benchmarking --- Cargo.toml | 6 +- bench/benchmark.c | 27 --- bench/benchmark.py | 52 ------ bench/benchmark.rs | 20 --- bench/benchmark_biopython.py | 9 - benches/benchmark.rs | 340 ++++++++++++++++++++++------------- src/formats/buffer.rs | 63 +------ src/formats/fasta.rs | 24 ++- src/formats/fastq.rs | 28 ++- src/formats/mod.rs | 3 +- src/util.rs | 34 ++++ 11 files changed, 282 insertions(+), 324 deletions(-) delete mode 100644 bench/benchmark.c delete mode 100755 bench/benchmark.py delete mode 100644 bench/benchmark.rs delete mode 100644 bench/benchmark_biopython.py diff --git a/Cargo.toml b/Cargo.toml index 8f776c2..8871409 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,10 +18,12 @@ compression = ["bzip2", "flate2", "xz2"] flate2 = { version="1.0.6", optional=true } bzip2 = { version="0.3.3", optional=true } xz2 = { version="0.1.6", optional=true } -memchr = "2.2.0" +memchr = "2.2.1" [dev-dependencies] -bencher = "0.1.5" +criterion = "0.3" +bio = "0.28" +seq_io = "0.3" [[bench]] name = "benchmark" diff --git a/bench/benchmark.c b/bench/benchmark.c deleted file mode 100644 index 8618ddf..0000000 --- a/bench/benchmark.c +++ /dev/null @@ -1,27 +0,0 @@ -// using https://github.com/lh3/readfq for parsing -// derived in part from http://stackoverflow.com/questions/19390245/how-to-parse-a-fasta-file-using-kseq-h -#include -#include -#include -#include "./readfq/kseq.h" -KSEQ_INIT(FILE*, read) - -int main(int argc, char* argv[]) { - FILE* fp = fopen(argv[1], "r"); - if (fp == 0) { - perror("fopen"); - exit(1); - } - kseq_t *seq = kseq_init(fileno(fp)); - - unsigned long slen = 0; - while (kseq_read(seq) >= 0) { - slen += seq->seq.l; - } - - printf("%lu\n", slen); - - kseq_destroy(seq); - fclose(fp); - return (0); -} diff --git a/bench/benchmark.py b/bench/benchmark.py deleted file mode 100755 index ada274f..0000000 --- a/bench/benchmark.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python -""" -Unscientific benchmarking of this versus the --release rust -implementation below using the %timeit Ipython magic (times in sec) - - n_kmers, py_runtime, rust_runtime - 6594204, 14.4, 0.578 - -Both give identical counts on the files tested (and printing kmers out -and diff'ing the results gives no difference) -""" -from __future__ import print_function -import sys - -from Bio.SeqIO import parse -from Bio.Seq import reverse_complement - - -def slid_win(seq, size=4, overlapping=True): - """Returns a sliding window along self.seq.""" - itr = iter(seq) - if overlapping: - buf = '' - for _ in range(size): - buf += next(itr) - for l in itr: - yield buf - buf = buf[1:] + l - yield buf - else: - buf = '' - for l in itr: - buf += l - if len(buf) == size: - yield buf - buf = '' - -filename = sys.argv[1] - -n_total = 0 -n_canonical = 0 - -for s in parse(filename, 'fastq'): - uppercase_seq = str(s.upper().seq) - for kmer in slid_win(uppercase_seq, 4): - canonical = min(kmer, reverse_complement(kmer)) - if canonical == 'CAGC': - n_canonical += 1 - n_total += 1 - # print(canonical) - -print(n_total, n_canonical) diff --git a/bench/benchmark.rs b/bench/benchmark.rs deleted file mode 100644 index 699d45c..0000000 --- a/bench/benchmark.rs +++ /dev/null @@ -1,20 +0,0 @@ -extern crate needletail; - -use std::env; -use needletail::{fastx, kmer}; - - -fn count_bases(filename: String) { - let mut n_bases = 0; - fastx::fastx_cli(&filename[..], |_| (), |seq| { - n_bases += seq.seq.len(); - }).unwrap(); - println!("{:?}", n_bases); -} - - -fn main() { - let filename: String = env::args().nth(1).unwrap(); - - count_bases(filename); -} diff --git a/bench/benchmark_biopython.py b/bench/benchmark_biopython.py deleted file mode 100644 index 82d8c2f..0000000 --- a/bench/benchmark_biopython.py +++ /dev/null @@ -1,9 +0,0 @@ -from Bio import SeqIO -import sys - -total = 0 -with open(sys.argv[1], "rU") as handle: - for record in SeqIO.parse(handle, "fastq"): - total += len(record.seq) - -print(total) diff --git a/benches/benchmark.rs b/benches/benchmark.rs index 0ce6861..d8b5223 100644 --- a/benches/benchmark.rs +++ b/benches/benchmark.rs @@ -1,8 +1,8 @@ #[macro_use] -extern crate bencher; +extern crate criterion; extern crate needletail; -use bencher::Bencher; +use criterion::Criterion; use needletail::parse_sequences; use std::fs::File; use std::io::{Cursor, Read}; @@ -10,143 +10,241 @@ use std::io::{Cursor, Read}; // from Bio.SeqIO import parse // n_total = sum([len([k for k in slid_win(i.seq, 31) if set(k).issubset({'A', 'C', 'G', 'T'})]) for i in SeqIO.parse('./tests/data/28S.fasta', 'fasta')]) -fn bench_kmer_speed(bench: &mut Bencher) { - let filename = "tests/data/28S.fasta"; +fn bench_kmer_speed(c: &mut Criterion) { let ksize = 31; - bench.iter(|| { - let mut n_total = 0; - let mut n_canonical = 0; - let file = File::open(filename).unwrap(); - parse_sequences( - file, - |_| {}, - |seq| { - for (_, _kmer, was_rc) in seq.normalize(true).kmers(ksize, true) { - if !was_rc { - n_canonical += 1; - } - n_total += 1; - } - }, - ) - .unwrap(); - assert_eq!(718007, n_total); - assert_eq!(350983, n_canonical); - }) -} - -fn bench_bitkmer_speed(bench: &mut Bencher) { - let filename = "tests/data/28S.fasta"; - let ksize = 31; + let mut data: Vec = vec![]; + let mut f = File::open("tests/data/28S.fasta").unwrap(); + let _ = f.read_to_end(&mut data); - bench.iter(|| { - let mut n_total = 0; - let mut n_canonical = 0; - let file = File::open(filename).unwrap(); - parse_sequences( - file, - |_| {}, - |seq| { - for (_, _kmer, was_rc) in seq.bit_kmers(ksize, true) { - if !was_rc { - n_canonical += 1; + let mut group = c.benchmark_group("Kmerizing"); + group.sample_size(10); + + group.bench_function("Kmer", |b| { + b.iter(|| { + let mut n_total = 0; + let mut n_canonical = 0; + let fasta_data = Cursor::new(data.clone()); + parse_sequences( + fasta_data, + |_| {}, + |seq| { + for (_, _kmer, was_rc) in seq.normalize(true).kmers(ksize, true) { + if !was_rc { + n_canonical += 1; + } + n_total += 1; } - n_total += 1; - } - }, - ) - .unwrap(); - assert_eq!(718007, n_total); - assert_eq!(350983, n_canonical); - }) + }, + ) + .unwrap(); + assert_eq!(718007, n_total); + assert_eq!(350983, n_canonical); + }); + }); + + group.bench_function("Bitkmer", |bench| { + bench.iter(|| { + let mut n_total = 0; + let mut n_canonical = 0; + let fasta_data = Cursor::new(data.clone()); + parse_sequences( + fasta_data, + |_| {}, + |seq| { + for (_, _kmer, was_rc) in seq.bit_kmers(ksize, true) { + if !was_rc { + n_canonical += 1; + } + n_total += 1; + } + }, + ) + .unwrap(); + assert_eq!(718007, n_total); + assert_eq!(350983, n_canonical); + }); + }); } -benchmark_group!(kmers, bench_kmer_speed, bench_bitkmer_speed); +criterion_group!(kmers, bench_kmer_speed); + +fn bench_fastq_file(c: &mut Criterion) { + use bio::io::fastq as bio_fastq; + use seq_io::fastq as seq_fastq; + use seq_io::fastq::Record; -fn bench_fastq_bytes(bench: &mut Bencher) { let mut data: Vec = vec![]; let mut f = File::open("tests/data/PRJNA271013_head.fq").unwrap(); let _ = f.read_to_end(&mut data); - bench.iter(|| { - let mut n_bases = 0; - parse_sequences( - Cursor::new(&data), - |_| {}, - |seq| { - n_bases += seq.seq.len(); - }, - ) - .unwrap(); - assert_eq!(250000, n_bases); - }) -} - -fn bench_fastq_file(bench: &mut Bencher) { - let filename = "tests/data/PRJNA271013_head.fq"; - - // warming up the cache doesn't seem to make the timings more repeatable? - // fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap(); - bench.iter(|| { - let mut n_bases = 0; - parse_sequences( - File::open(filename).unwrap(), - |_| {}, - |seq| { - n_bases += seq.seq.len(); - }, - ) - .unwrap(); - assert_eq!(250000, n_bases); - }) + let mut group = c.benchmark_group("FASTQ parsing"); + + group.bench_function("RustBio", |bench| { + bench.iter(|| { + let fastq_data = Cursor::new(data.clone()); + let reader = bio_fastq::Reader::new(fastq_data); + let mut n_bases = 0; + for record in reader.records() { + let record = record.unwrap(); + n_bases += record.seq().len() + } + assert_eq!(250000, n_bases); + }); + }); + + group.bench_function("SeqIO", |bench| { + bench.iter(|| { + let fastq_data = Cursor::new(data.clone()); + let mut reader = seq_fastq::Reader::new(fastq_data); + let mut n_bases = 0; + while let Some(result) = reader.next() { + let record = result.unwrap(); + let seqlen = record.seq().len(); + n_bases += seqlen; + } + assert_eq!(250000, n_bases); + }); + }); + + group.bench_function("Needletail", |bench| { + bench.iter(|| { + let fastq_data = Cursor::new(data.clone()); + let mut n_bases = 0; + parse_sequences( + fastq_data, + |_| {}, + |seq| { + n_bases += seq.seq.len(); + }, + ) + .unwrap(); + assert_eq!(250000, n_bases); + }); + }); + + group.bench_function("Needletail (Macro)", |bench| { + use needletail::formats::FastqReader; + use needletail::{parse_stream, ParseError}; + #[inline] + fn get_n_bases(mut fastq_data: &mut dyn Read) -> Result { + let mut n_bases = 0; + parse_stream!(&mut fastq_data, &b""[..], FastqReader, rec, { + n_bases += rec.seq.len(); + }); + Ok(n_bases) + } + + bench.iter(|| { + let mut fastq_data = Cursor::new(data.clone()); + let n_bases = get_n_bases(&mut fastq_data).unwrap(); + assert_eq!(250000, n_bases); + }); + }); + + group.bench_function("Needletail (No Buffer)", |bench| { + use needletail::formats::{FastqReader, RecReader}; + bench.iter(|| { + let mut reader = FastqReader::from_buffer(&data, true); + let mut n_bases = 0; + for seq in reader.by_ref() { + n_bases += seq.unwrap().seq.len(); + } + assert_eq!(250000, n_bases); + }); + }); } -fn bench_fasta_bytes(bench: &mut Bencher) { - let filename = String::from("tests/data/28S.fasta"); +fn bench_fasta_file(c: &mut Criterion) { + use bio::io::fasta as bio_fasta; + use seq_io::fasta as seq_fasta; let mut data: Vec = vec![]; - let mut f = File::open(filename).unwrap(); + let mut f = File::open("tests/data/28S.fasta").unwrap(); let _ = f.read_to_end(&mut data); - bench.iter(|| { - let mut n_bases = 0; - parse_sequences( - Cursor::new(&data), - |_| {}, - |seq| { + let mut group = c.benchmark_group("FASTA parsing"); + + group.bench_function("RustBio", |bench| { + bench.iter(|| { + let fasta_data = Cursor::new(data.clone()); + let reader = bio_fasta::Reader::new(fasta_data); + let mut n_bases = 0; + for record in reader.records() { + let record = record.unwrap(); + n_bases += record.seq().len() + } + assert_eq!(738580, n_bases); + }); + }); + + group.bench_function("SeqIO", |bench| { + bench.iter(|| { + let fasta_data = Cursor::new(data.clone()); + let mut reader = seq_fasta::Reader::new(fasta_data); + let mut n_bases = 0; + while let Some(result) = reader.next() { + let record = result.unwrap(); + for s in record.seq_lines() { + n_bases += s.len(); + } + } + assert_eq!(738580, n_bases); + }); + }); + + group.bench_function("Needletail", |bench| { + bench.iter(|| { + let fasta_data = Cursor::new(data.clone()); + let mut n_bases = 0; + parse_sequences( + fasta_data, + |_| {}, + |seq| { + n_bases += seq.seq.len(); + }, + ) + .unwrap(); + assert_eq!(738580, n_bases); + }); + }); + + group.bench_function("Needletail (Macro)", |bench| { + use needletail::formats::FastaReader; + use needletail::seq::Sequence; + use needletail::{parse_stream, ParseError}; + #[inline] + fn get_n_bases(mut fasta_data: &mut dyn Read) -> Result { + let mut n_bases = 0; + parse_stream!(&mut fasta_data, &b""[..], FastaReader, rec, { + let seq = Sequence::from(rec); n_bases += seq.seq.len(); - }, - ) - .unwrap(); - assert_eq!(738580, n_bases); - }) -} - -fn bench_fasta_file(bench: &mut Bencher) { - let filename = "tests/data/28S.fasta"; - - // warming up the cache doesn't seem to make the timings more repeatable? - // fastx::fastx_file(&filename[..], |seq| { assert!(seq.1.len() > 0) }).unwrap(); - bench.iter(|| { - let mut n_bases = 0; - parse_sequences( - File::open(filename).unwrap(), - |_| {}, - |seq| { + }); + Ok(n_bases) + } + + bench.iter(|| { + let mut fasta_data = Cursor::new(data.clone()); + let n_bases = get_n_bases(&mut fasta_data).unwrap(); + assert_eq!(738580, n_bases); + }); + }); + + group.bench_function("Needletail (No Buffer)", |bench| { + use needletail::seq::Sequence; + use needletail::formats::{FastaReader, RecReader}; + bench.iter(|| { + let mut reader = FastaReader::from_buffer(&data, true); + let mut n_bases = 0; + for rec in reader.by_ref() { + let seq = Sequence::from(rec.unwrap()); n_bases += seq.seq.len(); - }, - ) - .unwrap(); - assert_eq!(738580, n_bases); - }) + } + assert_eq!(738580, n_bases); + }); + }); } -benchmark_group!( - fastx, - bench_fastq_bytes, - bench_fastq_file, - bench_fasta_bytes, - bench_fasta_file -); -benchmark_main!(kmers, fastx); +criterion_group!(io, bench_fasta_file, bench_fastq_file); + +criterion_main!(kmers, io); diff --git a/src/formats/buffer.rs b/src/formats/buffer.rs index 19705cc..280a3ed 100644 --- a/src/formats/buffer.rs +++ b/src/formats/buffer.rs @@ -91,7 +91,7 @@ where } pub fn get_reader(&'b self) -> T { - T::from_buffer(self) + T::from_buffer(&self.buf, self.last) } } @@ -104,67 +104,8 @@ where pub trait RecReader<'s>: Sized + Iterator { type Header; - fn from_buffer(reader: &'s RecBuffer) -> Self; + fn from_buffer(buf: &'s [u8], last: bool) -> Self; fn header(&mut self) -> Result; fn eof(&self) -> Result<(), ParseError>; fn used(&self) -> usize; } - -// #[derive(Debug)] -// pub struct RecBuffer<'a, T> { -// record_type: PhantomData, -// pub buf: &'a [u8], -// pub pos: usize, -// pub last: bool, -// pub count: usize, -// } -// -// impl<'a, T> RecBuffer<'a, T> { -// pub fn from_bytes(data: &'a [u8]) -> Self { -// RecBuffer { -// buf: data, -// pos: 0, -// last: true, -// record_type: PhantomData, -// count: 0, -// } -// } -// -// pub fn used(&self) -> (usize, usize) { -// (self.pos, self.count) -// } -// } -// -// #[test] -// fn test_from_bytes() { -// // this is not a useful test, but it does get the compiler to shut up -// // about `from_bytes` not being used -// let rb: RecBuffer = RecBuffer::from_bytes(b"test"); -// assert_eq!(rb.pos, 0); -// assert_eq!(rb.buf, b"test"); -// } - -// pub fn parse(reader: &'s mut io::Read, header: &[u8], ref mut callback: F) -> Result<(), E> where -// E: From, -// F: FnMut(T) -> Result<(), E>, -// for<'s> RecBuffer<'s, T>: Iterator>, -// { -// let mut rec_reader = RecReader::new(reader, 10_000_000, header)?; -// loop { -// let used = { -// let mut rec_buffer = rec_reader.get_buffer(); -// for s in rec_buffer.by_ref() { -// callback(s?)?; -// } -// rec_buffer.pos -// }; -// if rec_reader.refill(used)? { -// break; -// } -// } -// if rec_reader.get_buffer::().is_finished(true) { -// Ok(()) -// } else { -// Err(ParseError::PrematureEOF.into()) -// } -// } diff --git a/src/formats/fasta.rs b/src/formats/fasta.rs index 7c323b0..6f0251c 100644 --- a/src/formats/fasta.rs +++ b/src/formats/fasta.rs @@ -2,9 +2,9 @@ use std::io::Write; use memchr::memchr; -use crate::formats::buffer::{RecBuffer, RecReader}; +use crate::formats::buffer::RecReader; use crate::seq::Sequence; -use crate::util::{memchr_both, strip_whitespace, ParseError, ParseErrorType}; +use crate::util::{memchr_both_last, strip_whitespace, ParseError, ParseErrorType}; #[derive(Debug)] pub struct Fasta<'a> { @@ -17,11 +17,11 @@ impl<'a> Fasta<'a> { where W: Write, { - writer.write(b">")?; - writer.write(&self.id)?; - writer.write(b"\n")?; - writer.write(&self.seq)?; - writer.write(b"\n")?; + writer.write_all(b">")?; + writer.write_all(&self.id)?; + writer.write_all(b"\n")?; + writer.write_all(&self.seq)?; + writer.write_all(b"\n")?; Ok(()) } } @@ -78,7 +78,7 @@ impl<'a> Iterator for FastaReader<'a> { } let seq_end; - match (memchr_both(b'\n', b'>', &buf[id_end..]), self.last) { + match (memchr_both_last(b'\n', b'>', &buf[id_end..]), self.last) { (Some(i), _) => seq_end = id_end + i + 1, (None, true) => seq_end = buf.len(), (None, false) => return None, @@ -104,12 +104,8 @@ impl<'a> Iterator for FastaReader<'a> { impl<'a> RecReader<'a> for FastaReader<'a> { type Header = (); - fn from_buffer<'s>(reader: &'s RecBuffer) -> FastaReader<'s> { - FastaReader { - buf: &reader.buf, - last: reader.last, - pos: 0, - } + fn from_buffer(buf: &[u8], last: bool) -> FastaReader { + FastaReader { buf, last, pos: 0 } } fn header(&mut self) -> Result { diff --git a/src/formats/fastq.rs b/src/formats/fastq.rs index ceb22f5..b3b8b58 100644 --- a/src/formats/fastq.rs +++ b/src/formats/fastq.rs @@ -4,7 +4,7 @@ use std::io::Write; use memchr::memchr; -use crate::formats::buffer::{RecBuffer, RecReader}; +use crate::formats::buffer::RecReader; use crate::formats::fasta::check_end; use crate::seq::Sequence; use crate::util::{memchr_both, ParseError, ParseErrorType}; @@ -22,17 +22,17 @@ impl<'a> Fastq<'a> { where W: Write, { - writer.write(b"@")?; - writer.write(&self.id)?; - writer.write(b"\n")?; - writer.write(&self.seq)?; - writer.write(b"+\n")?; + writer.write_all(b"@")?; + writer.write_all(&self.id)?; + writer.write_all(b"\n")?; + writer.write_all(&self.seq)?; + writer.write_all(b"+\n")?; if self.seq.len() != self.qual.len() { - writer.write(&vec![b'I'; self.seq.len()])?; + writer.write_all(&vec![b'I'; self.seq.len()])?; } else { - writer.write(&self.qual)?; + writer.write_all(&self.qual)?; } - writer.write(b"\n")?; + writer.write_all(b"\n")?; Ok(()) } } @@ -58,7 +58,7 @@ impl<'a> From<&'a Sequence<'a>> for Fastq<'a> { id: &seq.id, seq: &seq.seq, id2: b"", - qual: qual, + qual, } } } @@ -152,12 +152,8 @@ impl<'a> Iterator for FastqReader<'a> { impl<'a> RecReader<'a> for FastqReader<'a> { type Header = (); - fn from_buffer<'s>(reader: &'s RecBuffer) -> FastqReader<'s> { - FastqReader { - buf: &reader.buf, - last: reader.last, - pos: 0, - } + fn from_buffer(buf: &[u8], last: bool) -> FastqReader { + FastqReader { buf, last, pos: 0 } } fn header(&mut self) -> Result { diff --git a/src/formats/mod.rs b/src/formats/mod.rs index d4c2227..4e347d6 100644 --- a/src/formats/mod.rs +++ b/src/formats/mod.rs @@ -32,12 +32,11 @@ pub use crate::formats::fastq::{Fastq, FastqReader}; use crate::seq::Sequence; use crate::util::{ParseError, ParseErrorType}; - #[macro_export] macro_rules! parse_stream { ($reader:expr, $first:expr, $reader_type: ty, $rec: ident, $handler: block) => {{ use $crate::formats::{RecBuffer, RecReader}; - let mut buffer = RecBuffer::<$reader_type>::new($reader, 10_000_000, &$first)?; + let mut buffer = RecBuffer::<$reader_type>::new($reader, 1_000_000, &$first)?; let mut rec_reader = buffer.get_reader(); // TODO: do something with the header? let mut record_count: usize = 0; diff --git a/src/util.rs b/src/util.rs index 5608c54..97b3f26 100644 --- a/src/util.rs +++ b/src/util.rs @@ -142,3 +142,37 @@ fn test_memchr_both() { let pos = memchr_both(b'\n', b'-', &b"te\nst\n-this"[..]); assert_eq!(pos, Some(5)); } + +#[inline] +pub fn memchr_both_last(b1: u8, b2: u8, seq: &[u8]) -> Option { + // TODO: b2 is going to be much rarer for us in FASTAs, so we should search for that instead + // (but b1 is going to be the rarer character in FASTQs so this is optimized for that and we + // should allow a choice between the two) + let mut pos = 0; + loop { + match memchr(b2, &seq[pos..]) { + None => return None, + Some(match_pos) => { + if match_pos == 0 { + pos += 1; + } else if seq[pos + match_pos - 1] == b1 { + return Some(pos + match_pos - 1); + } else { + pos += match_pos + 1; + } + }, + } + } +} + +#[test] +fn test_memchr_both_last() { + let pos = memchr_both_last(b'\n', b'-', &b"test\n-this"[..]); + assert_eq!(pos, Some(4)); + + let pos = memchr_both_last(b'\n', b'-', &b"te\nst\n-this"[..]); + assert_eq!(pos, Some(5)); + + let pos = memchr_both_last(b'\n', b'-', &b"-te\nst\n-this"[..]); + assert_eq!(pos, Some(6)); +}