From 253ff621d828b785be0be73461aa278d18acc3eb Mon Sep 17 00:00:00 2001 From: sharkLoc Date: Tue, 21 May 2024 09:31:10 +0800 Subject: [PATCH 1/2] add gzip/xz/bzip2 support for input --- Cargo.lock | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ src/main.rs | 45 ++++++---------------- src/utils.rs | 95 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 215 insertions(+), 34 deletions(-) create mode 100644 src/utils.rs diff --git a/Cargo.lock b/Cargo.lock index 7dcb82b..3e8f1fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -80,6 +80,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] + [[package]] name = "autocfg" version = "1.2.0" @@ -190,6 +201,27 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "cc" version = "1.0.94" @@ -207,11 +239,14 @@ name = "chopper" version = "0.8.0" dependencies = [ "approx", + "atty", "bio", + "bzip2", "clap", "flate2", "minimap2", "rayon", + "xz2", ] [[package]] @@ -254,6 +289,15 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +[[package]] +name = "cmake" +version = "0.1.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -389,6 +433,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", + "libz-ng-sys", "miniz_oxide", ] @@ -430,6 +475,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + [[package]] name = "indexmap" version = "2.2.6" @@ -482,6 +536,16 @@ version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +[[package]] +name = "libz-ng-sys" +version = "1.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6409efc61b12687963e602df8ecf70e8ddacf95bc6576bcf16e3ac6328083c5" +dependencies = [ + "cmake", + "libc", +] + [[package]] name = "libz-sys" version = "1.1.16" @@ -494,6 +558,17 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "matrixmultiply" version = "0.3.8" @@ -1012,6 +1087,28 @@ dependencies = [ "safe_arch", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + [[package]] name = "windows-sys" version = "0.52.0" @@ -1084,3 +1181,12 @@ name = "windows_x86_64_msvc" version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" + +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] diff --git a/Cargo.toml b/Cargo.toml index c37403f..f268dce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,3 +13,6 @@ rayon = "1.7.0" approx = "0.5.1" minimap2 = "0.1.17+minimap2.2.27" flate2 = { version = "1.0.17", features = ["zlib-ng"], default-features = false } +xz2 = "0.1.7" +bzip2 = "0.4.4" +atty = "0.2.14" diff --git a/src/main.rs b/src/main.rs index 24a29e8..cee9c7c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,12 +3,14 @@ use bio::io::fastq; use clap::Parser; use minimap2::*; use rayon::prelude::*; -use std::io::{self, Read, BufReader}; -use std::path::{PathBuf, Path}; +use std::io::Read; +use std::error::Error; +use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; -use std::fs::File; -use flate2::read::GzDecoder; + +mod utils; +use utils::file_reader; // The arguments end up in the Cli struct #[derive(Parser, Debug)] @@ -75,42 +77,17 @@ fn is_file(pathname: &str) -> Result<(), String> { } -fn main() { +fn main() -> Result<(), Box>{ let args = Cli::parse(); rayon::ThreadPoolBuilder::new() .num_threads(args.threads) .build_global() .expect("Error: Unable to build threadpool"); - match args.input { - // Process file if --input exist - Some(ref infile) => { - let path = Path::new(infile); - // make sure the input file is valid, if not shut down gracefully with an error message - if !path.exists() { - eprintln!("ERROR: Input file {} does not exist", infile); - std::process::exit(1); - } - if path.extension().and_then(|s| s.to_str()) == Some("gz") { - // deal with gz compressed file - let gzfile = File::open(&path).expect("Error: Unable to open gzipped file"); - let buf_reader = BufReader::with_capacity(512*1024, gzfile); - let mut decoder = GzDecoder::new(buf_reader); - filter(&mut decoder, args); - - } - else { - // deal with uncompressed fastq file - let mut input_file = File::open(infile).expect("Error: Unable to open fastq file"); - filter(&mut input_file, args); - } - } - - None => { - filter(&mut io::stdin(), args); - } - - } + let mut reader = file_reader(args.input.as_ref())?; + filter(&mut reader, args); + + Ok(()) } /// This function filters fastq on stdin based on quality, maxlength and minlength diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..5a90284 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,95 @@ +use std::{ + error::Error, + fs::File, + io::{self, prelude::*, BufRead, BufReader}, + path::Path, +}; + +const MAGIC_MAX_LEN: usize = 6; +const GZ_MAGIC: [u8; 3] = [0x1f, 0x8b, 0x08]; +const BZ_MAGIC: [u8; 3] = [0x42, 0x5a, 0x68]; +const XZ_MAGIC: [u8; 6] = [0xfd, 0x37, 0x7a, 0x58, 0x5A, 0x00]; +const BUFF_SIZE: usize = 512 * 1024; + +fn magic_num + Copy>(file_name: P) -> Result<[u8; MAGIC_MAX_LEN], Box> { + let mut buffer: [u8; MAGIC_MAX_LEN] = [0; MAGIC_MAX_LEN]; + let mut fp = File::open(file_name)?; + let _ = fp.read(&mut buffer)?; + Ok(buffer) +} + +fn is_gzipped + Copy>(file_name: P) -> Result> { + let buffer = magic_num(file_name)?; + let gz_or_not = + buffer[0] == GZ_MAGIC[0] && buffer[1] == GZ_MAGIC[1] && buffer[2] == GZ_MAGIC[2]; + Ok(gz_or_not + || file_name + .as_ref() + .extension() + .is_some_and(|ext| ext == "gz")) +} + +fn is_bzipped + Copy>(file_name: P) -> Result> { + let buffer = magic_num(file_name)?; + let bz_or_not = + buffer[0] == BZ_MAGIC[0] && buffer[1] == BZ_MAGIC[1] && buffer[2] == BZ_MAGIC[2]; + Ok(bz_or_not + || file_name + .as_ref() + .extension() + .is_some_and(|ext| ext == "bz2")) +} + +fn is_xz + Copy>(file_name: P) -> Result> { + let buffer = magic_num(file_name)?; + let xz_or_not = buffer[0] == XZ_MAGIC[0] + && buffer[1] == XZ_MAGIC[1] + && buffer[2] == XZ_MAGIC[2] + && buffer[3] == XZ_MAGIC[3] + && buffer[4] == XZ_MAGIC[4] + && buffer[5] == XZ_MAGIC[5]; + Ok(xz_or_not + || file_name + .as_ref() + .extension() + .is_some_and(|ext| ext == "xz")) +} + +pub fn file_reader

(file_in: Option

) -> Result, Box> +where + P: AsRef + Copy, +{ + if let Some(file_name) = file_in { + let gz_flag = is_gzipped(file_name)?; + let bz_flag = is_bzipped(file_name)?; + let zx_flag = is_xz(file_name)?; + + let fp = File::open(file_name)?; + + if gz_flag { + Ok(Box::new(BufReader::with_capacity( + BUFF_SIZE, + flate2::read::MultiGzDecoder::new(fp), + ))) + } else if bz_flag { + Ok(Box::new(BufReader::with_capacity( + BUFF_SIZE, + bzip2::read::MultiBzDecoder::new(fp), + ))) + } else if zx_flag { + Ok(Box::new(BufReader::with_capacity( + BUFF_SIZE, + xz2::read::XzDecoder::new_multi_decoder(fp), + ))) + } else { + Ok(Box::new(BufReader::with_capacity(BUFF_SIZE, fp))) + } + } else { + if atty::is(atty::Stream::Stdin) { + eprintln!("Error: stdin not detected"); + std::process::exit(1); + } + let fp = BufReader::new(io::stdin()); + Ok(Box::new(fp)) + } +} \ No newline at end of file From 92e1cb81b0a7d1a9a37bf38268ed585d73677ebf Mon Sep 17 00:00:00 2001 From: sharkLoc Date: Tue, 21 May 2024 09:33:18 +0800 Subject: [PATCH 2/2] run cargo fmt and clippy --- src/main.rs | 21 +++++++++++---------- src/utils.rs | 2 +- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/main.rs b/src/main.rs index cee9c7c..581033f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,8 +3,8 @@ use bio::io::fastq; use clap::Parser; use minimap2::*; use rayon::prelude::*; -use std::io::Read; use std::error::Error; +use std::io::Read; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -66,7 +66,6 @@ struct Cli { mingc: f64, } - fn is_file(pathname: &str) -> Result<(), String> { let path = PathBuf::from(pathname); if path.is_file() { @@ -76,8 +75,7 @@ fn is_file(pathname: &str) -> Result<(), String> { } } - -fn main() -> Result<(), Box>{ +fn main() -> Result<(), Box> { let args = Cli::parse(); rayon::ThreadPoolBuilder::new() .num_threads(args.threads) @@ -114,14 +112,14 @@ where if !record.is_empty() { let read_len = record.seq().len(); // If a read is shorter than what is to be cropped the read is dropped entirely (filtered out) - + // Check if gc content filter exist, if no gc content filter is set pass the 0.5 to pass all the follwoing filter let read_gc = if args.mingc != 0.0 || args.maxgc != 1.0 { cal_gc(record.seq()) } else { 0.5 }; - + if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -173,7 +171,7 @@ where } else { 0.5 }; - + if args.headcrop + args.tailcrop < read_len { let average_quality = ave_qual( &record.qual().iter().map(|i| i - 33).collect::>(), @@ -260,7 +258,10 @@ fn is_contamination(readseq: &&[u8], contam: &Aligner) -> bool { } fn cal_gc(readseq: &[u8]) -> f64 { - let gc_count = readseq.iter().filter(|&&base| base == b'G' || base == b'g' || base == b'C' || base == b'c').count(); + let gc_count = readseq + .iter() + .filter(|&&base| base == b'G' || base == b'g' || base == b'C' || base == b'c') + .count(); (gc_count as f64) / (readseq.len() as f64) } @@ -302,7 +303,7 @@ fn test_filter() { input: None, mingc: 0.0, maxgc: 1.0, - }, + }, ); } @@ -344,7 +345,7 @@ fn test_filter_with_contam() { threads: 1, contam: Some("test-data/random_contam.fa".to_owned()), inverse: false, - input: None, + input: None, mingc: 0.0, maxgc: 1.0, }, diff --git a/src/utils.rs b/src/utils.rs index 5a90284..dcb1b9f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -92,4 +92,4 @@ where let fp = BufReader::new(io::stdin()); Ok(Box::new(fp)) } -} \ No newline at end of file +}