Skip to content

Commit

Permalink
seq kmer
Browse files Browse the repository at this point in the history
  • Loading branch information
dagou committed Jun 16, 2024
1 parent d073906 commit 16f465b
Show file tree
Hide file tree
Showing 19 changed files with 399 additions and 1,088 deletions.
14 changes: 1 addition & 13 deletions kr2r/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "kr2r"
version = "0.5.0"
version = "0.5.1"
edition = "2021"
authors = ["eric9n@gmail.com"]

Expand All @@ -11,16 +11,12 @@ name = "kun_peng"
path = "src/bin/kun.rs"

[features]
default = ["dna"]
dna = []
protein = []
double_hashing = []
exact_counting = []

[dependencies]
seqkmer = { version = "0.1.0", path = "../seqkmer" }
clap = { version = "4.4.10", features = ["derive"] }
seq_io = "0.3.2"
hyperloglogplus = { version = "*", features = ["const-loop"] }
seahash = "4.1.0"
serde = { version = "1.0", features = ["derive"] }
Expand All @@ -38,11 +34,3 @@ criterion = "0.5.1"
twox-hash = "1.6.3"
farmhash = {version = "1.1.5"}

[[bench]]
name = "mmscanner_benchmark"
harness = false


[[bench]]
name = "hash_benchmark"
harness = false
32 changes: 0 additions & 32 deletions kr2r/benches/hash_benchmark.rs

This file was deleted.

20 changes: 0 additions & 20 deletions kr2r/benches/mmscanner_benchmark.rs

This file was deleted.

10 changes: 6 additions & 4 deletions kr2r/src/args.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
// 使用时需要引用模块路径
use crate::utils::expand_spaced_seed_mask;
use crate::{construct_seed_template, parse_binary, Meros, BITS_PER_CHAR};
use crate::{
DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES, DEFAULT_TOGGLE_MASK,
};
use crate::{construct_seed_template, parse_binary};
use clap::Parser;
use seqkmer::Meros;
use seqkmer::{
BITS_PER_CHAR, DEFAULT_KMER_LENGTH, DEFAULT_MINIMIZER_LENGTH, DEFAULT_MINIMIZER_SPACES,
DEFAULT_TOGGLE_MASK,
};
use std::path::PathBuf;

pub const U32MAXPLUS: u64 = u32::MAX as u64;
Expand Down
73 changes: 27 additions & 46 deletions kr2r/src/bin/classify.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use clap::Parser;
use kr2r::classify::{add_hitlist_string, count_values, resolve_tree, trim_pair_info};
use kr2r::classify::{adjust_hitlist_string, count_rows, resolve_tree, trim_pair_info};
use kr2r::compact_hash::{CHTable, Compact, HashConfig, Row};
use kr2r::readcounts::{TaxonCounters, TaxonCountersDash};
use kr2r::report::report_kraken_style;
Expand All @@ -8,7 +8,6 @@ use kr2r::utils::{
create_sample_file, detect_file_format, find_and_sort_files, get_lastest_file_index, FileFormat,
};
use kr2r::IndexOptions;
use seqkmer::seq::BaseType;
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufWriter, Write};
Expand All @@ -17,10 +16,10 @@ use std::path::PathBuf;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Instant;

use seqkmer::parallel::read_parallel;
use seqkmer::reader::SeqMer;
use seqkmer::Meros;
use seqkmer::{reader::Reader, FastaReader, FastqPairReader, FastqReader};
use seqkmer::{
read_parallel, BaseType, FastaReader, FastqPairReader, FastqReader, HitGroup, Marker, Meros,
Reader, SeqMer,
};

#[derive(Parser, Debug, Clone)]
#[clap(
Expand Down Expand Up @@ -92,29 +91,24 @@ pub struct Args {
pub input_files: Vec<String>,
}

fn process_seq(
minimizer: &Vec<u64>,
hash_config: &HashConfig,
chtable: &CHTable,
offset: u32,
) -> Vec<Row> {
fn process_seq(marker: &Marker, hash_config: &HashConfig, chtable: &CHTable) -> HitGroup<Row> {
let chunk_size = hash_config.hash_capacity;
let value_bits = hash_config.value_bits;

let mut rows = Vec::new();
for (sort, &hash_key) in minimizer.iter().enumerate() {
for (sort, &hash_key) in marker.minimizer.iter().enumerate() {
let idx = hash_config.index(hash_key);
let partition_index = idx / chunk_size;
let index = idx % chunk_size;
let taxid = chtable.get_from_page(index, hash_key, partition_index + 1);
if taxid > 0 {
let compacted_key = hash_key.left(value_bits) as u32;
let high = u32::combined(compacted_key, taxid, value_bits);
let row = Row::new(high, 0, sort as u32 + 1 + offset);
let row = Row::new(high, 0, sort as u32 + 1);
rows.push(row);
}
}
rows
HitGroup::new(marker.size(), rows, 0)
}

fn process_record(
Expand All @@ -129,30 +123,14 @@ fn process_record(
) -> String {
let value_mask = hash_config.value_mask;

let seq_len_str = seq.fmt_size();
let (kmer_count1, kmer_count2, rows) = match &seq.marker {
BaseType::Single(marker) => (
marker.size(),
0,
process_seq(&marker.minimizer, &hash_config, chtable, 0),
),
BaseType::Pair((marker1, marker2)) => {
let mut rows = process_seq(&marker1.minimizer, &hash_config, chtable, 0);
let seq_len1 = marker1.size();
let rows2 = process_seq(&marker2.minimizer, &hash_config, chtable, seq_len1 as u32);
rows.extend_from_slice(&rows2);
(seq_len1, marker2.size(), rows)
}
};
let total_kmers = kmer_count1 + kmer_count2;
let (counts, cur_counts, hit_groups) = count_values(&rows, value_mask, kmer_count1 as u32);
let hit_string = add_hitlist_string(
&rows,
value_mask,
kmer_count1 as u32,
Some(kmer_count2 as u32),
taxonomy,
);
let seq_len_str = seq.fmt_cap();
let hits: BaseType<HitGroup<Row>> = seq
.marker
.apply(|marker| process_seq(&marker, &hash_config, chtable));

let total_kmers = seq.total_size();
let (counts, cur_counts, hit_groups) = count_rows(&hits, value_mask);
let hit_string = adjust_hitlist_string(&hits, value_mask, taxonomy);
let mut call = resolve_tree(&counts, taxonomy, total_kmers, args.confidence_threshold);
if call > 0 && hit_groups < args.minimum_hit_groups {
call = 0;
Expand Down Expand Up @@ -186,16 +164,19 @@ fn process_record(
output_line
}

fn process_fastx_file(
fn process_fastx_file<R>(
args: &Args,
meros: Meros,
hash_config: HashConfig,
file_index: usize,
reader: &mut Box<dyn Reader>,
reader: &mut R,
chtable: &CHTable,
taxonomy: &Taxonomy,
total_taxon_counts: &mut TaxonCounters,
) -> io::Result<(usize, usize)> {
) -> io::Result<(usize, usize)>
where
R: Reader,
{
let mut writer: Box<dyn Write + Send> = match &args.kraken_output_dir {
Some(ref file_path) => {
let filename = file_path.join(format!("output_{}.txt", file_index));
Expand All @@ -212,8 +193,8 @@ fn process_fastx_file(

let _ = read_parallel(
reader,
13,
15,
args.num_threads as usize - 2,
args.num_threads as usize,
meros,
|seqs| {
let mut buffer = String::new();
Expand All @@ -238,7 +219,7 @@ fn process_fastx_file(
Some(buffer)
},
|dataset| {
while let Ok(Some(res)) = dataset.next() {
while let Some(Some(res)) = dataset.next() {
writer
.write_all(res.as_bytes())
.expect("Failed to write date to file");
Expand Down Expand Up @@ -392,7 +373,7 @@ pub fn run(args: Args) -> Result<()> {
}
println!("start...");
let start = Instant::now();
let meros = idx_opts.as_smeros();
let meros = idx_opts.as_meros();
let hash_files = find_and_sort_files(&args.k2d_dir, "hash", ".k2d")?;
let chtable = CHTable::from_hash_files(hash_config, hash_files)?;

Expand Down
39 changes: 22 additions & 17 deletions kr2r/src/bin/estimate_capacity.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use clap::{error::ErrorKind, Error, Parser};
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use kr2r::args::KLMTArgs;
use kr2r::mmscanner::MinimizerScanner;
use kr2r::utils::{find_library_fna_files, format_bytes, open_file};
use kr2r::KBuildHasher;
use seq_io::fasta::{Reader, Record};
use seq_io::parallel::read_parallel;

use seqkmer::{read_parallel, FastaReader};
use serde_json;
use std::collections::HashSet;
use std::fs::File;
Expand Down Expand Up @@ -84,33 +83,39 @@ fn process_sequence(
let mut hllp: HyperLogLogPlus<u64, _> =
HyperLogLogPlus::new(16, KBuildHasher::default()).unwrap();

let reader = Reader::from_path(fna_file).unwrap();
let mut reader = FastaReader::from_path(fna_file, 1).unwrap();
let range_n = args.n as u64;
read_parallel(
reader,
args.threads as u32,
args.threads - 2 as usize,
&mut reader,
args.threads,
args.threads - 2,
meros,
|record_set| {
let mut minimizer_set = HashSet::new();
for record in record_set.into_iter() {
let seq = record.seq();
let kmer_iter = MinimizerScanner::new(&seq, meros)
.into_iter()
.filter(|hash_key| hash_key & RANGE_MASK < range_n)
.collect::<HashSet<u64>>();

minimizer_set.extend(kmer_iter);
for record in record_set.into_iter() {
record
.marker
.fold(&mut minimizer_set, |minimizer_set, marker| {
let kmer_iter = marker
.minimizer
.iter()
.filter(|&hash_key| hash_key & RANGE_MASK < range_n);

minimizer_set.extend(kmer_iter);
});
}
minimizer_set
Some(minimizer_set)
},
|record_sets| {
while let Some(Ok((_, m_set))) = record_sets.next() {
while let Some(Some(m_set)) = record_sets.next() {
for minimizer in m_set {
hllp.insert(&minimizer);
}
}
},
);
)
.expect("read parallel error");

// 序列化 hllp 对象并将其写入文件
let serialized_hllp = serde_json::to_string(&hllp).unwrap();
Expand Down
Loading

0 comments on commit 16f465b

Please sign in to comment.