Skip to content

Commit

Permalink
seq kmer
Browse files Browse the repository at this point in the history
  • Loading branch information
dagou committed Jun 15, 2024
1 parent 98807ba commit d073906
Show file tree
Hide file tree
Showing 10 changed files with 535 additions and 427 deletions.
386 changes: 77 additions & 309 deletions kr2r/src/bin/classify.rs

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions kr2r/src/kr2r_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ use std::io::{Read, Result as IoResult, Write};
use std::mem;
use std::path::Path;

use seqkmer::Meros as SMeros;

/// Parses `src` as a base-2 integer literal and returns it as a `u64`.
///
/// # Errors
///
/// Propagates the `ParseIntError` produced by `u64::from_str_radix`, e.g. when
/// `src` is empty, contains a character other than a binary digit, or encodes
/// a value that does not fit in 64 bits.
pub fn parse_binary(src: &str) -> Result<u64, std::num::ParseIntError> {
    const BINARY_RADIX: u32 = 2;
    u64::from_str_radix(src, BINARY_RADIX)
}
Expand Down Expand Up @@ -180,4 +182,14 @@ impl IndexOptions {
u64_to_option(self.minimum_acceptable_hash_value),
)
}

/// Builds a `seqkmer` `Meros` (imported here as `SMeros`) from these index options.
///
/// `k` and `l` are forwarded unchanged; the spaced-seed mask, toggle mask and
/// minimum acceptable hash value are each passed through `u64_to_option`
/// before being handed to `SMeros::new`.
pub fn as_smeros(&self) -> SMeros {
    let spaced_seed_mask = u64_to_option(self.spaced_seed_mask);
    let toggle_mask = u64_to_option(self.toggle_mask);
    let min_hash_value = u64_to_option(self.minimum_acceptable_hash_value);

    SMeros::new(self.k, self.l, spaced_seed_mask, toggle_mask, min_hash_value)
}
}
1 change: 0 additions & 1 deletion seqkmer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ crossbeam-channel = "0.5"
scoped_threadpool = "0.1.9"
flate2 = "1.0"


[features]
default = ["dna"]
dna = []
Expand Down
106 changes: 106 additions & 0 deletions seqkmer/src/fasta.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
use crate::reader::{dyn_reader, trim_end, Reader, BUFSIZE};
use crate::seq::{BaseType, SeqFormat, Sequence};
use std::io::{BufRead, BufReader, Read, Result};
use std::path::Path;

/// Buffered reader over a FASTA byte stream.
///
/// The `header` and `seq` buffers are owned by the reader and are cleared and
/// refilled by `read_next` for every record.
pub struct FastaReader<R: Read + Send> {
    /// Buffered wrapper around the underlying byte source.
    reader: BufReader<R>,
    /// Caller-supplied index of the input file; copied into every emitted record.
    file_index: usize,
    /// Count of records emitted so far (starts at 0).
    reads_index: usize,
    /// Raw bytes of the current record's header line.
    header: Vec<u8>,
    /// Raw bytes of the current record's sequence.
    seq: Vec<u8>,
}

impl<R: Read + Send> FastaReader<R> {
    /// Creates a reader over `reader` with the default buffer capacity `BUFSIZE`.
    ///
    /// `file_index` is stored and later attached to every record produced.
    pub fn new(reader: R, file_index: usize) -> Self {
        Self::with_capacity(reader, file_index, BUFSIZE)
    }

    /// Creates a reader over `reader` whose internal buffer holds `capacity` bytes.
    ///
    /// # Panics
    ///
    /// Panics if `capacity` is smaller than 3.
    pub fn with_capacity(reader: R, file_index: usize, capacity: usize) -> Self {
        assert!(capacity >= 3);
        let reader = BufReader::with_capacity(capacity, reader);
        Self {
            reader,
            file_index,
            reads_index: 0,
            header: Vec::new(),
            seq: Vec::new(),
        }
    }

    /// Loads the next record into the internal `header` and `seq` buffers.
    ///
    /// Returns `Ok(Some(()))` when a record was read and `Ok(None)` when either
    /// the header read or the sequence read yields zero bytes (end of input).
    ///
    /// # Errors
    ///
    /// Propagates any I/O error reported by the underlying reader.
    pub fn read_next(&mut self) -> Result<Option<()>> {
        // Read the FASTA header line, up to and including the newline.
        self.header.clear();
        let header_len = self.reader.read_until(b'\n', &mut self.header)?;
        if header_len == 0 {
            return Ok(None);
        }

        // Read the sequence part: everything up to and including the next '>'.
        // That '>' is consumed here, so the following header starts without it.
        self.seq.clear();
        match self.reader.read_until(b'>', &mut self.seq)? {
            0 => Ok(None),
            _ => {
                // NOTE(review): `trim_end` lives in `crate::reader`; presumably it
                // strips the trailing delimiter/newline bytes — confirm there.
                trim_end(&mut self.seq);
                Ok(Some(()))
            }
        }
    }
}

impl FastaReader<Box<dyn Read + Send>> {
    /// Opens `path` via `dyn_reader` and wraps the resulting stream in a
    /// `FastaReader` tagged with `file_index`.
    ///
    /// # Errors
    ///
    /// Returns whatever error `dyn_reader` reports for `path`.
    #[inline]
    pub fn from_path<P: AsRef<Path>>(path: P, file_index: usize) -> Result<Self> {
        let stream = dyn_reader(path)?;
        let fasta = Self::new(stream, file_index);
        Ok(fasta)
    }
}

/// Returns `true` when `seq` is longer than 2^32 bytes, i.e. when the sequence
/// is too long to be handled; returns `false` for any sequence of at most
/// 2^32 bytes (including an empty one).
///
/// Takes a slice rather than `&Vec<u8>` so any contiguous byte buffer can be
/// checked; existing `&Vec<u8>` call sites keep working through deref coercion.
fn check_sequence_length(seq: &[u8]) -> bool {
    /// Largest supported sequence length in bytes (2^32).
    const MAX_SEQ_LEN: u64 = 1 << 32;

    // Compare in u64 so the 2^32 limit is representable on 32-bit targets too.
    (seq.len() as u64) > MAX_SEQ_LEN
}

/// Extracts the record identifier from a raw FASTA header line.
///
/// A leading `>` is dropped if present (it is absent for every record after
/// the first, because `read_next` consumes it while reading the previous
/// sequence). The identifier is the run of bytes up to the first ASCII
/// whitespace, so the line terminator (`\n` / `\r\n`) never ends up in the ID
/// even when the header carries no description. Bytes that are not valid UTF-8
/// are replaced with U+FFFD instead of being reinterpreted unchecked.
fn fasta_record_id(header: &[u8]) -> String {
    let body = match header.first() {
        Some(b'>') => &header[1..],
        _ => header,
    };
    let id_len = body
        .iter()
        .position(u8::is_ascii_whitespace)
        .unwrap_or(body.len());
    String::from_utf8_lossy(&body[..id_len]).into_owned()
}

impl<R: Read + Send> Reader for FastaReader<R> {
    /// Reads the next FASTA record and returns it as a one-element vector.
    ///
    /// Returns `Ok(None)` at end of input. A record whose sequence is longer
    /// than 2^32 bytes is reported on stderr and also yields `Ok(None)`.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from the underlying reader.
    fn next(&mut self) -> Result<Option<Vec<Sequence>>> {
        if self.read_next()?.is_none() {
            return Ok(None);
        }

        // NOTE(review): callers presumably treat `None` as end of stream, so an
        // oversized record stops iteration rather than being skipped — confirm
        // this is intended before changing it.
        if check_sequence_length(&self.seq) {
            eprintln!("Sequence length exceeds 2^32, which is not handled.");
            return Ok(None);
        }

        let id = fasta_record_id(&self.header);
        // Records are numbered from 1: the counter is bumped before use.
        self.reads_index += 1;

        let sequence = Sequence {
            file_index: self.file_index,
            reads_index: self.reads_index,
            id,
            // Hand the buffer over instead of copying it; `read_next` clears
            // and refills `self.seq` for the next record anyway.
            seq: BaseType::Single(std::mem::take(&mut self.seq)),
            format: SeqFormat::Fasta,
        };
        Ok(Some(vec![sequence]))
    }
}
Loading

0 comments on commit d073906

Please sign in to comment.