Skip to content

Commit

Permalink
encoding now actually streams the data instead of reading the source …
Browse files Browse the repository at this point in the history
…until the end before processing
  • Loading branch information
KillingSpark committed Nov 28, 2024
1 parent 9011b9d commit 5130047
Show file tree
Hide file tree
Showing 8 changed files with 297 additions and 147 deletions.
18 changes: 17 additions & 1 deletion src/bin/zstd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use ruzstd::encoding::CompressionLevel;
use ruzstd::encoding::FrameCompressor;
use ruzstd::frame::ReadFrameHeaderError;
use ruzstd::frame_decoder::FrameDecoderError;
use ruzstd::FrameDecoder;

struct StateTracker {
bytes_used: u64,
Expand Down Expand Up @@ -144,8 +145,23 @@ fn main() {
"Compressed {path:} from {} to {} ({}%)",
input_len,
output.len(),
output.len() * 100 / input_len
if input_len == 0 {
0
} else {
output.len() * 100 / input_len
}
);
let mut dec = FrameDecoder::new();
let mut decomp = Vec::new();
decomp.reserve(input_len + 10000);
dec.decode_all_to_vec(&output, &mut decomp).unwrap();

let mut original = Vec::new();
std::fs::File::open(&path)
.unwrap()
.read_to_end(&mut original)
.unwrap();
assert_eq!(original, decomp);
}
} else {
decompress(&flags, &file_paths);
Expand Down
20 changes: 8 additions & 12 deletions src/encoding/blocks/compressed.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,17 @@
use std::eprintln;

use alloc::vec::Vec;

use crate::{
encoding::{
bit_writer::BitWriter,
match_generator::{MatchGenerator, Sequence},
},
encoding::{bit_writer::BitWriter, match_generator::Sequence, Matcher},
fse::fse_encoder::{default_ll_table, default_ml_table, default_of_table, FSETable, State},
huff0::huff0_encoder,
};

pub fn compress_block<'a>(matcher: &mut MatchGenerator<'a>, data: &'a [u8], output: &mut Vec<u8>) {
matcher.add_data(data);
pub fn compress_block<M: Matcher>(matcher: &mut M, len: usize, output: &mut Vec<u8>) {
let mut literals_vec = Vec::new();
let mut sequences = Vec::new();
loop {
let Some(seq) = matcher.next_sequence() else {
break;
};

matcher.start_matching(len, |seq| {
match seq {
Sequence::Literals { literals } => literals_vec.extend_from_slice(literals),
Sequence::Triple {
Expand All @@ -33,7 +27,9 @@ pub fn compress_block<'a>(matcher: &mut MatchGenerator<'a>, data: &'a [u8], outp
});
}
}
}
});

eprintln!("Seqs: {}", sequences.len());

// literals section

Expand Down
2 changes: 0 additions & 2 deletions src/encoding/blocks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,5 @@
//! There are a few different kinds of blocks, and implementations for those kinds are
//! in this module.
mod compressed;
mod raw;

pub(super) use compressed::*;
pub(super) use raw::*;
18 changes: 0 additions & 18 deletions src/encoding/blocks/raw.rs

This file was deleted.

122 changes: 57 additions & 65 deletions src/encoding/frame_encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ use alloc::vec::Vec;
use core::convert::TryInto;

use super::{
block_header::BlockHeader,
blocks::{compress_block, compress_raw_block},
frame_header::FrameHeader,
match_generator::MatchGenerator,
block_header::BlockHeader, blocks::compress_block, frame_header::FrameHeader,
match_generator::MatchGeneratorDriver, Matcher,
};

use crate::io::{Read, Write};
Expand Down Expand Up @@ -91,84 +89,77 @@ impl<R: Read, W: Write> FrameCompressor<R, W> {
};
header.serialize(output);

// TODO dont read input completely into memory here, work on a window of input
let mut uncompressed_data = Vec::new();
self.uncompressed_data
.read_to_end(&mut uncompressed_data)
.unwrap();
let uncompressed_data = uncompressed_data;
let mut matcher = MatchGenerator::new(1024 * 128);

// Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know)
if uncompressed_data.is_empty() {
let header = BlockHeader {
last_block: true,
block_type: crate::blocks::block::BlockType::Raw,
block_size: 0,
};
// Write the header, then the block
header.serialize(output);
}
let mut matcher = MatchGeneratorDriver::new(1024 * 128, 1024 * 128);
loop {
let uncompressed_data = matcher.get_next_space();
let mut read_bytes = 0;
let last_block;
'read_loop: loop {
let new_bytes = self
.uncompressed_data
.read(&mut uncompressed_data[read_bytes..])
.unwrap();
if new_bytes == 0 {
last_block = true;
break 'read_loop;
}
read_bytes += new_bytes;
if read_bytes == uncompressed_data.len() {
last_block = false;
break 'read_loop;
}
}
let uncompressed_data = &uncompressed_data[..read_bytes];

// Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know)
if uncompressed_data.is_empty() {
let header = BlockHeader {
last_block: true,
block_type: crate::blocks::block::BlockType::Raw,
block_size: 0,
};
// Write the header, then the block
header.serialize(output);
self.compressed_data.write_all(output).unwrap();
output.clear();
break;
}

match self.compression_level {
CompressionLevel::Uncompressed => {
// Blocks are compressed by writing a header, then writing
// the block in repetition until the last block is reached.
let mut index = 0;
while index < uncompressed_data.len() {
let last_block = index + MAX_BLOCK_SIZE >= uncompressed_data.len();
// We read till the end of the data, or till the max block size, whichever comes sooner
let block_size = if last_block {
uncompressed_data.len() - index
} else {
MAX_BLOCK_SIZE
};
match self.compression_level {
CompressionLevel::Uncompressed => {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Raw,
block_size: block_size.try_into().unwrap(),
block_size: read_bytes.try_into().unwrap(),
};
// Write the header, then the block
header.serialize(output);
compress_raw_block(&uncompressed_data[index..(index + block_size)], output);
index += block_size;
output.extend_from_slice(uncompressed_data);
}
}
CompressionLevel::Fastest => {
let mut index = 0;
while index < uncompressed_data.len() {
let last_block = index + MAX_BLOCK_SIZE >= uncompressed_data.len();
// We read till the end of the data, or till the max block size, whichever comes sooner
let block_size = if last_block {
uncompressed_data.len() - index
} else {
MAX_BLOCK_SIZE
};

let uncompressed = &uncompressed_data[index..(index + block_size)];

if uncompressed.iter().all(|x| uncompressed[0].eq(x)) {
matcher.add_data_no_matching(uncompressed);
CompressionLevel::Fastest => {
if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) {
let rle_byte = uncompressed_data[0];
matcher.commit_space(read_bytes);
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::RLE,
block_size: uncompressed.len().try_into().unwrap(),
block_size: read_bytes.try_into().unwrap(),
};
// Write the header, then the block
header.serialize(output);
output.push(uncompressed[0]);
output.push(rle_byte);
} else {
let mut compressed = Vec::new();
compress_block(&mut matcher, uncompressed, &mut compressed);
compress_block(&mut matcher, read_bytes, &mut compressed);
if compressed.len() >= MAX_BLOCK_SIZE {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Raw,
block_size: block_size.try_into().unwrap(),
block_size: read_bytes.try_into().unwrap(),
};
// Write the header, then the block
header.serialize(output);
compress_raw_block(uncompressed, output);
output.extend_from_slice(matcher.get_last_space());
} else {
let header = BlockHeader {
last_block,
Expand All @@ -180,16 +171,17 @@ impl<R: Read, W: Write> FrameCompressor<R, W> {
output.extend(compressed);
}
}
index += block_size;
self.compressed_data.write_all(output).unwrap();
output.clear();
}
_ => {
unimplemented!();
}
}
_ => {
unimplemented!();
self.compressed_data.write_all(output).unwrap();
output.clear();
if last_block {
break;
}
}
self.compressed_data.write_all(output).unwrap();
}
}

Expand Down
Loading

0 comments on commit 5130047

Please sign in to comment.