Skip to content

Commit

Permalink
pipelined extraction
Browse files Browse the repository at this point in the history
- initial sketch of lexicographic trie for pipelining
- move path splitting into a submodule
- lex trie can now propagate entry data
- outline handle allocation
- mostly handle files
- mostly handle dirs
- clarify symlink FIXMEs
- do symlink validation
- extract writable dir setting to helper method
- modify args to handle allocation method
- handle allocation test passes
- simplify perms a lot
- outline evaluation
- handle symlinks
- BIGGER CHANGE! add EntryReader/etc
- make initial pipelined extract work
- fix file perms by writing them after finishing the file write
- support directory entries by unix mode as well
- impl split extraction
- remove dependency on reader refactoring
  • Loading branch information
cosmicexplorer committed Jul 16, 2024
1 parent 8656826 commit 7fbe408
Show file tree
Hide file tree
Showing 7 changed files with 2,496 additions and 0 deletions.
9 changes: 9 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ lzma-rs = { version = "0.3.0", default-features = false, optional = true }
[target.'cfg(any(all(target_arch = "arm", target_pointer_width = "32"), target_arch = "mips", target_arch = "powerpc"))'.dependencies]
crossbeam-utils = "0.8.20"

[target.'cfg(unix)'.dependencies]
libc = { version = "0.2.155", optional = true }

[target.'cfg(fuzzing)'.dependencies]
arbitrary = { version = "1.3.2", features = ["derive"] }

Expand All @@ -63,6 +66,7 @@ time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1"
clap = { version = "=4.4.18", features = ["derive"] }
tempdir = "0.3.7"
tempfile = "3.10.1"

[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
Expand All @@ -79,6 +83,7 @@ deflate-zopfli = ["zopfli", "_deflate-any"]
lzma = ["lzma-rs/stream"]
unreserved = []
xz = ["lzma-rs/raw_decoder"]
parallelism = ["libc"]
default = [
"aes-crypto",
"bzip2",
Expand All @@ -101,3 +106,7 @@ harness = false
[[bench]]
name = "merge_archive"
harness = false

[[bench]]
name = "extract"
harness = false
86 changes: 86 additions & 0 deletions benches/extract.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use bencher::{benchmark_group, benchmark_main};

use bencher::Bencher;
use tempdir::TempDir;

use std::fs;
use std::path::Path;

use zip::result::ZipResult;
use zip::ZipArchive;

#[cfg(all(feature = "parallelism", unix))]
use zip::read::{split_extract, ExtractionParameters};

/* This archive has a set of entries repeated 20x:
* - 200K of random data, stored uncompressed (CompressionMethod::Stored)
* - 246K of text data (the Project Gutenberg HTML version of King Lear)
* (CompressionMethod::Bzip2, compression level 1) (Project Gutenberg ebooks are public domain)
*
* The full archive file is 5.3MB.
*/
/// Opens the benchmark fixture archive and parses it as a [`ZipArchive`].
///
/// The fixture is located at `tests/data/stored-and-compressed-text.zip`, resolved relative to
/// the crate's `CARGO_MANIFEST_DIR` at compile time.
///
/// # Errors
///
/// Propagates the failure from opening the file, or whatever `ZipArchive::new` returns.
fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
    let archive_path = manifest_dir.join("tests/data/stored-and-compressed-text.zip");
    let archive_file = fs::File::open(archive_path)?;
    ZipArchive::new(archive_file)
}

/// Benchmarks extraction of the test archive through `ZipArchive::extract`.
///
/// The reported byte count is the archive's total decompressed size. Every iteration extracts
/// into a freshly created subdirectory of one shared parent temp dir.
///
/// # Panics
///
/// Panics if the fixture cannot be opened, its decompressed size is unavailable or does not fit
/// in a `u64`, a temp dir cannot be created, or extraction fails.
fn extract_basic(bench: &mut Bencher) {
    let mut archive = get_test_archive().unwrap();
    let decompressed = archive.decompressed_size().unwrap();
    let total_size: u64 = decompressed.try_into().unwrap();

    // One parent directory owns all per-iteration output directories.
    let scratch_root = TempDir::new("zip-extract").unwrap();

    bench.bytes = total_size;
    bench.bench_n(1, |inner| {
        inner.iter(move || {
            let subdir = TempDir::new_in(scratch_root.path(), "bench-subdir").unwrap();
            let outdir = subdir.into_path();
            archive.extract(outdir).unwrap();
        });
    });
}

/// Value assigned to `ExtractionParameters::decompression_threads` by the `extract_split`
/// benchmark. Only compiled when the `parallelism` feature is enabled on a unix target.
#[cfg(all(feature = "parallelism", unix))]
const DECOMPRESSION_THREADS: usize = 8;

/// Benchmarks extraction of the test archive through `split_extract`, configured with
/// `DECOMPRESSION_THREADS` decompression threads and defaults for every other parameter.
///
/// The reported byte count is the archive's total decompressed size. Every iteration extracts
/// into a freshly created subdirectory of one shared parent temp dir.
///
/// # Panics
///
/// Panics if the fixture cannot be opened, its decompressed size is unavailable or does not fit
/// in a `u64`, a temp dir cannot be created, or extraction fails.
#[cfg(all(feature = "parallelism", unix))]
fn extract_split(bench: &mut Bencher) {
    let archive = get_test_archive().unwrap();
    let decompressed = archive.decompressed_size().unwrap();
    let total_size: u64 = decompressed.try_into().unwrap();

    let params = ExtractionParameters {
        decompression_threads: DECOMPRESSION_THREADS,
        ..Default::default()
    };

    // One parent directory owns all per-iteration output directories.
    let scratch_root = TempDir::new("zip-extract").unwrap();

    bench.bytes = total_size;
    bench.bench_n(1, |inner| {
        inner.iter(move || {
            let subdir = TempDir::new_in(scratch_root.path(), "bench-subdir").unwrap();
            let outdir = subdir.into_path();
            // The parameters are cloned so each iteration hands over its own copy.
            split_extract(&archive, &outdir, params.clone()).unwrap();
        });
    });
}

// Without the `parallelism` feature on a unix target, `extract_split` is not compiled, so the
// benchmark group contains only the basic extraction benchmark.
#[cfg(not(all(feature = "parallelism", unix)))]
benchmark_group!(benches, extract_basic);

// With `parallelism` enabled on unix, the group contains both benchmarks.
#[cfg(all(feature = "parallelism", unix))]
benchmark_group!(benches, extract_basic, extract_split);

// Passes the `benches` group defined above to `benchmark_main!`.
benchmark_main!(benches);
10 changes: 10 additions & 0 deletions src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ pub(crate) mod lzma;
#[cfg(feature = "xz")]
pub(crate) mod xz;

#[cfg(feature = "parallelism")]
pub(crate) mod pipelining;
#[cfg(all(unix, feature = "parallelism"))]
pub use pipelining::split_extraction::{split_extract, ExtractionParameters, SplitExtractionError};
#[cfg(feature = "parallelism")]
pub(crate) mod split;

// Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
pub(crate) mod zip_archive {
use indexmap::IndexMap;
Expand Down Expand Up @@ -1076,6 +1083,9 @@ impl<R: Read + Seek> ZipArchive<R> {

fn make_writable_dir_all<T: AsRef<Path>>(outpath: T) -> Result<(), ZipError> {
create_dir_all(outpath.as_ref())?;
/* TODO: do we want to automatically make the directory writable? Wouldn't we prefer to
* respect the write permissions of the extraction dir? Pipelined extraction does not
* mutate permissions like this. */
#[cfg(unix)]
{
// Dirs must be writable until all normal files are extracted
Expand Down
Loading

0 comments on commit 7fbe408

Please sign in to comment.