Skip to content

Commit

Permalink
perf: Faster cde rejection (#255)
Browse files Browse the repository at this point in the history
* Use the tempfile crate instead of the tempdir crate (which is deprecated)

https://github.com/rust-lang-deprecated/tempdir?tab=readme-ov-file#deprecation-note

* perf: Add benchmark that measures the rejection speed of a large non-zip file

* perf: Speed up non-zip rejection by increasing END_WINDOW_SIZE

I tested several END_WINDOW_SIZEs across 2 machines:

Machine 1: macOS 15.0.1, aarch64 (apfs /tmp)
512:   test parse_large_non_zip  ... bench:  30,450,608 ns/iter (+/- 673,910)
4096:  test parse_large_non_zip  ... bench:   7,741,366 ns/iter (+/- 521,101)
8192:  test parse_large_non_zip  ... bench:   5,807,443 ns/iter (+/- 546,227)
16384: test parse_large_non_zip  ... bench:   4,794,314 ns/iter (+/- 419,114)
32768: test parse_large_non_zip  ... bench:   4,262,897 ns/iter (+/- 397,582)
65536: test parse_large_non_zip  ... bench:   4,060,847 ns/iter (+/- 280,964)

Machine 2: Debian testing, x86_64 (tmpfs /tmp)
512:   test parse_large_non_zip  ... bench:  65,132,581 ns/iter (+/- 7,429,976)
4096:  test parse_large_non_zip  ... bench:  14,109,503 ns/iter (+/- 2,892,086)
8192:  test parse_large_non_zip  ... bench:   9,942,500 ns/iter (+/- 1,886,063)
16384: test parse_large_non_zip  ... bench:   8,205,851 ns/iter (+/- 2,902,041)
32768: test parse_large_non_zip  ... bench:   7,012,011 ns/iter (+/- 2,222,879)
65536: test parse_large_non_zip  ... bench:   6,577,275 ns/iter (+/- 881,546)

In both cases END_WINDOW_SIZE=8192 performed about 6x better than 512 and >8192
didn't make much of a difference on top of that.

* perf: Speed up non-zip rejection by limiting search for EOCDR.

I benchmarked several search sizes across 2 machines
(these benches are using an 8192 END_WINDOW_SIZE):

Machine 1: macOS 15.0.1, aarch64 (apfs /tmp)
whole file:   test parse_large_non_zip              ... bench:   5,773,801 ns/iter (+/- 411,277)
last 128k:    test parse_large_non_zip              ... bench:      54,402 ns/iter (+/- 4,126)
last 66,000:  test parse_large_non_zip              ... bench:      36,152 ns/iter (+/- 4,293)

Machine 2: Debian testing, x86_64 (tmpfs /tmp)
whole file:   test parse_large_non_zip              ... bench:   9,942,306 ns/iter (+/- 1,963,522)
last 128k:    test parse_large_non_zip              ... bench:      73,604 ns/iter (+/- 16,662)
last 66,000:  test parse_large_non_zip              ... bench:      41,349 ns/iter (+/- 16,812)

As you might expect these significantly increase the rejection speed for
large non-zip files.

66,000 was the number previously used by zip-rs. It was changed to zero in
7a55945.

128K is what Info-Zip uses[1]. This seems like a reasonable (non-zero)
choice for compatibility reasons.

[1] Info-zip is extremely old and doesn't not have an official git repo to
    link to. However, an unofficial fork can be found here:
    https://github.com/hiirotsuki/infozip/blob/bb0c4755d44f21bda0744a5e1868d25055a543cc/zipfile.c#L4073

---------

Co-authored-by: Chris Hennick <4961925+Pr0methean@users.noreply.github.com>
  • Loading branch information
caldwell and Pr0methean authored Nov 19, 2024
1 parent b8257f8 commit 73143a0
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ walkdir = "2.5.0"
time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1"
clap = { version = "=4.4.18", features = ["derive"] }
tempdir = "0.3.7"
tempfile = "3"

[features]
aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
Expand Down
20 changes: 18 additions & 2 deletions benches/read_metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::io::{self, prelude::*, Cursor};

use bencher::Bencher;
use getrandom::getrandom;
use tempdir::TempDir;
use tempfile::TempDir;
use zip::write::SimpleFileOptions;
use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter};

Expand Down Expand Up @@ -102,7 +102,7 @@ fn parse_stream_archive(bench: &mut Bencher) {
let bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap();

/* Write to a temporary file path to incur some filesystem overhead from repeated reads */
let dir = TempDir::new("stream-bench").unwrap();
let dir = TempDir::with_prefix("stream-bench").unwrap();
let out = dir.path().join("bench-out.zip");
fs::write(&out, &bytes).unwrap();

Expand All @@ -116,11 +116,27 @@ fn parse_stream_archive(bench: &mut Bencher) {
bench.bytes = bytes.len() as u64;
}

fn parse_large_non_zip(bench: &mut Bencher) {
const FILE_SIZE: usize = 17_000_000;

// Create a large file that doesn't have a zip header (generating random data _might_ make a zip magic
// number somewhere which is _not_ what we're trying to test).
let dir = TempDir::with_prefix("large-non-zip-bench").unwrap();
let file = dir.path().join("zeros");
let buf = vec![0u8; FILE_SIZE];
fs::write(&file, &buf).unwrap();

bench.iter(|| {
assert!(zip::ZipArchive::new(std::fs::File::open(&file).unwrap()).is_err());
})
}

benchmark_group!(
benches,
read_metadata,
parse_archive_with_comment,
parse_zip64_archive_with_comment,
parse_stream_archive,
parse_large_non_zip,
);
benchmark_main!(benches);
4 changes: 2 additions & 2 deletions src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1777,7 +1777,7 @@ mod test {
use crate::CompressionMethod::Stored;
use crate::{ZipArchive, ZipWriter};
use std::io::{Cursor, Read, Write};
use tempdir::TempDir;
use tempfile::TempDir;

#[test]
fn invalid_offset() {
Expand Down Expand Up @@ -1979,7 +1979,7 @@ mod test {
v.extend_from_slice(include_bytes!("../tests/data/symlink.zip"));
let mut reader = ZipArchive::new(Cursor::new(v)).unwrap();
assert!(reader.by_index(0).unwrap().is_symlink());
let tempdir = TempDir::new("test_is_symlink")?;
let tempdir = TempDir::with_prefix("test_is_symlink")?;
reader.extract(&tempdir).unwrap();
assert!(tempdir.path().join("bar").is_symlink());
Ok(())
Expand Down
13 changes: 10 additions & 3 deletions src/spec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -353,9 +353,16 @@ impl Zip32CentralDirectoryEnd {
return Err(ZipError::InvalidArchive("Invalid zip header"));
}

let search_lower_bound = 0;

const END_WINDOW_SIZE: usize = 512;
// The End Of Central Directory Record should be the last thing in
// the file and so searching the last 65557 bytes of the file should
// be enough. However, not all zips are well-formed and other
// programs may consume zips with extra junk at the end without
// error, so we go back 128K to be compatible with them. 128K is
// arbitrary, but it matches what Info-Zip does.
const EOCDR_SEARCH_SIZE: u64 = 128 * 1024;
let search_lower_bound = file_length.saturating_sub(EOCDR_SEARCH_SIZE);

const END_WINDOW_SIZE: usize = 8192;
/* TODO: use static_assertions!() */
debug_assert!(END_WINDOW_SIZE > mem::size_of::<Magic>());

Expand Down
4 changes: 2 additions & 2 deletions tests/extract_symlink.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
#[cfg(all(unix, feature = "_deflate-any"))]
fn extract_should_respect_links() {
use std::{fs, io, path::PathBuf, str::FromStr};
use tempdir::TempDir;
use tempfile::TempDir;
use zip::ZipArchive;

let mut v = Vec::new();
v.extend_from_slice(include_bytes!("data/pandoc_soft_links.zip"));
let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
let temp_dir = TempDir::new("pandoc_soft_links").unwrap();
let temp_dir = TempDir::with_prefix("pandoc_soft_links").unwrap();
archive.extract(&temp_dir).unwrap();

let symlink_path = temp_dir.path().join("pandoc-3.2-arm64/bin/pandoc-lua");
Expand Down
4 changes: 2 additions & 2 deletions tests/repro_old423.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
#[test]
fn repro_old423() -> zip::result::ZipResult<()> {
use std::io;
use tempdir::TempDir;
use tempfile::TempDir;
use zip::ZipArchive;

let mut v = Vec::new();
v.extend_from_slice(include_bytes!("data/lin-ub_iwd-v11.zip"));
let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");
archive.extract(TempDir::new("repro_old423")?)
archive.extract(TempDir::with_prefix("repro_old423")?)
}

0 comments on commit 73143a0

Please sign in to comment.