Skip to content

Commit

Permalink
cleanup + move bench using criterion
Browse files Browse the repository at this point in the history
  • Loading branch information
appaquet committed Mar 17, 2024
1 parent 206bf7e commit f6c9cf4
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 93 deletions.
6 changes: 4 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## [0.7.0] - 2024-02-25

- Breaking: increase value size to 24 bits (from 16 bits), which means that the
maximum value size is now 16MB (from 64KB).
maximum value size is now 16MB (from 64KB). This required the index version
to be bumped, leading to backward incompatibility.
- Fix: delete tmp directory when external sorting is used.

## [0.6.0] - 2024-02-19

- Potentially breaking: support for empty index instead of failing if the index is empty.
- Potentially breaking: support for empty index instead of failing if the index
is empty.

## [0.5.0] - 2022-08-02

Expand Down
6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ skeptic = "0.13"

[[bench]]
harness = false
name = "extindex"
name = "builder"

[[bench]]
harness = false
name = "reader"

[profile.bench]
debug = true
32 changes: 17 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
extindex
# extindex

[![crates.io](https://img.shields.io/crates/v/extindex.svg)](https://crates.io/crates/extindex)

Immutable persisted index (on disk) that can be built in one pass using a sorted iterator, or can
use [extsort](https://crates.io/crates/extsort) to externally sort the iterator first, and
then build the index from it.
Immutable persisted index (on disk) that can be built in one pass using a sorted
iterator, or can use [extsort](https://crates.io/crates/extsort) to externally
sort the iterator first, and then build the index from it.

The index allows random lookups and sorted scans. An indexed entry consists of a key and a value.
The key needs to implement `Eq` and `Ord`, and both the key and values need to implement a
`Serializable` trait for serialization to and from disk.
The index allows random lookups and sorted scans. An indexed entry consists of a
key and a value. The key needs to implement `Eq` and `Ord`, and both the key
and values need to implement a `Serializable` trait for serialization to and
from disk. It is possible to rely on the [`serde`](https://crates.io/crates/serde)
library to implement this trait for most types.

The index is built using a skip list like data structure, but in which lookups are starting from
the end of the index instead of from the beginning. This allow building the index in a single
pass on a sorted iterator, since starting from the beginning would require knowing
checkpoints/nodes ahead in the file.
The index is built using a skip list-like data structure, but lookups start from
the end of the index instead of the beginning. This allows building the index in
a single pass on a sorted iterator, as starting from the beginning would require
knowing checkpoints/nodes ahead in the file.

# Example <!-- keep in sync with serde_struct.rs -->
## Example

```rust
extern crate extindex;
Expand Down Expand Up @@ -48,6 +50,6 @@ fn main() {
}
```

# TODO
## Roadmap

- [ ] Possibility to use Bloom filter to prevent hitting the disk when index doesn't have a key
- Possibility to use a Bloom filter to avoid disk access when the index does not contain a key.
128 changes: 128 additions & 0 deletions benches/builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Copyright 2018 Andre-Philippe Paquet
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

use std::{
io::{Read, Write},
time::Duration,
};

use extindex::{Builder, Entry, Serializable};

/// Benchmarks index construction through `Builder::build` for several entry counts.
///
/// For each size, two benchmarks are registered in the `builder` group:
/// - `known size/<n>`: entries produced by `create_known_size_entries`
/// - `unknown size/<n>`: entries produced by `create_unknown_size_entries`
///
/// The temporary index file is created and dropped inside the timed closure,
/// so its creation and cleanup are included in every measurement.
fn bench_builder(c: &mut Criterion) {
    let mut group = c.benchmark_group("builder");
    // Group-wide Criterion settings: few samples, flat sampling and a short
    // warm-up (presumably because every iteration builds a whole index).
    group.sample_size(10);
    group.measurement_time(Duration::from_secs(9));
    group.sampling_mode(criterion::SamplingMode::Flat);
    group.warm_up_time(Duration::from_millis(100));

    let sizes = [10_000, 100_000, 1_000_000];
    for size in sizes {
        group.bench_with_input(BenchmarkId::new("known size", size), &size, |b, size| {
            b.iter(|| {
                // Keep the temp-file handle bound for the whole iteration so the
                // path handed to the builder stays valid until the build is done.
                let tmp_file = tempfile::NamedTempFile::new().unwrap();

                let builder = Builder::new(tmp_file.path());
                builder.build(create_known_size_entries(*size)).unwrap();
            });
        });

        group.bench_with_input(BenchmarkId::new("unknown size", size), &size, |b, size| {
            b.iter(|| {
                let tmp_file = tempfile::NamedTempFile::new().unwrap();

                let builder = Builder::new(tmp_file.path());
                builder.build(create_unknown_size_entries(*size)).unwrap();
            });
        });
    }

    // Close the group explicitly instead of relying on the implicit drop, as
    // recommended by Criterion.
    group.finish();
}

/// Lazily yields `nb_entries` entries whose key and value are `SizedString`s
/// of the form `key:<idx>` and `val:<idx>`, for `idx` in `0..nb_entries`.
fn create_known_size_entries(
    nb_entries: usize,
) -> impl Iterator<Item = Entry<SizedString, SizedString>> {
    let make_entry = |i: usize| {
        let key = SizedString(format!("key:{}", i));
        let value = SizedString(format!("val:{}", i));
        Entry::new(key, value)
    };

    (0..nb_entries).map(make_entry)
}

/// Lazily yields `nb_entries` entries whose key and value are `UnsizedString`s
/// of the form `key:<idx>` and `val:<idx>`, for `idx` in `0..nb_entries`.
fn create_unknown_size_entries(
    nb_entries: usize,
) -> impl Iterator<Item = Entry<UnsizedString, UnsizedString>> {
    let make_entry = |i: usize| {
        let key = UnsizedString(format!("key:{}", i));
        let value = UnsizedString(format!("val:{}", i));
        Entry::new(key, value)
    };

    (0..nb_entries).map(make_entry)
}

/// String wrapper whose `Serializable::size` reports the serialized length up
/// front. Used by the "known size" benchmarks.
#[derive(Ord, PartialOrd, Eq, PartialEq, Debug)]
struct SizedString(String);

impl Serializable for SizedString {
    /// Returns the number of bytes `serialize` writes: the UTF-8 length of the
    /// wrapped string.
    fn size(&self) -> Option<usize> {
        Some(self.0.len())
    }

    /// Writes the raw UTF-8 bytes of the string to `write`, without any
    /// length prefix or terminator.
    fn serialize<W: Write>(&self, write: &mut W) -> Result<(), std::io::Error> {
        write.write_all(self.0.as_bytes())
    }

    /// Reads exactly `size` bytes from `data` and decodes them as UTF-8.
    ///
    /// Invalid sequences are replaced with U+FFFD rather than reported.
    /// Returns an error only when `data` cannot supply `size` bytes.
    fn deserialize<R: Read>(data: &mut R, size: usize) -> Result<SizedString, std::io::Error> {
        let mut bytes = vec![0u8; size];
        data.read_exact(&mut bytes)?;

        // Valid UTF-8 (the expected case) reuses the freshly read buffer
        // instead of copying it; only invalid input pays for a lossy re-encode.
        let text = match String::from_utf8(bytes) {
            Ok(text) => text,
            Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
        };
        Ok(SizedString(text))
    }
}

impl std::fmt::Display for SizedString {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}

/// String wrapper whose `Serializable::size` returns `None`, i.e. it does not
/// declare its serialized length up front. Used by the "unknown size"
/// benchmarks.
#[derive(Ord, PartialOrd, Eq, PartialEq, Debug)]
pub struct UnsizedString(pub String);

impl Serializable for UnsizedString {
    /// Always `None`: the serialized length is not announced in advance.
    fn size(&self) -> Option<usize> {
        None
    }

    /// Writes the raw UTF-8 bytes of the string to `write`, without any
    /// length prefix or terminator.
    fn serialize<W: Write>(&self, write: &mut W) -> Result<(), std::io::Error> {
        write.write_all(self.0.as_bytes())
    }

    /// Reads exactly `size` bytes from `data` and decodes them as UTF-8.
    ///
    /// Invalid sequences are replaced with U+FFFD rather than reported.
    /// Returns an error only when `data` cannot supply `size` bytes.
    fn deserialize<R: Read>(data: &mut R, size: usize) -> Result<UnsizedString, std::io::Error> {
        let mut bytes = vec![0u8; size];
        data.read_exact(&mut bytes)?;

        // Valid UTF-8 (the expected case) reuses the freshly read buffer
        // instead of copying it; only invalid input pays for a lossy re-encode.
        let text = match String::from_utf8(bytes) {
            Ok(text) => text,
            Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
        };
        Ok(UnsizedString(text))
    }
}

impl std::fmt::Display for UnsizedString {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}

// Register `bench_builder` in the `benches` group and let Criterion provide
// the entry point for this bench target.
criterion_group!(benches, bench_builder,);
criterion_main!(benches);
78 changes: 3 additions & 75 deletions benches/extindex.rs → benches/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,39 +21,8 @@ use std::{

use extindex::{Builder, Entry, Reader, Serializable};

fn bench_index_builder(c: &mut Criterion) {
let mut group = c.benchmark_group("Builder");
group.sample_size(10);
group.measurement_time(Duration::from_secs(9));
group.sampling_mode(criterion::SamplingMode::Flat);
group.warm_up_time(Duration::from_millis(100));

let sizes = [10_000, 100_000, 1_000_000];
for size in sizes {
group.bench_with_input(BenchmarkId::new("known size", size), &size, |b, size| {
b.iter(|| {
let index_file = tempfile::NamedTempFile::new().unwrap();
let index_file = index_file.path();

let builder = Builder::new(index_file);
builder.build(create_known_size_entries(*size)).unwrap();
});
});

group.bench_with_input(BenchmarkId::new("unknown size", size), &size, |b, size| {
b.iter(|| {
let index_file = tempfile::NamedTempFile::new().unwrap();
let index_file = index_file.path();

let builder = Builder::new(index_file);
builder.build(create_unknown_size_entries(*size)).unwrap();
});
});
}
}

fn bench_random_access(c: &mut Criterion) {
let mut group = c.benchmark_group("RandomAccess1million");
let mut group = c.benchmark_group("random_access");
group.sample_size(10);
group.measurement_time(Duration::from_secs(7));
group.sampling_mode(criterion::SamplingMode::Flat);
Expand Down Expand Up @@ -86,7 +55,7 @@ fn bench_random_access(c: &mut Criterion) {
}

fn bench_iter(c: &mut Criterion) {
let mut group = c.benchmark_group("Iter1million");
let mut group = c.benchmark_group("iter_1million");
group.sample_size(10);
group.measurement_time(Duration::from_secs(7));
group.sampling_mode(criterion::SamplingMode::Flat);
Expand Down Expand Up @@ -139,17 +108,6 @@ fn bench_iter(c: &mut Criterion) {
});
}

fn create_known_size_entries(
nb_entries: usize,
) -> impl Iterator<Item = Entry<SizedString, SizedString>> {
(0..nb_entries).map(|idx| {
Entry::new(
SizedString(format!("key:{}", idx)),
SizedString(format!("val:{}", idx)),
)
})
}

fn create_unknown_size_entries(
nb_entries: usize,
) -> impl Iterator<Item = Entry<UnsizedString, UnsizedString>> {
Expand All @@ -161,31 +119,6 @@ fn create_unknown_size_entries(
})
}

#[derive(Ord, PartialOrd, Eq, PartialEq, Debug)]
struct SizedString(String);

impl Serializable for SizedString {
fn size(&self) -> Option<usize> {
Some(self.0.as_bytes().len())
}

fn serialize<W: Write>(&self, write: &mut W) -> Result<(), std::io::Error> {
write.write_all(self.0.as_bytes()).map(|_| ())
}

fn deserialize<R: Read>(data: &mut R, size: usize) -> Result<SizedString, std::io::Error> {
let mut bytes = vec![0u8; size];
data.read_exact(&mut bytes)?;
Ok(SizedString(String::from_utf8_lossy(&bytes).to_string()))
}
}

impl std::fmt::Display for SizedString {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}

#[derive(Ord, PartialOrd, Eq, PartialEq, Debug)]
pub struct UnsizedString(pub String);

Expand All @@ -211,10 +144,5 @@ impl std::fmt::Display for UnsizedString {
}
}

criterion_group!(
benches,
bench_index_builder,
bench_random_access,
bench_iter
);
criterion_group!(benches, bench_random_access, bench_iter);
criterion_main!(benches);

0 comments on commit f6c9cf4

Please sign in to comment.