forked from databendlabs/databend
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
From the test, XOR8 is better FUSE8 for the applications, FUSE8 should check the duplicate keys(like hyperloglog) Test Result: fuse8: u64 bitmap encoding:1130544 bytes, raw:8000000 bytes, ratio:0.141318 fuse8: bool bitmap encoding:1130544 bytes, raw:1000000 bytes, ratio:1.130544 fuse8: string encoding:118832 bytes, raw:3000000 bytes, ratio:0.039610665 fuse8: same string encoding:118832 bytes, raw:3000000 bytes, ratio:0.039610665 xor8: u64 bitmap encoding:1230069 bytes, raw:8000000 bytes, ratio:0.15375863 xor8: bool bitmap encoding:61 bytes, raw:1000000 bytes, ratio:0.000061 xor8: string encoding:123067 bytes, raw:3000000 bytes, ratio:0.041022334 xor8: same string encoding:61 bytes, raw:3000000 bytes, ratio:0.000020333333
- Loading branch information
Showing
5 changed files
with
319 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
// Copyright 2022 Datafuse Labs. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
use std::hash::Hash; | ||
|
||
use cbordata::Cbor; | ||
use cbordata::FromCbor; | ||
use cbordata::IntoCbor; | ||
use common_exception::ErrorCode; | ||
use common_exception::Result; | ||
use xorfilter::Fuse8; | ||
|
||
use crate::bloom::Bloom; | ||
|
||
pub struct Fuse8Bloom { | ||
filter: Fuse8, | ||
} | ||
|
||
impl Fuse8Bloom { | ||
pub fn create(key_lens: u32) -> Self { | ||
Fuse8Bloom { | ||
filter: Fuse8::new(key_lens), | ||
} | ||
} | ||
} | ||
|
||
impl Bloom for Fuse8Bloom { | ||
fn len(&self) -> Result<usize> { | ||
match self.filter.len() { | ||
Some(n) => Ok(n), | ||
None => Err(ErrorCode::UnImplement("Fuse8 does not implement len()")), | ||
} | ||
} | ||
|
||
fn is_empty(&self) -> bool { | ||
match self.filter.len() { | ||
Some(n) => n == 0, | ||
None => true, | ||
} | ||
} | ||
|
||
fn add_key<K: ?Sized + Hash>(&mut self, key: &K) { | ||
self.filter.insert(key) | ||
} | ||
|
||
fn add_keys<K: Hash>(&mut self, keys: &[K]) { | ||
self.filter.populate(keys) | ||
} | ||
|
||
fn build(&mut self) -> Result<()> { | ||
self.filter | ||
.build() | ||
.map_err(|e| ErrorCode::UnexpectedError(format!("Fuse8.build error:{:?}", e))) | ||
} | ||
|
||
fn contains<K: ?Sized + Hash>(&self, key: &K) -> bool { | ||
self.filter.contains(key) | ||
} | ||
|
||
fn to_bytes(&self) -> Result<Vec<u8>> { | ||
let mut buf: Vec<u8> = vec![]; | ||
let cbor_val = self | ||
.filter | ||
.clone() | ||
.into_cbor() | ||
.map_err(|e| ErrorCode::UnexpectedError(format!("Fuse8.into_cbor error:{:}", e)))?; | ||
cbor_val | ||
.encode(&mut buf) | ||
.map_err(|e| ErrorCode::UnexpectedError(format!("Fuse8.encode error:{:}", e)))?; | ||
|
||
Ok(buf) | ||
} | ||
|
||
fn from_bytes(mut buf: &[u8]) -> Result<(Self, usize)> { | ||
let (cbor_val, n) = Cbor::decode(&mut buf) | ||
.map_err(|e| ErrorCode::UnexpectedError(format!("Fuse8.cbor.decode error:{:}", e)))?; | ||
|
||
let xor_value = Fuse8::from_cbor(cbor_val) | ||
.map_err(|e| ErrorCode::UnexpectedError(format!("Fuse8.from_cbor error:{:}", e)))?; | ||
Ok((Self { filter: xor_value }, n)) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,222 @@ | ||
// Copyright 2021 Datafuse Labs. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
use common_datablocks::DataBlock; | ||
use common_datavalues::prelude::*; | ||
use common_datavalues::DataField; | ||
use common_datavalues::DataSchemaRefExt; | ||
use common_datavalues::Series; | ||
use common_datavalues::SeriesFrom; | ||
use common_exception::Result; | ||
use common_storages_index::bloom::Bloom; | ||
use common_storages_index::bloom::Fuse8Bloom; | ||
use rand::prelude::random; | ||
use rand::rngs::StdRng; | ||
use rand::Rng; | ||
use rand::SeedableRng; | ||
|
||
#[test] | ||
fn test_xor_bitmap_u64() -> Result<()> { | ||
let seed: u64 = random(); | ||
let numbers = 1_000_000; | ||
|
||
let size = 8 * numbers; | ||
let mut rng = StdRng::seed_from_u64(seed); | ||
let keys: Vec<u64> = (0..numbers).map(|_| rng.gen::<u64>()).collect(); | ||
|
||
let mut filter = Fuse8Bloom::create(numbers as u32); | ||
filter.add_keys(&keys); | ||
filter.build()?; | ||
|
||
for key in keys.iter() { | ||
assert!(filter.contains(key), "key {} not present", key); | ||
} | ||
|
||
let val = filter.to_bytes()?; | ||
let (_, n) = Fuse8Bloom::from_bytes(&val)?; | ||
assert_eq!(n, val.len(), "{} {}", n, val.len()); | ||
|
||
// Lock the size. | ||
assert_eq!(n, 1130544); | ||
|
||
// fuse8: u64 bitmap encoding:1130544 bytes, raw:8000000 bytes, ratio:0.141318 | ||
println!( | ||
"fuse8: u64 bitmap encoding:{} bytes, raw:{} bytes, ratio:{}", | ||
val.len(), | ||
size, | ||
val.len() as f32 / size as f32 | ||
); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_xor_bitmap_bool() -> Result<()> { | ||
let seed: u64 = random(); | ||
let numbers = 1_000_000; | ||
|
||
let size = numbers; | ||
let mut rng = StdRng::seed_from_u64(seed); | ||
let keys: Vec<bool> = (0..numbers).map(|_| rng.gen::<u64>() % 2 == 0).collect(); | ||
|
||
let mut filter = Fuse8Bloom::create(numbers as u32); | ||
for key in keys.clone().into_iter() { | ||
filter.add_key(&key); | ||
} | ||
filter.build()?; | ||
|
||
for key in keys.iter() { | ||
assert!(filter.contains(key), "key {} not present", key); | ||
} | ||
|
||
let val = filter.to_bytes()?; | ||
let (_, n) = Fuse8Bloom::from_bytes(&val)?; | ||
assert_eq!(n, val.len(), "{} {}", n, val.len()); | ||
|
||
// Lock the size. | ||
assert_eq!(n, 1130544); | ||
|
||
// fuse8: bool bitmap encoding:1130544 bytes, raw:1000000 bytes, ratio:1.130544 | ||
println!( | ||
"fuse8: bool bitmap encoding:{} bytes, raw:{} bytes, ratio:{}", | ||
val.len(), | ||
size, | ||
val.len() as f32 / size as f32 | ||
); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_xor_bitmap_string() -> Result<()> { | ||
let seed: u64 = random(); | ||
let numbers = 100_000; | ||
|
||
let len = 30; | ||
let size = 30 * numbers; | ||
let mut rng = StdRng::seed_from_u64(seed); | ||
const CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ\ | ||
abcdefghijklmnopqrstuvwxyz\ | ||
0123456789)(*&^%$#@!~"; | ||
|
||
let keys: Vec<String> = (0..numbers) | ||
.map(|_| { | ||
(0..len) | ||
.map(|_| { | ||
let idx = rng.gen_range(0..CHARSET.len()); | ||
CHARSET[idx] as char | ||
}) | ||
.collect() | ||
}) | ||
.collect(); | ||
|
||
let mut filter = Fuse8Bloom::create(numbers as u32); | ||
filter.add_keys(&keys); | ||
filter.build()?; | ||
|
||
for key in keys.iter() { | ||
assert!(filter.contains(key), "key {} not present", key); | ||
} | ||
|
||
let val = filter.to_bytes()?; | ||
let (_, n) = Fuse8Bloom::from_bytes(&val)?; | ||
assert_eq!(n, val.len(), "{} {}", n, val.len()); | ||
|
||
// Lock the size. | ||
assert_eq!(n, 118832); | ||
|
||
// fuse8: string encoding:118832 bytes, raw:3000000 bytes, ratio:0.039610665 | ||
println!( | ||
"fuse8: string encoding:{} bytes, raw:{} bytes, ratio:{}", | ||
val.len(), | ||
size, | ||
val.len() as f32 / size as f32 | ||
); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_xor_bitmap_duplicate_string() -> Result<()> { | ||
let numbers = 100_000; | ||
|
||
let key = "123456789012345678901234567890"; | ||
let len = key.len(); | ||
let size = len * numbers; | ||
|
||
let keys: Vec<String> = (0..numbers).map(|_| key.to_string()).collect(); | ||
|
||
let mut filter = Fuse8Bloom::create(numbers as u32); | ||
for key in keys.clone().into_iter() { | ||
filter.add_key(&key); | ||
} | ||
filter.build()?; | ||
|
||
assert!(filter.contains(&keys[0]), "key {} not present", key); | ||
|
||
let val = filter.to_bytes()?; | ||
let (_, n) = Fuse8Bloom::from_bytes(&val)?; | ||
assert_eq!(n, val.len(), "{} {}", n, val.len()); | ||
|
||
// Lock the size. | ||
assert_eq!(n, 118832); | ||
|
||
// fuse8: same string encoding:118832 bytes, raw:3000000 bytes, ratio:0.039610665 | ||
println!( | ||
"fuse8: same string encoding:{} bytes, raw:{} bytes, ratio:{}", | ||
val.len(), | ||
size, | ||
val.len() as f32 / size as f32 | ||
); | ||
|
||
Ok(()) | ||
} | ||
|
||
#[test] | ||
fn test_xor_bitmap_data_block() -> Result<()> { | ||
let seed: u64 = random(); | ||
let numbers = 1_000_000; | ||
|
||
let size = 8 * numbers; | ||
let mut rng = StdRng::seed_from_u64(seed); | ||
let keys: Vec<i64> = (0..numbers).map(|_| rng.gen::<i64>()).collect(); | ||
|
||
let schema = DataSchemaRefExt::create(vec![DataField::new("a", i64::to_data_type())]); | ||
let block = DataBlock::create(schema, vec![Series::from_data(keys)]); | ||
let column = block.try_column_by_name("a")?; | ||
|
||
let mut filter = Fuse8Bloom::create(numbers as u32); | ||
filter.add_keys(&column.to_values()); | ||
filter.build()?; | ||
|
||
for key in column.to_values() { | ||
assert!(filter.contains(&key), "key {} not present", key); | ||
} | ||
|
||
let val = filter.to_bytes()?; | ||
let (_, n) = Fuse8Bloom::from_bytes(&val)?; | ||
assert_eq!(n, val.len(), "{} {}", n, val.len()); | ||
|
||
// data block(i64) enc:1130544, raw:8000000, ratio:0.141318 | ||
// Actually, it not related to datablock, it related to the type of the column | ||
// Here it same as u64. | ||
println!( | ||
"data block(i64) enc:{}, raw:{}, ratio:{}", | ||
val.len(), | ||
size, | ||
val.len() as f32 / size as f32 | ||
); | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters