Skip to content

Commit

Permalink
Merge branch 'Vagabond-adt/forgetful-bloom'
Browse files Browse the repository at this point in the history
  • Loading branch information
Elzor committed May 17, 2020
2 parents eb78f6a + cdbde89 commit b51e77b
Show file tree
Hide file tree
Showing 4 changed files with 230 additions and 2 deletions.
136 changes: 136 additions & 0 deletions crates/bloom/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,41 @@ struct FilterResource {
filter: RwLock<Bloom<[u8]>>,
}

struct ForgetfulFilter {
filter: Vec<Bloom<[u8]>>,
rotate_at: usize,
insertion_count: usize
}

struct ForgetfulFilterResource {
filter: RwLock<ForgetfulFilter>,
}


rustler_export_nifs!(
"bloom",
[
("new", 2, new),
("new_for_fp_rate", 2, new_for_fp_rate),
("new_forgetful", 4, new_forgetful),
("new_forgetful_for_fp_rate", 4, new_forgetful_for_fp_rate),
("serialize", 1, serialize),
("deserialize", 7, deserialize),
("set", 2, set),
("set_forgetful", 2, set_forgetful),
("check_nif", 2, check),
("check_nif", 8, check_ro),
("check_forgetful", 2, check_forgetful),
("check_and_set", 2, check_and_set),
("clear", 1, clear),
("clear_forgetful", 1, clear_forgetful),
],
Some(on_load)
);

fn on_load<'a>(env: Env<'a>, _load_info: Term<'a>) -> bool {
resource_struct_init!(FilterResource, env);
resource_struct_init!(ForgetfulFilterResource, env);
true
}

Expand All @@ -65,6 +82,52 @@ fn new_for_fp_rate<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
Ok((atoms::ok(), resource).encode(env))
}

fn new_forgetful<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let bitmap_size: i64 = args[0].decode()?;
let items_count: i64 = args[1].decode()?;
let capacity: usize = args[2].decode()?;
let rotate_at: usize = args[3].decode()?;

let mut forgetfulfilter = ForgetfulFilter {
filter: Vec::with_capacity(capacity),
rotate_at: rotate_at,
insertion_count: 0
};

for _x in 0..capacity {
forgetfulfilter.filter.push(Bloom::new(bitmap_size as usize, items_count as usize))
}

let resource = ResourceArc::new(ForgetfulFilterResource {
filter: RwLock::new(forgetfulfilter),
});

Ok((atoms::ok(), resource.encode(env)).encode(env))
}

fn new_forgetful_for_fp_rate<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let items_count: i64 = args[0].decode()?;
let fp_p: f64 = args[1].decode()?;
let capacity: usize = args[2].decode()?;
let rotate_at: usize = args[3].decode()?;

let mut forgetfulfilter = ForgetfulFilter {
filter: Vec::with_capacity(capacity),
rotate_at: rotate_at,
insertion_count: 0
};

for _x in 0..capacity {
forgetfulfilter.filter.push(Bloom::new_for_fp_rate(items_count as usize, fp_p))
}

let resource = ResourceArc::new(ForgetfulFilterResource {
filter: RwLock::new(forgetfulfilter),
});

Ok((atoms::ok(), resource.encode(env)).encode(env))
}

fn serialize<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let resource: ResourceArc<FilterResource> = args[0].decode()?;

Expand Down Expand Up @@ -122,6 +185,47 @@ fn set<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
Ok(atoms::ok().encode(env))
}

fn set_forgetful<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let resource: ResourceArc<ForgetfulFilterResource> = args[0].decode()?;
let key: Binary = if args[1].is_binary() {
args[1].decode()?
} else {
Binary::from_owned(args[1].to_binary(), env)
};

let mut filter = resource.filter.write().unwrap();

// check membership
let mut member = false;
// check the overlapping blooms 2 by 2
for x in 0..filter.filter.len() - 2 {
if filter.filter[x].check(&key) && filter.filter[x+1].check(&key) {
member = true;
break;
}
}
if !member {
// check last bloom
member = filter.filter[filter.filter.len() - 1].check(&key);
}

if !member {
filter.insertion_count+=1;
if filter.insertion_count >= filter.rotate_at {
filter.insertion_count = 0;
// rotate the oldest bloom to the start of the list
// and clear it
filter.filter.rotate_right(1);
filter.filter[0].clear();
}
// set in the future and current
filter.filter[0].set(&key);
filter.filter[1].set(&key);
}

Ok(member.encode(env))
}

fn check<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let resource: ResourceArc<FilterResource> = args[0].decode()?;
let key: Binary = if args[1].is_binary() {
Expand All @@ -135,6 +239,26 @@ fn check<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
Ok(filter.check(&key).encode(env))
}

fn check_forgetful<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let resource: ResourceArc<ForgetfulFilterResource> = args[0].decode()?;
let key: Binary = if args[1].is_binary() {
args[1].decode()?
} else {
Binary::from_owned(args[1].to_binary(), env)
};

let filter = resource.filter.read().unwrap();

// check the overlapping blooms 2 by 2
for x in 0..filter.filter.len() - 2 {
if filter.filter[x].check(&key) && filter.filter[x+1].check(&key) {
return Ok(true.encode(env))
}
}
// check last bloom
return Ok(filter.filter[filter.filter.len() - 1].check(&key).encode(env))
}

// check a serialized bloom for key membership without fully deserializing the bloom
// specifically we want to avoid the very slow bitvec deserialization and simply compute
// the hash keys manually and check them inside the Erlang binary by hand
Expand Down Expand Up @@ -204,3 +328,15 @@ fn clear<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {

Ok(atoms::ok().encode(env))
}

fn clear_forgetful<'a>(env: Env<'a>, args: &[Term<'a>]) -> NifResult<Term<'a>> {
let resource: ResourceArc<ForgetfulFilterResource> = args[0].decode()?;

let mut filter = resource.filter.write().unwrap();
for x in 0..filter.filter.len() -1 {
filter.filter[x].clear();
}
filter.insertion_count = 0;

Ok(atoms::ok().encode(env))
}
27 changes: 26 additions & 1 deletion src/bloom.erl
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,20 @@
%% API
-export([new/2,
new_for_fp_rate/2,
new_forgetful/4,
new_forgetful_for_fp_rate/4,
serialize/1,
deserialize/1,
deserialize/7,
to_bin/1,
from_bin/1,
set/2,
set_forgetful/2,
check/2,
check_forgetful/2,
check_and_set/2,
clear/1
clear/1,
clear_forgetful/1
]).

%% Native library support
Expand Down Expand Up @@ -41,6 +46,14 @@ new(_BitmapSize, _ItemsCount) ->
new_for_fp_rate(_ItemsCount, _FP_Rate) ->
not_loaded(?LINE).

%% @private
new_forgetful(_BitmapSize, _ItemCount, _Capacity, _RotateAfter) ->
not_loaded(?LINE).

%% @private
new_forgetful_for_fp_rate(_ItemCount, _FalsePositiveRate, _Capacity, _RotateAfter) ->
not_loaded(?LINE).

%% @doc Serialize a bloom filter to Erlang terms. `check/2' can be used against this serialized form efficently.
-spec serialize(Bloom :: bloom()) -> {ok, serialized_bloom()}.
serialize(_Ref) ->
Expand Down Expand Up @@ -75,6 +88,10 @@ deserialize(_Bitmap, _NumBits, _NumFuns, _Sv00, _Sv01, _Sv10, _Sv11) ->
set(_Ref, _Key) ->
not_loaded(?LINE).

%% @private
set_forgetful(_Bloom, _Key) ->
not_loaded(?LINE).

%% @doc Check for the presence of `Key' in `Bloom'.
%% Serialized and binary encoded bloom filters can be used with this
%% function when you wish to check for the key and do not need to use set
Expand All @@ -89,6 +106,10 @@ check(<<?ERBLOOM_VERSION1:8/integer, NumBits:64/integer-unsigned-little, NumFuns
check({Bitmap,NumBits,NumFuns,{Sv00,Sv01},{Sv10,Sv11}}, Key) ->
check_nif(Bitmap, NumBits, NumFuns, Sv00, Sv01, Sv10, Sv11, Key).

%% @private
check_forgetful(_Bloom, _Key) ->
not_loaded(?LINE).

%% @doc Record the presence of `Key' in `Bloom' and return whether it was present before.
-spec check_and_set(Bloom :: bloom(), Key :: term()) -> boolean().
check_and_set(_Ref, _Key) ->
Expand All @@ -99,6 +120,10 @@ check_and_set(_Ref, _Key) ->
clear(_Ref) ->
not_loaded(?LINE).

%% @private
clear_forgetful(_Ref) ->
not_loaded(?LINE).

check_nif(_Ref, _Key) ->
not_loaded(?LINE).

Expand Down
43 changes: 43 additions & 0 deletions src/forgetful_bloom.erl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
%% @doc This is an implementation of Forgetful Bloom Filters [http://dprg.cs.uiuc.edu/docs/fbf_cac15/fbfpaper-2.pdf] built on top of [https://crates.io/crates/bloomfilter]

-module(forgetful_bloom).
%% API
-export([new/4,
new_for_fp_rate/4,
set/2,
check/2,
clear/1
]).


-opaque bloom() :: reference().
-export_type([bloom/0]).

%% @doc Create a new forgetful bloom filter structure. `BitmapSize' is the size in bytes (not bits) that will be allocated in memory `ItemsCount' is an estimation of the maximum number of items to store, `NumFilters' is the number of filters to maintain (minimum of 3) and `RotateAfter' is how many insertions to do into a filter before rotating a blank filter into the `future' position.
-spec new(BitmapSize :: pos_integer(), ItemsCount :: pos_integer(), NumFilters :: pos_integer(), RotateAfter :: pos_integer()) -> {ok, Bloom :: bloom()}.
new(BitmapSize, ItemsCount, NumFilters, RotateAfter) when NumFilters > 2 ->
bloom:new_forgetful(BitmapSize, ItemsCount, NumFilters, RotateAfter).

%% @doc Create a new forgetful bloom filter structure. `ItemsCount' is an estimation of the maximum number of items to store. `FalsePositiveRate' is the wanted rate of false positives, in ]0.0, 1.0[, `NumFilters' is the number of filters to maintain (minimum of 3) and `RotateAfter' is how many insertions to do into a filter before rotating a blank filter into the `future' position.
-spec new_for_fp_rate(ItemsCount :: pos_integer(), FalsePositiveRate :: float(), NumFilters :: pos_integer(), RotateAfter :: pos_integer()) -> {ok, Bloom :: bloom()}.
new_for_fp_rate(ItemsCount, FP_Rate, NumFilters, RotateAfter) ->
bloom:new_forgetful_for_fp_rate(ItemsCount, FP_Rate, NumFilters, RotateAfter).

%% @doc Record the presence of `Key' in `ForgetfulBloom'. Like `bloom:check_and_set/2' a boolean is returned to indicate if the value was already present.
%% @see bloom:check_and_set/2
-spec set(ForgetfulBloom :: bloom(), Key :: term()) -> WasAlreadyPresent :: boolean().
set(Ref, Key) ->
bloom:set_forgetful(Ref, Key).

%% @doc Check for the presence of `Key' in `ForgetfulBloom'.
-spec check(ForgetfulBloom :: bloom(), term()) -> boolean().
check(Ref, Key) ->
bloom:check_forgetful(Ref, Key).

%% @doc Clear all of the bits in the filter, removing all keys from the set.
-spec clear(ForgetfulBloom :: bloom()) -> ok.
clear(Ref) ->
bloom:clear_forgetful(Ref).



26 changes: 25 additions & 1 deletion test/bloom_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ groups() ->
[
{bloom,
[parallel, shuffle],
[new, serialize, deserialize, set, check, check_and_set, clear, to_from_bin]},
[new, serialize, deserialize, set, check, check_and_set, clear, to_from_bin, forgetful]},

{perf,
[shuffle],
Expand Down Expand Up @@ -130,6 +130,30 @@ to_from_bin(_) ->
false = bloom:check(Bin, Key2),
ok.

forgetful(_) ->
{ok, Ref} = forgetful_bloom:new(50,80,3,1),
Key = <<"binkeyfortest">>,
Key2 = <<"binkeyfortestingmore">>,
Key3 = <<"icanseemygousefromhere">>,
Key4 = <<"icantbelieveitsnotbutter">>,
Key5 = <<"myhovercraftisfullofeels">>,
Key6 = <<"ivefallenandicantgetup">>,
Keys = [Key, Key2, Key3, Key4, Key5, Key6],
[] = [K || K <- Keys, forgetful_bloom:check(Ref, K)],
false = forgetful_bloom:set(Ref, Key),
?assertEqual([Key], [K || K <- Keys, forgetful_bloom:check(Ref, K)]),
false = forgetful_bloom:set(Ref, Key2),
?assertEqual([Key, Key2], [K || K <- Keys, forgetful_bloom:check(Ref, K)]),
false = forgetful_bloom:set(Ref, Key3),
?assertEqual([Key, Key2, Key3], [K || K <- Keys, forgetful_bloom:check(Ref, K)]),
false = forgetful_bloom:set(Ref, Key4),
?assertEqual([Key2, Key3, Key4], [K || K <- Keys, forgetful_bloom:check(Ref, K)]),
false = forgetful_bloom:set(Ref, Key5),
?assertEqual([Key3, Key4, Key5], [K || K <- Keys, forgetful_bloom:check(Ref, K)]),
false = forgetful_bloom:set(Ref, Key6),
?assertEqual([Key4, Key5, Key6], [K || K <- Keys, forgetful_bloom:check(Ref, K)]),
ok.

%% =============================================================================
%% group: perf
%% =============================================================================
Expand Down

0 comments on commit b51e77b

Please sign in to comment.