1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
use std::marker::PhantomData;
use std::sync::Arc;
/// A Bloom Filter, a probabilistic set.
/// Elements may be added to the filter, and then the filter may be tested for membership, with
/// false positives. The false positivity rate is determined by the size of the Bloom filter and
/// the number of hashes.
pub struct BloomFilter<T, F> {
masks: Vec<u8>,
hashes: Arc<Vec<F>>,
_phantom: PhantomData<T>,
}
impl<T, F> BloomFilter<T, F>
where
F: Fn(&T) -> usize + Send + Sync,
{
/// Create a new Bloom filter, with the given size in bits and the given list of hashes to be
/// applied to all members on addition and query.
pub fn new(bits: usize, hashes: Vec<F>) -> BloomFilter<T, F> {
BloomFilter {
masks: vec![0; bits >> 3],
hashes: Arc::new(hashes),
_phantom: PhantomData,
}
}
/// Add `elem` to the Bloom filter.
pub fn add(&mut self, elem: &T) {
self.hashes.iter().for_each(|hash| {
let h = hash(elem);
self.masks[h >> 3] |= 1 << (h & 0b111);
});
}
/// True if `elem` is in the set.
/// If `elem` is not in the set, this method returns False; i.e., this method return false
/// positives, but not false negatives.
pub fn is_member_prob(&self, elem: &T) -> bool {
self.hashes.iter().all(|hash| {
let h = hash(elem);
self.masks[h >> 3] & (1 << (h & 0b111)) != 0
})
}
/// True if `elem` is in the set, lazily confirming the result with the `confirm` closure to
/// guard against false positives.
pub fn is_member<G>(&self, elem: &T, confirm: G) -> bool
where
G: Fn(&T) -> bool,
{
self.is_member_prob(elem) && confirm(elem)
}
/// Modifies `self` to include elements from `other`.
/// The false positivity rate of the resultant bloom filter will be greater than or equal to
/// the maximum of the false positivity rates of the two operands.
pub fn union(&mut self, other: &Self) {
let l = usize::max(self.masks.len(), other.masks.len());
for i in 0..l {
self.masks[i] |= other.masks[i];
}
}
}
impl<T, F> Clone for BloomFilter<T, F> {
fn clone(&self) -> BloomFilter<T, F> {
BloomFilter {
masks: self.masks.clone(),
hashes: Arc::clone(&self.hashes),
_phantom: PhantomData,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use libbgs_util::*;
#[test]
fn test_bloom_filter() {
let mut hashes = Vec::<Box<dyn Fn(&u128) -> usize + Send + Sync>>::new();
hashes.push(Box::new(|x| (x % 10_000) as usize));
hashes.push(Box::new(|x| ((x >> 32) % 10_000) as usize));
let mut filter = BloomFilter::<u128, _>::new(10_000, hashes);
for i in 100_000..101_000 {
filter.add(&intpow::<0>(i * 1000 + i * 10 + i, 2));
}
for i in 100_000..100_500 {
let x = intpow::<0>(i * 1000 + i * 10 + i, 2);
let check = filter.is_member_prob(&x);
assert!(check);
}
let mut all = true;
for i in 1_501..2_000 {
let x = intpow::<0>(i * 1000 + i * 10 + i, 2);
all &= filter.is_member_prob(&x);
if !all {
break;
}
}
assert!(!all);
}
}