1
0
Fork 0
mirror of https://github.com/dragonflydb/dragonfly.git synced 2024-12-15 17:51:06 +00:00

chore: bloom test - cover corner cases (#2806)

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2024-04-02 09:18:36 +03:00 committed by GitHub
parent d3b90c8210
commit a93ad4e86f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 45 additions and 10 deletions

View file

@ -51,18 +51,17 @@ void Bloom::Init(uint64_t entries, double fp_prob, PMR_NS::memory_resource* heap
CHECK(bf_ == nullptr);
CHECK(fp_prob > 0 && fp_prob < 1);
if (entries < 1024)
entries = 1024;
if (fp_prob > 0.5)
fp_prob = 0.5;
double bpe = BPE(fp_prob);
hash_cnt_ = ceil(M_LN2 * bpe);
uint64_t bits = uint64_t(ceil(entries * bpe));
bits = absl::bit_ceil(bits); // make it power of 2.
if (bits < 1024) {
bits = 1024;
if (bits < 512) {
bits = 512;
}
bits = absl::bit_ceil(bits); // make it power of 2.
uint64_t length = bits / 8;
bf_ = (uint8_t*)heap->allocate(length);
@ -111,6 +110,8 @@ bool Bloom::Add(const uint64_t fp[2]) {
}
// Maximum number of elements this filter can hold for the given
// false-positive probability, i.e. floor(bitlen / bits-per-element).
size_t Bloom::Capacity(double fp_prob) const {
  // Probabilities above 0.5 are clamped to 0.5, mirroring Init().
  const double capped_prob = fp_prob > 0.5 ? 0.5 : fp_prob;
  const double bits_per_elem = BPE(capped_prob);
  return floor(bitlen() / bits_per_elem);
}
@ -197,4 +198,14 @@ bool SBF::Exists(std::string_view str) const {
return any_of(filters_.crbegin(), filters_.crend(), exists);
}
size_t SBF::MallocUsed() const {
size_t res = filters_.capacity() * sizeof(Bloom);
for (const auto& b : filters_) {
res += (b.bitlen() / 8);
}
res += sizeof(SBF);
return res;
}
} // namespace dfly

View file

@ -50,7 +50,8 @@ class Bloom {
return 1ULL << bit_log_;
}
// Note that max element capacity is floor(bit_len / bpe), where bpe (bits per element) is
// Max element capacity for this bloom filter.
// Note that capacity is floor(bit_len / bpe), where bpe (bits per element) is
// derived from fp_prob.
size_t Capacity(double fp_prob) const;
@ -83,12 +84,23 @@ class SBF {
bool Add(std::string_view str);
bool Exists(std::string_view str) const;
// Total number of elements inserted across all filter levels
// (the current level plus all sealed previous levels).
size_t GetSize() const {
  return current_size_ + prev_size_;
}
size_t MallocUsed() const;
double grow_factor() const {
return grow_factor_;
}
private:
// multiple filters from the smallest to the largest.
std::vector<Bloom, PMR_NS::polymorphic_allocator<Bloom>> filters_;
double grow_factor_;
double fp_prob_;
size_t current_size_ = 0;
size_t prev_size_ = 0;
size_t max_capacity_;
};

View file

@ -61,19 +61,31 @@ TEST_F(BloomTest, ErrorBound) {
EXPECT_EQ(collisions, 0) << max_capacity;
}
TEST_F(BloomTest, Extreme) {
  Bloom bloom;

  // An absurdly large error probability with a tiny requested capacity
  // must still produce the minimal 512-bit table.
  bloom.Init(10, 0.999, PMR_NS::get_default_resource());
  EXPECT_EQ(512, bloom.bitlen());

  // The element capacity must stay strictly below the bit length.
  EXPECT_LT(bloom.Capacity(0.999), 512);

  bloom.Destroy(PMR_NS::get_default_resource());
}
// NOTE(review): the diff render interleaved removed and added lines here,
// leaving a duplicate `kNumElems` definition and two contradictory
// TODO/EXPECT pairs. Reconstructed below is the post-commit (added) side:
// kNumElems = 2000000 and the 0.008 error bound.
TEST_F(BloomTest, SBF) {
  // Scalable bloom filter: initial capacity 10, fp probability 0.001,
  // growth factor 2.
  SBF sbf(10, 0.001, 2, PMR_NS::get_default_resource());

  unsigned collisions = 0;
  constexpr unsigned kNumElems = 2000000;
  for (unsigned i = 0; i < kNumElems; ++i) {
    if (!sbf.Add(absl::StrCat("item", i))) {
      ++collisions;
    }
  }

  // TODO: to revisit the math for deriving number of hash functions for each filter
  // according to the SBF paper.
  EXPECT_LE(collisions, kNumElems * 0.008);
}
static void BM_BloomExist(benchmark::State& state) {