1
0
Fork 0
mirror of https://github.com/dragonflydb/dragonfly.git synced 2024-12-14 11:58:02 +00:00

chore(search): Block list (#2307)

chore(search): Block list

---------

Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io>
This commit is contained in:
Vladislav 2023-12-24 17:42:03 +03:00 committed by GitHub
parent 8bd43497f2
commit d129674e17
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 480 additions and 38 deletions

View file

@ -3,11 +3,13 @@ gen_bison(parser)
cur_gen_dir(gen_dir)
add_library(query_parser base.cc ast_expr.cc query_driver.cc search.cc indices.cc sort_indices.cc vector_utils.cc
compressed_sorted_set.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
add_library(query_parser base.cc ast_expr.cc query_driver.cc search.cc indices.cc
sort_indices.cc vector_utils.cc compressed_sorted_set.cc block_list.cc
${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
target_link_libraries(query_parser base absl::strings TRDP::reflex TRDP::uni-algo TRDP::hnswlib)
cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
cxx_test(block_list_test query_parser LABELS DFLY)
cxx_test(search_parser_test query_parser LABELS DFLY)
cxx_test(search_test query_parser LABELS DFLY)

View file

@ -0,0 +1,134 @@
#include "core/search/block_list.h"
namespace dfly::search {
using namespace std;
// Inserts `t` into the block responsible for it, creating the very first block on demand.
// Returns true if the value was added and false if it was already stored.
template <typename C> bool BlockList<C>::Insert(DocId t) {
  auto target = FindBlock(t);

  // FindBlock yields end() only when there are no blocks yet: start with one empty block
  // that shares the memory resource of the block vector.
  if (target == blocks_.end())
    target = blocks_.insert(blocks_.end(), C{blocks_.get_allocator().resource()});

  const bool inserted = target->Insert(t);
  if (inserted) {
    size_++;
    TrySplit(target);  // keep the block from overgrowing
  }
  return inserted;
}
// Removes `t` from the block that may hold it.
// Returns true if the value was stored and has been removed, false otherwise.
template <typename C> bool BlockList<C>::Remove(DocId t) {
  auto target = FindBlock(t);

  // Either there are no blocks at all or the candidate block does not contain t
  if (target == blocks_.end() || !target->Remove(t))
    return false;

  size_--;
  TryMerge(target);  // the block may have become empty or too small
  return true;
}
// Returns the block that should contain `t`: the last block whose first element is <= t,
// or the first block if t is smaller than every stored value.
// Returns end() only if there are no blocks.
template <typename C> typename BlockList<C>::BlockIt BlockList<C>::FindBlock(DocId t) {
  DCHECK(blocks_.empty() || blocks_.back().Size() > 0u);

  // Fast path for tail access: t belongs to the last block
  if (!blocks_.empty() && t >= *blocks_.back().begin())
    return --blocks_.end();

  // Find first block that can't contain t, i.e. whose first element is greater than t
  auto it = std::upper_bound(blocks_.begin(), blocks_.end(), t,
                             [](DocId t, const C& l) { return *l.begin() > t; });

  // Move to previous if possible
  if (it != blocks_.begin())
    --it;

  // Sanity checks of the size bounds. The first block is exempt from the lower bounds
  // because TryMerge never merges it into a predecessor.
  DCHECK(it == blocks_.begin() || it->Size() > 0u);
  DCHECK(it == blocks_.begin() || it->Size() * 2 >= block_size_);
  DCHECK(it == blocks_.end() || it->Size() <= 2 * block_size_);
  return it;
}
// Restores the lower size bound of `block` after a removal: an empty block is dropped,
// and a block smaller than block_size_ / 2 is folded into its left neighbour.
template <typename C> void BlockList<C>::TryMerge(BlockIt block) {
  const size_t block_len = block->Size();

  if (block_len == 0) {
    blocks_.erase(block);
    return;
  }

  // Large enough, or there is no left neighbour to merge into
  const bool is_first = (block == blocks_.begin());
  if (is_first || block_len >= block_size_ / 2)
    return;

  // Merge strictly right with left to benefit from tail insert optimizations
  const size_t left_idx = std::distance(blocks_.begin(), block) - 1;
  blocks_[left_idx].Merge(std::move(*block));
  blocks_.erase(block);

  // The merged block may now exceed the upper bound, so do not let it overgrow
  TrySplit(blocks_.begin() + left_idx);
}
// Restores the upper size bound of `block`: once it holds at least 2 * block_size_ elements
// it is replaced by two blocks produced by the container's Split().
template <typename C> void BlockList<C>::TrySplit(BlockIt block) {
  const bool overgrown = block->Size() >= block_size_ * 2;
  if (!overgrown)
    return;

  auto halves = std::move(*block).Split();

  // The second half takes the current slot, the first half is inserted right before it
  *block = std::move(halves.second);
  blocks_.insert(block, std::move(halves.first));
}
// Advances to the next element, hopping to the first element of the next block once the
// current block is exhausted. After the last block both inner iterators are reset to nullopt.
template <typename C>
typename BlockList<C>::BlockListIterator& BlockList<C>::BlockListIterator::operator++() {
  ++*block_it;

  const bool block_exhausted = (block_it == block_end);
  if (!block_exhausted)
    return *this;

  // Step to the next block, if any
  ++it;
  if (it == it_end) {
    block_it = std::nullopt;
    block_end = std::nullopt;
  } else {
    block_it = it->begin();
    block_end = it->end();
  }
  return *this;
}
// Explicit instantiations: the member functions of BlockList are defined in this file,
// so the two container types it is instantiated for are listed here.
template class BlockList<CompressedSortedSet>;
template class BlockList<SortedVector>;
// Inserts `t` keeping the entries sorted and unique.
// Returns true if the value was added, false if it was already present.
bool SortedVector::Insert(DocId t) {
  // Tail insert optimization: a value greater than the current maximum is simply appended
  const bool extends_tail = !entries_.empty() && t > entries_.back();
  if (extends_tail) {
    entries_.push_back(t);
    return true;
  }

  auto pos = std::lower_bound(entries_.begin(), entries_.end(), t);
  const bool duplicate = (pos != entries_.end()) && (*pos == t);
  if (!duplicate)
    entries_.insert(pos, t);
  return !duplicate;
}
// Removes `t` if it is stored. Returns true if it was removed, false if it was not found.
bool SortedVector::Remove(DocId t) {
  auto pos = std::lower_bound(entries_.begin(), entries_.end(), t);

  // lower_bound points at the first entry >= t; anything else means t is absent
  if (pos == entries_.end() || *pos != t)
    return false;

  entries_.erase(pos);
  return true;
}
// Adds all values from `other` to this vector, skipping values that are already present.
void SortedVector::Merge(SortedVector&& other) {
  // N log N complexity in theory, but in practice used only to merge with larger values.
  // Tail insert optimization makes it linear
  entries_.reserve(entries_.size() + other.entries_.size());

  // Iterate with the element type itself: going through `int` would force a
  // DocId -> int -> DocId round trip for every value.
  for (DocId t : other.entries_)
    Insert(t);
}
// Splits the vector into two halves, consuming *this.
// The first size / 2 entries stay in the first returned vector, the rest go to the second.
std::pair<SortedVector, SortedVector> SortedVector::Split() && {
  const auto mid = entries_.size() / 2;

  // Pass the allocator explicitly: a pmr vector built from an iterator range otherwise
  // uses a default-constructed allocator, so the second half would leave the memory
  // resource this vector was created with.
  PMR_NS::vector<DocId> tail(entries_.begin() + mid, entries_.end(), entries_.get_allocator());
  entries_.resize(mid);

  return std::make_pair(std::move(*this), SortedVector{std::move(tail)});
}
} // namespace dfly::search

View file

@ -0,0 +1,133 @@
#pragma once
#include <absl/types/span.h>
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <optional>
#include <vector>
#include "core/search/base.h"
#include "core/search/compressed_sorted_set.h"
namespace dfly::search {
// BlockList is a container wrapper for CompressedSortedSet / vector<DocId>
// to divide the full sorted id range into separate blocks. This reduces modification
// complexity from O(N) to O(logN + K), where K is the max block size.
//
// It tries to balance block sizes in the range [block_size / 2, block_size * 2]
// by splitting or merging nodes when needed.
// Sorted set of DocIds stored as a vector of Container blocks.
// FindBlock locates a block by comparing against each block's first element, so blocks are
// expected to be non-empty and ordered by their first element.
template <typename Container /* underlying container */> class BlockList {
using BlockIt = typename PMR_NS::vector<Container>::iterator;
using ConstBlockIt = typename PMR_NS::vector<Container>::const_iterator;
public:
// mr is the memory resource used for the block vector; new blocks are constructed with the
// same resource. block_size is the target block size used for split / merge decisions.
BlockList(PMR_NS::memory_resource* mr, size_t block_size = 1000)
: block_size_{block_size}, blocks_(mr) {
}
// Insert element, returns true if inserted, false if already present.
bool Insert(DocId t);
// Remove element, returns true if removed, false if not found.
bool Remove(DocId t);
// Total number of elements across all blocks.
size_t Size() const {
return size_;
}
// Same as Size(), lowercase spelling matching the stl container interface.
size_t size() const {
return size_;
}
// Read-only forward iterator that walks all blocks in order and yields their elements.
struct BlockListIterator {
// To make it work with std container constructors
using iterator_category = std::forward_iterator_tag;
using difference_type = std::ptrdiff_t;
using value_type = DocId;
using pointer = DocId*;
using reference = DocId&;
// Returns the current element by value. Must not be called on the end iterator,
// whose inner iterator is an empty optional.
DocId operator*() const {
return **block_it;
}
// Advance to the next element, moving on to the next block when the current one ends.
BlockListIterator& operator++();
friend class BlockList;
// Iterators are equal if they point to the same block and the same position inside it.
bool operator==(const BlockListIterator& other) const {
return it == other.it && block_it == other.block_it;
}
bool operator!=(const BlockListIterator& other) const {
return !operator==(other);
}
private:
// Positions the iterator at the first element of block `begin`.
// If begin == end, the inner iterators stay empty, which represents the end iterator.
BlockListIterator(ConstBlockIt begin, ConstBlockIt end) : it(begin), it_end(end) {
if (it != it_end) {
block_it = it->begin();
block_end = it->end();
}
}
ConstBlockIt it, it_end;  // current block and end of the block vector
std::optional<typename Container::iterator> block_it, block_end;  // position inside current block
};
BlockListIterator begin() const {
return BlockListIterator{blocks_.begin(), blocks_.end()};
}
BlockListIterator end() const {
return BlockListIterator{blocks_.end(), blocks_.end()};
}
private:
// Find block that should contain t. Returns end() only if empty
BlockIt FindBlock(DocId t);
void TryMerge(BlockIt block); // If needed, merge with previous block
void TrySplit(BlockIt block); // If needed, split into two blocks
private:
const size_t block_size_ = 1000;  // target block size, see constructor
size_t size_ = 0;                 // total number of stored elements
PMR_NS::vector<Container> blocks_;
};
// Supports Insert and Remove operations for keeping a sorted vector internally.
// Wrapper to use vectors with BlockList
struct SortedVector {
  // Creates an empty vector that allocates from mr.
  explicit SortedVector(PMR_NS::memory_resource* mr) : entries_(mr) {
  }

  // Insert element, returns true if inserted, false if already present.
  bool Insert(DocId t);

  // Remove element, returns true if removed, false if not found.
  bool Remove(DocId t);

  // Add all values from other.
  void Merge(SortedVector&& other);

  // Split into two halves, consuming *this.
  std::pair<SortedVector, SortedVector> Split() &&;

  // Number of stored elements. Const so that it can be queried through const references,
  // in line with CompressedSortedSet::Size().
  size_t Size() const {
    return entries_.size();
  }

  using iterator = typename PMR_NS::vector<DocId>::const_iterator;

  iterator begin() const {
    return entries_.cbegin();
  }

  iterator end() const {
    return entries_.cend();
  }

 private:
  // Takes ownership of already sorted entries, used by Split().
  explicit SortedVector(PMR_NS::vector<DocId>&& v) : entries_{std::move(v)} {
  }

  PMR_NS::vector<DocId> entries_;
};
} // namespace dfly::search

View file

@ -0,0 +1,123 @@
// Copyright 2023, DragonflyDB authors. All rights reserved.
// See LICENSE for licensing terms.
//
#include "core/search/block_list.h"
#include <absl/container/btree_set.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <algorithm>
#include <set>
#include "base/gtest.h"
#include "base/logging.h"
namespace dfly::search {
using namespace std;
// Typed fixture, instantiated once per underlying container type.
template <typename C> class BlockListTest : public testing::Test {
 public:
  // Builds an empty list on the default memory resource. The block size is deliberately
  // small so that the tests exercise the blocking mechanism more extensively.
  auto Make() {
    constexpr size_t kSmallBlockSize = 10;
    return BlockList<C>{PMR_NS::get_default_resource(), kSmallBlockSize};
  }
};
using ContainerTypes = ::testing::Types<CompressedSortedSet, SortedVector>;
TYPED_TEST_SUITE(BlockListTest, ContainerTypes);
// Interleaves inserts at the head of the lower half and at the tail of the upper half,
// checks that iteration yields 0..kNumElements-1 in order, then removes everything
// in the same interleaved order.
TYPED_TEST(BlockListTest, LoopMidInsertErase) {
  const size_t kNumElements = 50;
  auto list = this->Make();

  for (size_t i = 0; i < kNumElements / 2; i++) {
    list.Insert(i);
    list.Insert(i + kNumElements / 2);
  }

  // Collect into the id type itself rather than int to avoid signed/unsigned comparisons
  vector<DocId> out(list.begin(), list.end());
  ASSERT_EQ(list.size(), kNumElements);
  ASSERT_EQ(out.size(), kNumElements);
  for (size_t i = 0; i < kNumElements; i++)
    ASSERT_EQ(out[i], DocId(i));

  for (size_t i = 0; i < kNumElements / 2; i++) {
    list.Remove(i);
    list.Remove(i + kNumElements / 2);
  }

  out = {list.begin(), list.end()};
  EXPECT_EQ(out.size(), 0u);
  EXPECT_EQ(list.size(), 0u);
}
// Inserts kNumElements ids in descending order, then removes them in ten rounds:
// round r removes every id with id % 10 == r. Before each round the list content is
// checked against the ids that should still be present.
TYPED_TEST(BlockListTest, InsertReverseRemoveSteps) {
const size_t kNumElements = 1000;
auto list = this->Make();
// Descending insertion order: every insert goes before all stored elements
for (size_t i = 0; i < kNumElements; i++) {
list.Insert(kNumElements - i - 1);
}
for (size_t deleted_pref = 0; deleted_pref < 10; deleted_pref++) {
vector<DocId> out{list.begin(), list.end()};
// Reverse so that the smallest id is at the back and can be popped in ascending order
reverse(out.begin(), out.end());
EXPECT_EQ(out.size(), kNumElements / 10 * (10 - deleted_pref));
// Ids with i % 10 < deleted_pref were removed in earlier rounds, the rest must be present
for (size_t i = 0; i < kNumElements; i++) {
if (i % 10 >= deleted_pref) {
EXPECT_EQ(out.back(), DocId(i));
out.pop_back();
}
}
// Remove the ids belonging to this round
for (size_t i = 0; i < kNumElements; i++) {
if (i % 10 == deleted_pref)
list.Remove(i);
}
}
EXPECT_EQ(list.size(), 0u);
}
// Applies a random sequence of inserts and removes to the list and to a std::set in
// parallel, and checks after every step that both contain the same ids in the same order.
TYPED_TEST(BlockListTest, RandomNumbers) {
const size_t kNumIterations = 1'000;
auto list = this->Make();
std::set<DocId> list_copy;
for (size_t i = 0; i < kNumIterations; i++) {
// Once more than 100 ids are stored, remove a randomly chosen stored id with probability 1/5
if (list_copy.size() > 100 && rand() % 5 == 0) {
auto it = list_copy.begin();
std::advance(it, rand() % list_copy.size());
list.Remove(*it);
list_copy.erase(it);
} else {
// Otherwise insert a random id, which may already be present
DocId t = rand() % 1'000'000;
list.Insert(t);
list_copy.insert(t);
}
ASSERT_TRUE(std::equal(list.begin(), list.end(), list_copy.begin(), list_copy.end()));
}
}
// Benchmarks Remove() on ids from the upper 90% of a list filled with 0..size-1.
// The list uses the default block size.
static void BM_Erase90PctTail(benchmark::State& state) {
BlockList<CompressedSortedSet> bl{PMR_NS::get_default_resource()};
unsigned size = state.range(0);
for (size_t i = 0; i < size; i++)
bl.Insert(i);
// Ids below base are never removed
size_t base = size / 10;
size_t i = 0;
// NOTE(review): removed ids are never re-inserted, so once i wraps around the loop
// measures Remove() on ids that are no longer present - confirm this is intended.
while (state.KeepRunning()) {
benchmark::DoNotOptimize(bl.Remove(base + i));
i = (i + 1) % (size * 9 / 10);
}
}
BENCHMARK(BM_Erase90PctTail)->Args({100'000});
} // namespace dfly::search

View file

@ -97,27 +97,27 @@ CompressedSortedSet::EntryLocation CompressedSortedSet::LowerBound(IntType value
// needs to be inserted. Then it computes the differences diff1 = V - A and diff2 = B - V that need
// to be stored to encode the triple A V B. Those are stored where diff0 = B - A was previously
// stored, possibly extending the vector
void CompressedSortedSet::Insert(IntType value) {
bool CompressedSortedSet::Insert(IntType value) {
if (tail_value_ && *tail_value_ == value)
return;
return false;
if (tail_value_ && value > *tail_value_) {
PushBackDiff(value - *tail_value_);
tail_value_ = value;
return;
return true;
}
auto bound = LowerBound(value);
// At least one element was read and it's equal to value: return to avoid duplicate
if (bound.value == value && !bound.diff_span.empty())
return;
return false;
// Value is bigger than any other (or list is empty): append required diff at the end
if (value > bound.value || bound.diff_span.empty()) {
PushBackDiff(value - bound.value);
tail_value_ = value;
return;
return true;
}
size_++;
@ -141,17 +141,19 @@ void CompressedSortedSet::Insert(IntType value) {
// Now overwrite diff0 and 0s with the two new differences
copy(diff1_span.begin(), diff1_span.end(), diffs_.begin() + diff_offset);
copy(diff2_span.begin(), diff2_span.end(), diffs_.begin() + diff_offset + diff1_span.size());
return true;
}
// Remove has linear complexity. It tries to find the element V and its neighbors A and B,
// which are encoded as diff1 = V - A and diff2 = B - V. Adjacently stored diff1 and diff2
// need to be replaced with diff3 = diff1 + diff2
void CompressedSortedSet::Remove(IntType value) {
bool CompressedSortedSet::Remove(IntType value) {
auto bound = LowerBound(value);
// Nothing was read or the element was not found
if (bound.diff_span.empty() || bound.value != value)
return;
return false;
// We're removing below unconditionally
size_--;
@ -166,7 +168,7 @@ void CompressedSortedSet::Remove(IntType value) {
tail_value_ = bound.prev_value;
if (diffs_.empty())
tail_value_ = nullopt;
return;
return true;
}
// Now the list certainly contains a succeeding element B > V and possibly A < V (or 0)
@ -185,6 +187,8 @@ void CompressedSortedSet::Remove(IntType value) {
// Overwrite diff1/diff2 with new diff3
copy(diff3_buf.begin(), diff3_buf.end(), diffs_.begin() + diff_offset);
return true;
}
size_t CompressedSortedSet::Size() const {
@ -195,6 +199,35 @@ size_t CompressedSortedSet::ByteSize() const {
return diffs_.size();
}
// Adds all values from `other` to this set.
void CompressedSortedSet::Merge(CompressedSortedSet&& other) {
  // Quadratic complexity in theory, but in practice used only to merge with larger values.
  // Tail insert optimization makes it linear

  // Iterate with the type Insert() accepts: going through `int` would force an extra
  // conversion of every value before it is converted back to IntType.
  for (IntType v : other)
    Insert(v);
}
// Splits the set into two halves, consuming *this.
// The first size_ / 2 elements stay in this set's storage, the remaining elements are
// re-inserted one by one into a new set that uses the same memory resource.
std::pair<CompressedSortedSet, CompressedSortedSet> CompressedSortedSet::Split() && {
DCHECK_GT(Size(), 5u);
CompressedSortedSet second(diffs_.get_allocator().resource());
// Move iterator to middle position and save size of diffs tail
auto it = begin();
std::advance(it, size_ / 2);
// NOTE(review): presumably last_read_ spans the encoded diff of the element the iterator
// currently points at, making this the byte offset where the second half starts - confirm
// against the iterator definition.
size_t keep_bytes = it.last_read_.data() - diffs_.data();
// Copy second half into second set
for (; it != end(); ++it)
second.Insert(*it);
// Erase diffs tail
diffs_.resize(keep_bytes);
// The cached tail value belonged to the moved half, so it is dropped rather than recomputed
tail_value_ = std::nullopt;
size_ -= second.Size();
return std::make_pair(std::move(*this), std::move(second));
}
// The leftmost three bits of the first byte store the number of additional bytes. All following
// bits store the number itself.
absl::Span<uint8_t> CompressedSortedSet::WriteVarLen(IntType value, absl::Span<uint8_t> buf) {

View file

@ -7,6 +7,7 @@
#include <optional>
#include <vector>
#include "base/logging.h"
#include "base/pmr/memory_resource.h"
#include "core/search/base.h"
@ -48,7 +49,7 @@ class CompressedSortedSet {
absl::Span<const uint8_t> diffs_{};
};
friend struct Iterator;
using iterator = ConstIterator;
public:
explicit CompressedSortedSet(PMR_NS::memory_resource* mr);
@ -56,16 +57,17 @@ class CompressedSortedSet {
ConstIterator begin() const;
ConstIterator end() const;
void Insert(IntType value); // Insert arbitrary element, needs to scan whole list
void Remove(IntType value); // Remove arbitrary element, needs to scan whole list
bool Insert(IntType value); // Insert arbitrary element, needs to scan whole list
bool Remove(IntType value); // Remove arbitrary element, needs to scan whole list
size_t Size() const;
size_t ByteSize() const;
// To use transparently in templates together with stl containers
size_t size() const {
return Size();
}
// Add all values from other
void Merge(CompressedSortedSet&& other);
// Split into two equally sized halves
std::pair<CompressedSortedSet, CompressedSortedSet> Split() &&;
private:
struct EntryLocation {
@ -90,6 +92,7 @@ class CompressedSortedSet {
private:
uint32_t size_{0};
IntType head_value_{0};
std::optional<IntType> tail_value_{};
std::vector<uint8_t, PMR_NS::polymorphic_allocator<uint8_t>> diffs_;
};

View file

@ -4,10 +4,13 @@
#include "core/search/compressed_sorted_set.h"
#include <absl/container/btree_set.h>
#include <algorithm>
#include "base/gtest.h"
#include "base/logging.h"
#include "core/bptree_set.h"
namespace dfly::search {

View file

@ -88,10 +88,12 @@ vector<DocId> NumericIndex::Range(double l, double r) const {
return out;
}
BaseStringIndex::BaseStringIndex(PMR_NS::memory_resource* mr) : entries_{mr} {
template <typename C>
BaseStringIndex<C>::BaseStringIndex(PMR_NS::memory_resource* mr) : entries_{mr} {
}
const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
template <typename C>
const typename BaseStringIndex<C>::Container* BaseStringIndex<C>::Matching(string_view str) const {
str = absl::StripAsciiWhitespace(str);
string word;
@ -104,12 +106,14 @@ const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
return (it != entries_.end()) ? &it->second : nullptr;
}
CompressedSortedSet* BaseStringIndex::GetOrCreate(string_view word) {
template <typename C>
typename BaseStringIndex<C>::Container* BaseStringIndex<C>::GetOrCreate(string_view word) {
auto* mr = entries_.get_allocator().resource();
return &entries_.try_emplace(PMR_NS::string{word, mr}, mr).first->second;
return &entries_.try_emplace(PMR_NS::string{word, mr}, mr, 1000 /* block size */).first->second;
}
void BaseStringIndex::Add(DocId id, DocumentAccessor* doc, string_view field) {
template <typename C>
void BaseStringIndex<C>::Add(DocId id, DocumentAccessor* doc, string_view field) {
absl::flat_hash_set<std::string> tokens;
for (string_view str : doc->GetStrings(field))
tokens.merge(Tokenize(str));
@ -118,7 +122,8 @@ void BaseStringIndex::Add(DocId id, DocumentAccessor* doc, string_view field) {
GetOrCreate(token)->Insert(id);
}
void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field) {
template <typename C>
void BaseStringIndex<C>::Remove(DocId id, DocumentAccessor* doc, string_view field) {
absl::flat_hash_set<std::string> tokens;
for (string_view str : doc->GetStrings(field))
tokens.merge(Tokenize(str));
@ -134,6 +139,9 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
}
}
template struct BaseStringIndex<CompressedSortedSet>;
template struct BaseStringIndex<SortedVector>;
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
return TokenizeWords(value);
}

View file

@ -13,6 +13,7 @@
#include "base/pmr/memory_resource.h"
#include "core/search/base.h"
#include "core/search/block_list.h"
#include "core/search/compressed_sorted_set.h"
// TODO: move core field definitions out of big header
@ -36,7 +37,9 @@ struct NumericIndex : public BaseIndex {
};
// Base index for string based indices.
struct BaseStringIndex : public BaseIndex {
template <typename C> struct BaseStringIndex : public BaseIndex {
using Container = BlockList<C>;
BaseStringIndex(PMR_NS::memory_resource* mr);
void Add(DocId id, DocumentAccessor* doc, std::string_view field) override;
@ -46,10 +49,10 @@ struct BaseStringIndex : public BaseIndex {
virtual absl::flat_hash_set<std::string> Tokenize(std::string_view value) const = 0;
// Pointer is valid as long as index is not mutated. Nullptr if not found
const CompressedSortedSet* Matching(std::string_view str) const;
const Container* Matching(std::string_view str) const;
protected:
CompressedSortedSet* GetOrCreate(std::string_view word);
Container* GetOrCreate(std::string_view word);
struct PmrEqual {
using is_transparent = void;
@ -71,14 +74,14 @@ struct BaseStringIndex : public BaseIndex {
}
};
absl::flat_hash_map<PMR_NS::string, CompressedSortedSet, PmrHash, PmrEqual,
PMR_NS::polymorphic_allocator<std::pair<PMR_NS::string, CompressedSortedSet>>>
absl::flat_hash_map<PMR_NS::string, Container, PmrHash, PmrEqual,
PMR_NS::polymorphic_allocator<std::pair<PMR_NS::string, Container>>>
entries_;
};
// Index for text fields.
// Hashmap based lookup per word.
struct TextIndex : public BaseStringIndex {
struct TextIndex : public BaseStringIndex<CompressedSortedSet> {
TextIndex(PMR_NS::memory_resource* mr) : BaseStringIndex(mr) {
}
@ -87,7 +90,7 @@ struct TextIndex : public BaseStringIndex {
// Index for tag fields.
// Hashmap based lookup per word.
struct TagIndex : public BaseStringIndex {
struct TagIndex : public BaseStringIndex<SortedVector> {
TagIndex(PMR_NS::memory_resource* mr) : BaseStringIndex(mr) {
}

View file

@ -45,20 +45,18 @@ AstExpr ParseQuery(std::string_view query, const QueryParams* params) {
// Represents an either owned or non-owned result set that can be accessed transparently.
struct IndexResult {
using DocVec = vector<DocId>;
using BorrowedView = variant<const DocVec*, const CompressedSortedSet*>;
using BorrowedView =
variant<const DocVec*, const BlockList<CompressedSortedSet>*, const BlockList<SortedVector>*>;
IndexResult() : value_{DocVec{}} {
}
IndexResult(const CompressedSortedSet* css) : value_{css} {
if (css == nullptr)
value_ = DocVec{};
}
IndexResult(DocVec&& dv) : value_{std::move(dv)} {
}
IndexResult(const DocVec* dv) : value_{dv} {
template <typename C> IndexResult(const C* container = nullptr) : value_{container} {
if (container == nullptr)
value_ = DocVec{};
}
size_t Size() const {
@ -108,7 +106,9 @@ struct IndexResult {
}
private:
variant<DocVec /*owned*/, const CompressedSortedSet*, const DocVec*> value_;
variant<DocVec /*owned*/, const DocVec*, const BlockList<CompressedSortedSet>*,
const BlockList<SortedVector>*>
value_;
};
struct ProfileBuilder {