mirror of
https://github.com/dragonflydb/dragonfly.git
synced 2024-12-14 11:58:02 +00:00
chore(search): Block list (#2307)
chore(search): Block list --------- Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io>
This commit is contained in:
parent
8bd43497f2
commit
d129674e17
10 changed files with 480 additions and 38 deletions
|
@ -3,11 +3,13 @@ gen_bison(parser)
|
|||
|
||||
cur_gen_dir(gen_dir)
|
||||
|
||||
add_library(query_parser base.cc ast_expr.cc query_driver.cc search.cc indices.cc sort_indices.cc vector_utils.cc
|
||||
compressed_sorted_set.cc ${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
||||
add_library(query_parser base.cc ast_expr.cc query_driver.cc search.cc indices.cc
|
||||
sort_indices.cc vector_utils.cc compressed_sorted_set.cc block_list.cc
|
||||
${gen_dir}/parser.cc ${gen_dir}/lexer.cc)
|
||||
|
||||
target_link_libraries(query_parser base absl::strings TRDP::reflex TRDP::uni-algo TRDP::hnswlib)
|
||||
|
||||
cxx_test(compressed_sorted_set_test query_parser LABELS DFLY)
|
||||
cxx_test(block_list_test query_parser LABELS DFLY)
|
||||
cxx_test(search_parser_test query_parser LABELS DFLY)
|
||||
cxx_test(search_test query_parser LABELS DFLY)
|
||||
|
|
134
src/core/search/block_list.cc
Normal file
134
src/core/search/block_list.cc
Normal file
|
@ -0,0 +1,134 @@
|
|||
#include "core/search/block_list.h"
|
||||
|
||||
namespace dfly::search {
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Insert id `t` into the list. Returns true if inserted, false if it was
// already present. Cost is O(log(#blocks) + block size).
template <typename C> bool BlockList<C>::Insert(DocId t) {
  // Find the block that should hold t; FindBlock returns end() only when the
  // list is empty, in which case the first block is created here.
  auto block = FindBlock(t);
  if (block == blocks_.end())
    block = blocks_.insert(blocks_.end(), C{blocks_.get_allocator().resource()});

  if (!block->Insert(t))
    return false;  // duplicate — underlying container rejected it

  size_++;
  TrySplit(block);  // keep the block below the upper size bound
  return true;
}
|
||||
|
||||
// Remove id `t` from the list. Returns true if it was present and removed.
template <typename C> bool BlockList<C>::Remove(DocId t) {
  if (auto block = FindBlock(t); block != blocks_.end() && block->Remove(t)) {
    size_--;
    TryMerge(block);  // erase the block if empty, or merge if it shrank too much
    return true;
  }

  return false;
}
|
||||
|
||||
// Find the block whose range should contain `t`. Returns end() only when the
// list is empty. The returned block is the last one whose smallest element is
// <= t (or the very first block when t precedes all stored ids).
template <typename C> typename BlockList<C>::BlockIt BlockList<C>::FindBlock(DocId t) {
  DCHECK(blocks_.empty() || blocks_.back().Size() > 0u);

  // Fast path: tail access — t belongs to the last block.
  if (!blocks_.empty() && t >= *blocks_.back().begin())
    return --blocks_.end();

  // Find first block that can't contain t
  auto it = std::upper_bound(blocks_.begin(), blocks_.end(), t,
                             [](DocId t, const C& l) { return *l.begin() > t; });

  // Move to previous if possible
  if (it != blocks_.begin())
    --it;

  DCHECK(it == blocks_.begin() || it->Size() > 0);
  // Fix: the original check repeated `it == blocks_.begin()` twice; a single
  // occurrence expresses the intended invariant (non-first blocks hold at
  // least block_size_ / 2 elements).
  DCHECK(it == blocks_.begin() || it->Size() * 2 >= block_size_);
  DCHECK(it == blocks_.end() || it->Size() <= 2 * block_size_);
  return it;
}
|
||||
|
||||
// Re-balance after a removal: erase `block` if it became empty, otherwise
// merge it into its left neighbor when it shrank below half the target size.
template <typename C> void BlockList<C>::TryMerge(BlockIt block) {
  if (block->Size() == 0) {
    blocks_.erase(block);
    return;
  }

  if (block->Size() >= block_size_ / 2 || block == blocks_.begin())
    return;

  // Merge strictly right with left to benefit from tail insert optimizations
  size_t idx = std::distance(blocks_.begin(), block);
  blocks_[idx - 1].Merge(std::move(*block));
  // Note: the erase invalidates `block`; the left neighbor is re-derived from
  // `idx` below because erase may invalidate iterators as well.
  blocks_.erase(block);

  TrySplit(blocks_.begin() + (idx - 1));  // to not overgrow it
}
|
||||
|
||||
// Split `block` into two halves once it reaches twice the target block size,
// keeping block sizes within [block_size_ / 2, block_size_ * 2].
template <typename C> void BlockList<C>::TrySplit(BlockIt block) {
  if (block->Size() < block_size_ * 2)
    return;

  auto [left, right] = std::move(*block).Split();

  // Reuse the existing slot for the upper half and insert the lower half
  // before it, preserving ascending order of blocks.
  *block = std::move(right);
  blocks_.insert(block, std::move(left));
}
|
||||
|
||||
// Advance to the next stored id: step within the current block; once the
// block is exhausted, move to the first element of the next block. When the
// last block is exhausted, both block iterators are disengaged (nullopt) so
// the iterator compares equal to end().
template <typename C>
typename BlockList<C>::BlockListIterator& BlockList<C>::BlockListIterator::operator++() {
  ++*block_it;
  if (block_it == block_end) {
    ++it;
    if (it != it_end) {
      block_it = it->begin();
      block_end = it->end();
    } else {
      block_it = std::nullopt;
      block_end = std::nullopt;
    }
  }
  return *this;
}
|
||||
|
||||
template class BlockList<CompressedSortedSet>;
|
||||
template class BlockList<SortedVector>;
|
||||
|
||||
// Insert `t` keeping entries_ sorted and unique.
// Returns true if inserted, false if already present.
bool SortedVector::Insert(DocId t) {
  // Fast path: appending past the current maximum is amortized O(1); this is
  // what makes Merge() with larger values effectively linear.
  if (entries_.size() > 0 && t > entries_.back()) {
    entries_.push_back(t);
    return true;
  }

  auto it = std::lower_bound(entries_.begin(), entries_.end(), t);
  if (it != entries_.end() && *it == t)
    return false;  // duplicate

  entries_.insert(it, t);
  return true;
}
|
||||
|
||||
// Remove `t` if present. Returns true if it was found and erased.
// Binary search locates the element; erase is linear in the tail length.
bool SortedVector::Remove(DocId t) {
  auto it = std::lower_bound(entries_.begin(), entries_.end(), t);
  if (it != entries_.end() && *it == t) {
    entries_.erase(it);
    return true;
  }
  return false;
}
|
||||
|
||||
// Add all values from `other` into this vector.
// N*log(N) complexity in theory, but in practice used only to merge with
// larger values; the tail-insert optimization in Insert() makes it linear.
void SortedVector::Merge(SortedVector&& other) {
  entries_.reserve(entries_.size() + other.entries_.size());
  // Fix: iterate as DocId instead of int — DocId is unsigned, and funneling
  // ids through a signed int risks narrowing/sign issues for large values.
  for (DocId t : other.entries_)
    Insert(t);
}
|
||||
|
||||
// Split into two halves by element count: this object keeps the lower half,
// the returned second element holds the upper half.
std::pair<SortedVector, SortedVector> SortedVector::Split() && {
  // NOTE(review): `tail` is constructed without an explicit memory resource,
  // so it uses the default resource rather than entries_'s — confirm intended.
  PMR_NS::vector<DocId> tail(entries_.begin() + entries_.size() / 2, entries_.end());
  entries_.resize(entries_.size() / 2);

  return std::make_pair(std::move(*this), SortedVector{std::move(tail)});
}
|
||||
|
||||
} // namespace dfly::search
|
133
src/core/search/block_list.h
Normal file
133
src/core/search/block_list.h
Normal file
|
@ -0,0 +1,133 @@
|
|||
#pragma once
|
||||
|
||||
#include <absl/types/span.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "core/search/base.h"
|
||||
#include "core/search/compressed_sorted_set.h"
|
||||
|
||||
namespace dfly::search {
|
||||
// BlockList is a container wrapper for CompressedSortedSet / vector<DocId>
|
||||
// to divide the full sorted id range into separate blocks. This reduces modification
|
||||
// complexity from O(N) to O(logN + K), where K is the max block size.
|
||||
//
|
||||
// It tries to balance block sizes in the range [block_size / 2, block_size * 2]
|
||||
// by splitting or merging nodes when needed.
|
||||
template <typename Container /* underlying container */> class BlockList {
  using BlockIt = typename PMR_NS::vector<Container>::iterator;
  using ConstBlockIt = typename PMR_NS::vector<Container>::const_iterator;

 public:
  BlockList(PMR_NS::memory_resource* mr, size_t block_size = 1000)
      : block_size_{block_size}, blocks_(mr) {
  }

  // Insert element, returns true if inserted, false if already present.
  bool Insert(DocId t);

  // Remove element, returns true if removed, false if not found.
  bool Remove(DocId t);

  size_t Size() const {
    return size_;
  }

  // stl-style alias of Size() so BlockList works in generic code.
  size_t size() const {
    return size_;
  }

  // Forward iterator over all stored ids in ascending order.
  // Invalidated by any mutation of the list.
  struct BlockListIterator {
    // To make it work with std container constructors
    using iterator_category = std::forward_iterator_tag;
    using difference_type = std::ptrdiff_t;
    using value_type = DocId;
    using pointer = DocId*;
    using reference = DocId&;

    DocId operator*() const {
      return **block_it;
    }

    BlockListIterator& operator++();

    friend class BlockList;

    bool operator==(const BlockListIterator& other) const {
      return it == other.it && block_it == other.block_it;
    }

    bool operator!=(const BlockListIterator& other) const {
      return !operator==(other);
    }

   private:
    // When begin == end this yields an end iterator: the inner block
    // iterators stay disengaged (nullopt).
    BlockListIterator(ConstBlockIt begin, ConstBlockIt end) : it(begin), it_end(end) {
      if (it != it_end) {
        block_it = it->begin();
        block_end = it->end();
      }
    }

    ConstBlockIt it, it_end;  // current block / one-past-last block
    std::optional<typename Container::iterator> block_it, block_end;
  };

  BlockListIterator begin() const {
    return BlockListIterator{blocks_.begin(), blocks_.end()};
  }

  BlockListIterator end() const {
    return BlockListIterator{blocks_.end(), blocks_.end()};
  }

 private:
  // Find block that should contain t. Returns end() only if empty
  BlockIt FindBlock(DocId t);

  void TryMerge(BlockIt block);  // If needed, merge with previous block
  void TrySplit(BlockIt block);  // If needed, split into two blocks

 private:
  const size_t block_size_ = 1000;  // target block size; actual sizes stay in [x/2, x*2]
  size_t size_ = 0;                 // total number of stored ids across all blocks
  PMR_NS::vector<Container> blocks_;
};
|
||||
|
||||
// Supports Insert and Remove operations for keeping a sorted vector internally.
|
||||
// Wrapper to use vectors with BlockList
|
||||
struct SortedVector {
|
||||
explicit SortedVector(PMR_NS::memory_resource* mr) : entries_(mr) {
|
||||
}
|
||||
|
||||
bool Insert(DocId t);
|
||||
bool Remove(DocId t);
|
||||
void Merge(SortedVector&& other);
|
||||
std::pair<SortedVector, SortedVector> Split() &&;
|
||||
|
||||
size_t Size() {
|
||||
return entries_.size();
|
||||
}
|
||||
|
||||
using iterator = typename PMR_NS::vector<DocId>::const_iterator;
|
||||
|
||||
iterator begin() const {
|
||||
return entries_.cbegin();
|
||||
}
|
||||
|
||||
iterator end() const {
|
||||
return entries_.cend();
|
||||
}
|
||||
|
||||
private:
|
||||
SortedVector(PMR_NS::vector<DocId>&& v) : entries_{std::move(v)} {
|
||||
}
|
||||
|
||||
PMR_NS::vector<DocId> entries_;
|
||||
};
|
||||
|
||||
} // namespace dfly::search
|
123
src/core/search/block_list_test.cc
Normal file
123
src/core/search/block_list_test.cc
Normal file
|
@ -0,0 +1,123 @@
|
|||
// Copyright 2023, DragonflyDB authors. All rights reserved.
|
||||
// See LICENSE for licensing terms.
|
||||
//
|
||||
|
||||
#include "core/search/block_list.h"
|
||||
|
||||
#include <absl/container/btree_set.h>
|
||||
#include <gmock/gmock.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <set>
|
||||
|
||||
#include "base/gtest.h"
|
||||
#include "base/logging.h"
|
||||
|
||||
namespace dfly::search {
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Typed fixture: each test below runs once per container type in
// ContainerTypes, wrapped in a BlockList.
template <typename C> class BlockListTest : public testing::Test {
 public:
  auto Make() {
    // Create list with small block size to test blocking mechanism more extensively
    return BlockList<C>{PMR_NS::get_default_resource(), 10};
  }
};
|
||||
|
||||
using ContainerTypes = ::testing::Types<CompressedSortedSet, SortedVector>;
|
||||
TYPED_TEST_SUITE(BlockListTest, ContainerTypes);
|
||||
|
||||
// Insert ids [0, kNumElements) interleaved from both halves of the range,
// verify sorted iteration yields all of them, then remove everything and
// verify the list is empty.
TYPED_TEST(BlockListTest, LoopMidInsertErase) {
  const size_t kNumElements = 50;
  auto list = this->Make();

  for (size_t i = 0; i < kNumElements / 2; i++) {
    list.Insert(i);
    list.Insert(i + kNumElements / 2);
  }

  // Materialize via the iterator pair — exercises BlockListIterator.
  vector<int> out(list.begin(), list.end());
  ASSERT_EQ(list.size(), kNumElements);
  ASSERT_EQ(out.size(), kNumElements);
  for (size_t i = 0; i < kNumElements; i++)
    ASSERT_EQ(out[i], i);

  for (size_t i = 0; i < kNumElements / 2; i++) {
    list.Remove(i);
    list.Remove(i + kNumElements / 2);
  }

  out = {list.begin(), list.end()};
  EXPECT_EQ(out.size(), 0u);
}
|
||||
|
||||
// Insert ids in reverse order, then in ten passes remove the ids whose last
// decimal digit equals the pass number, verifying after each pass that
// exactly the not-yet-removed ids remain, in ascending order.
TYPED_TEST(BlockListTest, InsertReverseRemoveSteps) {
  const size_t kNumElements = 1000;
  auto list = this->Make();

  for (size_t i = 0; i < kNumElements; i++) {
    list.Insert(kNumElements - i - 1);
  }

  for (size_t deleted_pref = 0; deleted_pref < 10; deleted_pref++) {
    // Snapshot current contents, reversed so pop_back walks ascending ids.
    vector<DocId> out{list.begin(), list.end()};
    reverse(out.begin(), out.end());

    EXPECT_EQ(out.size(), kNumElements / 10 * (10 - deleted_pref));
    for (size_t i = 0; i < kNumElements; i++) {
      if (i % 10 >= deleted_pref) {
        EXPECT_EQ(out.back(), DocId(i));
        out.pop_back();
      }
    }

    // Remove the next digit class for the following iteration.
    for (size_t i = 0; i < kNumElements; i++) {
      if (i % 10 == deleted_pref)
        list.Remove(i);
    }
  }

  EXPECT_EQ(list.size(), 0u);
}
|
||||
|
||||
// Fuzz against std::set as a reference model: after every random insert or
// remove, both containers must hold exactly the same elements in order.
TYPED_TEST(BlockListTest, RandomNumbers) {
  const size_t kNumIterations = 1'000;
  auto list = this->Make();
  std::set<DocId> list_copy;

  for (size_t i = 0; i < kNumIterations; i++) {
    if (list_copy.size() > 100 && rand() % 5 == 0) {
      // Remove a randomly chosen existing element from both containers.
      auto it = list_copy.begin();
      std::advance(it, rand() % list_copy.size());
      list.Remove(*it);
      list_copy.erase(it);
    } else {
      // Insert a random id (duplicates possible — both sides ignore them).
      DocId t = rand() % 1'000'000;
      list.Insert(t);
      list_copy.insert(t);
    }

    ASSERT_TRUE(std::equal(list.begin(), list.end(), list_copy.begin(), list_copy.end()));
  }
}
|
||||
|
||||
// Benchmark: fill the list with `size` consecutive ids, then repeatedly
// remove ids cycling through the upper 90% of the range (the lowest 10%
// stays as a fixed prefix).
static void BM_Erase90PctTail(benchmark::State& state) {
  BlockList<CompressedSortedSet> bl{PMR_NS::get_default_resource()};

  unsigned size = state.range(0);
  for (size_t i = 0; i < size; i++)
    bl.Insert(i);

  size_t base = size / 10;
  size_t i = 0;
  while (state.KeepRunning()) {
    benchmark::DoNotOptimize(bl.Remove(base + i));
    i = (i + 1) % (size * 9 / 10);
  }
}
|
||||
|
||||
BENCHMARK(BM_Erase90PctTail)->Args({100'000});
|
||||
|
||||
} // namespace dfly::search
|
|
@ -97,27 +97,27 @@ CompressedSortedSet::EntryLocation CompressedSortedSet::LowerBound(IntType value
|
|||
// needs to be inserted. Then it computes the differences dif1 = V - A and diff2 = B - V that need
|
||||
// to be stored to encode the triple A V B. Those are stored where diff0 = B - A was previously
|
||||
// stored, possibly extending the vector
|
||||
void CompressedSortedSet::Insert(IntType value) {
|
||||
bool CompressedSortedSet::Insert(IntType value) {
|
||||
if (tail_value_ && *tail_value_ == value)
|
||||
return;
|
||||
return false;
|
||||
|
||||
if (tail_value_ && value > *tail_value_) {
|
||||
PushBackDiff(value - *tail_value_);
|
||||
tail_value_ = value;
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
auto bound = LowerBound(value);
|
||||
|
||||
// At least one element was read and it's equal to value: return to avoid duplicate
|
||||
if (bound.value == value && !bound.diff_span.empty())
|
||||
return;
|
||||
return false;
|
||||
|
||||
// Value is bigger than any other (or list is empty): append required diff at the end
|
||||
if (value > bound.value || bound.diff_span.empty()) {
|
||||
PushBackDiff(value - bound.value);
|
||||
tail_value_ = value;
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
size_++;
|
||||
|
@ -141,17 +141,19 @@ void CompressedSortedSet::Insert(IntType value) {
|
|||
// Now overwrite diff0 and 0s with the two new differences
|
||||
copy(diff1_span.begin(), diff1_span.end(), diffs_.begin() + diff_offset);
|
||||
copy(diff2_span.begin(), diff2_span.end(), diffs_.begin() + diff_offset + diff1_span.size());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Remove has linear complexity. It tries to find the element V and its neighbors A and B,
|
||||
// which are encoded as diff1 = V - A and diff2 = B - V. Adjacently stored diff1 and diff2
|
||||
// need to be replaced with diff3 = diff1 + diff2s
|
||||
void CompressedSortedSet::Remove(IntType value) {
|
||||
bool CompressedSortedSet::Remove(IntType value) {
|
||||
auto bound = LowerBound(value);
|
||||
|
||||
// Nothing was read or the element was not found
|
||||
if (bound.diff_span.empty() || bound.value != value)
|
||||
return;
|
||||
return false;
|
||||
|
||||
// We're removing below unconditionally
|
||||
size_--;
|
||||
|
@ -166,7 +168,7 @@ void CompressedSortedSet::Remove(IntType value) {
|
|||
tail_value_ = bound.prev_value;
|
||||
if (diffs_.empty())
|
||||
tail_value_ = nullopt;
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Now the list certainly contains a succeeding element B > V and possibly A < V (or 0)
|
||||
|
@ -185,6 +187,8 @@ void CompressedSortedSet::Remove(IntType value) {
|
|||
|
||||
// Overwrite diff1/diff2 with new diff3
|
||||
copy(diff3_buf.begin(), diff3_buf.end(), diffs_.begin() + diff_offset);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t CompressedSortedSet::Size() const {
|
||||
|
@ -195,6 +199,35 @@ size_t CompressedSortedSet::ByteSize() const {
|
|||
return diffs_.size();
|
||||
}
|
||||
|
||||
// Add all values from `other` into this set.
// Quadratic complexity in theory, but in practice used only to merge with
// larger values; the tail-insert optimization in Insert() makes it linear.
void CompressedSortedSet::Merge(CompressedSortedSet&& other) {
  // Fix: iterate as IntType instead of int — stored values are unsigned, and
  // funneling them through a signed int risks narrowing for large values.
  for (IntType v : other)
    Insert(v);
}
|
||||
|
||||
// Split into two roughly equal halves by element count. This object keeps the
// lower half with its existing encoding; the upper half is re-encoded into a
// fresh set and returned as the second element.
std::pair<CompressedSortedSet, CompressedSortedSet> CompressedSortedSet::Split() && {
  DCHECK_GT(Size(), 5u);

  CompressedSortedSet second(diffs_.get_allocator().resource());

  // Move iterator to middle position and save size of diffs tail
  auto it = begin();
  std::advance(it, size_ / 2);
  // Byte offset of the midpoint inside the diff stream (relies on the
  // iterator exposing the span it last decoded).
  size_t keep_bytes = it.last_read_.data() - diffs_.data();

  // Copy second half into second set
  for (; it != end(); ++it)
    second.Insert(*it);

  // Erase diffs tail; tail_value_ is reset since the cached tail no longer
  // belongs to this half.
  diffs_.resize(keep_bytes);
  tail_value_ = std::nullopt;
  size_ -= second.Size();

  return std::make_pair(std::move(*this), std::move(second));
}
|
||||
|
||||
// The leftmost three bits of the first byte store the number of additional bytes. All following
|
||||
// bits store the number itself.
|
||||
absl::Span<uint8_t> CompressedSortedSet::WriteVarLen(IntType value, absl::Span<uint8_t> buf) {
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include "base/logging.h"
|
||||
#include "base/pmr/memory_resource.h"
|
||||
#include "core/search/base.h"
|
||||
|
||||
|
@ -48,7 +49,7 @@ class CompressedSortedSet {
|
|||
absl::Span<const uint8_t> diffs_{};
|
||||
};
|
||||
|
||||
friend struct Iterator;
|
||||
using iterator = ConstIterator;
|
||||
|
||||
public:
|
||||
explicit CompressedSortedSet(PMR_NS::memory_resource* mr);
|
||||
|
@ -56,16 +57,17 @@ class CompressedSortedSet {
|
|||
ConstIterator begin() const;
|
||||
ConstIterator end() const;
|
||||
|
||||
void Insert(IntType value); // Insert arbitrary element, needs to scan whole list
|
||||
void Remove(IntType value); // Remove arbitrary element, needs to scan whole list
|
||||
bool Insert(IntType value); // Insert arbitrary element, needs to scan whole list
|
||||
bool Remove(IntType value); // Remove arbitrary element, needs to scan whole list
|
||||
|
||||
size_t Size() const;
|
||||
size_t ByteSize() const;
|
||||
|
||||
// To use transparently in templates together with stl containers
|
||||
size_t size() const {
|
||||
return Size();
|
||||
}
|
||||
// Add all values from other
|
||||
void Merge(CompressedSortedSet&& other);
|
||||
|
||||
// Split into two equally sized halves
|
||||
std::pair<CompressedSortedSet, CompressedSortedSet> Split() &&;
|
||||
|
||||
private:
|
||||
struct EntryLocation {
|
||||
|
@ -90,6 +92,7 @@ class CompressedSortedSet {
|
|||
|
||||
private:
|
||||
uint32_t size_{0};
|
||||
IntType head_value_{0};
|
||||
std::optional<IntType> tail_value_{};
|
||||
std::vector<uint8_t, PMR_NS::polymorphic_allocator<uint8_t>> diffs_;
|
||||
};
|
||||
|
|
|
@ -4,10 +4,13 @@
|
|||
|
||||
#include "core/search/compressed_sorted_set.h"
|
||||
|
||||
#include <absl/container/btree_set.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "base/gtest.h"
|
||||
#include "base/logging.h"
|
||||
#include "core/bptree_set.h"
|
||||
|
||||
namespace dfly::search {
|
||||
|
||||
|
|
|
@ -88,10 +88,12 @@ vector<DocId> NumericIndex::Range(double l, double r) const {
|
|||
return out;
|
||||
}
|
||||
|
||||
BaseStringIndex::BaseStringIndex(PMR_NS::memory_resource* mr) : entries_{mr} {
|
||||
template <typename C>
|
||||
BaseStringIndex<C>::BaseStringIndex(PMR_NS::memory_resource* mr) : entries_{mr} {
|
||||
}
|
||||
|
||||
const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
|
||||
template <typename C>
|
||||
const typename BaseStringIndex<C>::Container* BaseStringIndex<C>::Matching(string_view str) const {
|
||||
str = absl::StripAsciiWhitespace(str);
|
||||
|
||||
string word;
|
||||
|
@ -104,12 +106,14 @@ const CompressedSortedSet* BaseStringIndex::Matching(string_view str) const {
|
|||
return (it != entries_.end()) ? &it->second : nullptr;
|
||||
}
|
||||
|
||||
CompressedSortedSet* BaseStringIndex::GetOrCreate(string_view word) {
|
||||
template <typename C>
|
||||
typename BaseStringIndex<C>::Container* BaseStringIndex<C>::GetOrCreate(string_view word) {
|
||||
auto* mr = entries_.get_allocator().resource();
|
||||
return &entries_.try_emplace(PMR_NS::string{word, mr}, mr).first->second;
|
||||
return &entries_.try_emplace(PMR_NS::string{word, mr}, mr, 1000 /* block size */).first->second;
|
||||
}
|
||||
|
||||
void BaseStringIndex::Add(DocId id, DocumentAccessor* doc, string_view field) {
|
||||
template <typename C>
|
||||
void BaseStringIndex<C>::Add(DocId id, DocumentAccessor* doc, string_view field) {
|
||||
absl::flat_hash_set<std::string> tokens;
|
||||
for (string_view str : doc->GetStrings(field))
|
||||
tokens.merge(Tokenize(str));
|
||||
|
@ -118,7 +122,8 @@ void BaseStringIndex::Add(DocId id, DocumentAccessor* doc, string_view field) {
|
|||
GetOrCreate(token)->Insert(id);
|
||||
}
|
||||
|
||||
void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field) {
|
||||
template <typename C>
|
||||
void BaseStringIndex<C>::Remove(DocId id, DocumentAccessor* doc, string_view field) {
|
||||
absl::flat_hash_set<std::string> tokens;
|
||||
for (string_view str : doc->GetStrings(field))
|
||||
tokens.merge(Tokenize(str));
|
||||
|
@ -134,6 +139,9 @@ void BaseStringIndex::Remove(DocId id, DocumentAccessor* doc, string_view field)
|
|||
}
|
||||
}
|
||||
|
||||
template struct BaseStringIndex<CompressedSortedSet>;
|
||||
template struct BaseStringIndex<SortedVector>;
|
||||
|
||||
absl::flat_hash_set<std::string> TextIndex::Tokenize(std::string_view value) const {
|
||||
return TokenizeWords(value);
|
||||
}
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
#include "base/pmr/memory_resource.h"
|
||||
#include "core/search/base.h"
|
||||
#include "core/search/block_list.h"
|
||||
#include "core/search/compressed_sorted_set.h"
|
||||
|
||||
// TODO: move core field definitions out of big header
|
||||
|
@ -36,7 +37,9 @@ struct NumericIndex : public BaseIndex {
|
|||
};
|
||||
|
||||
// Base index for string based indices.
|
||||
struct BaseStringIndex : public BaseIndex {
|
||||
template <typename C> struct BaseStringIndex : public BaseIndex {
|
||||
using Container = BlockList<C>;
|
||||
|
||||
BaseStringIndex(PMR_NS::memory_resource* mr);
|
||||
|
||||
void Add(DocId id, DocumentAccessor* doc, std::string_view field) override;
|
||||
|
@ -46,10 +49,10 @@ struct BaseStringIndex : public BaseIndex {
|
|||
virtual absl::flat_hash_set<std::string> Tokenize(std::string_view value) const = 0;
|
||||
|
||||
// Pointer is valid as long as index is not mutated. Nullptr if not found
|
||||
const CompressedSortedSet* Matching(std::string_view str) const;
|
||||
const Container* Matching(std::string_view str) const;
|
||||
|
||||
protected:
|
||||
CompressedSortedSet* GetOrCreate(std::string_view word);
|
||||
Container* GetOrCreate(std::string_view word);
|
||||
|
||||
struct PmrEqual {
|
||||
using is_transparent = void;
|
||||
|
@ -71,14 +74,14 @@ struct BaseStringIndex : public BaseIndex {
|
|||
}
|
||||
};
|
||||
|
||||
absl::flat_hash_map<PMR_NS::string, CompressedSortedSet, PmrHash, PmrEqual,
|
||||
PMR_NS::polymorphic_allocator<std::pair<PMR_NS::string, CompressedSortedSet>>>
|
||||
absl::flat_hash_map<PMR_NS::string, Container, PmrHash, PmrEqual,
|
||||
PMR_NS::polymorphic_allocator<std::pair<PMR_NS::string, Container>>>
|
||||
entries_;
|
||||
};
|
||||
|
||||
// Index for text fields.
|
||||
// Hashmap based lookup per word.
|
||||
struct TextIndex : public BaseStringIndex {
|
||||
struct TextIndex : public BaseStringIndex<CompressedSortedSet> {
|
||||
TextIndex(PMR_NS::memory_resource* mr) : BaseStringIndex(mr) {
|
||||
}
|
||||
|
||||
|
@ -87,7 +90,7 @@ struct TextIndex : public BaseStringIndex {
|
|||
|
||||
// Index for text fields.
|
||||
// Hashmap based lookup per word.
|
||||
struct TagIndex : public BaseStringIndex {
|
||||
struct TagIndex : public BaseStringIndex<SortedVector> {
|
||||
TagIndex(PMR_NS::memory_resource* mr) : BaseStringIndex(mr) {
|
||||
}
|
||||
|
||||
|
|
|
@ -45,20 +45,18 @@ AstExpr ParseQuery(std::string_view query, const QueryParams* params) {
|
|||
// Represents an either owned or non-owned result set that can be accessed transparently.
|
||||
struct IndexResult {
|
||||
using DocVec = vector<DocId>;
|
||||
using BorrowedView = variant<const DocVec*, const CompressedSortedSet*>;
|
||||
using BorrowedView =
|
||||
variant<const DocVec*, const BlockList<CompressedSortedSet>*, const BlockList<SortedVector>*>;
|
||||
|
||||
IndexResult() : value_{DocVec{}} {
|
||||
}
|
||||
|
||||
IndexResult(const CompressedSortedSet* css) : value_{css} {
|
||||
if (css == nullptr)
|
||||
value_ = DocVec{};
|
||||
}
|
||||
|
||||
IndexResult(DocVec&& dv) : value_{std::move(dv)} {
|
||||
}
|
||||
|
||||
IndexResult(const DocVec* dv) : value_{dv} {
|
||||
template <typename C> IndexResult(const C* container = nullptr) : value_{container} {
|
||||
if (container == nullptr)
|
||||
value_ = DocVec{};
|
||||
}
|
||||
|
||||
size_t Size() const {
|
||||
|
@ -108,7 +106,9 @@ struct IndexResult {
|
|||
}
|
||||
|
||||
private:
|
||||
variant<DocVec /*owned*/, const CompressedSortedSet*, const DocVec*> value_;
|
||||
variant<DocVec /*owned*/, const DocVec*, const BlockList<CompressedSortedSet>*,
|
||||
const BlockList<SortedVector>*>
|
||||
value_;
|
||||
};
|
||||
|
||||
struct ProfileBuilder {
|
||||
|
|
Loading…
Reference in a new issue