1
0
Fork 0
mirror of https://github.com/dragonflydb/dragonfly.git synced 2024-12-14 11:58:02 +00:00

feat: introduce simd algorithm for bitpacking (#568)

My benchmark shows a x3.5 improvement when compressing a 1KB string.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2022-12-17 19:22:40 +02:00 committed by GitHub
parent adc89c7592
commit bcafd7e25d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 331 additions and 142 deletions

View file

@ -77,6 +77,6 @@ jobs:
ccache --show-stats
echo Run ctest -V -L DFLY
#GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1
GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=2,snapshot=2 ctest -V -L DFLY
GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1 ctest -V -L DFLY
./dragonfly_test --mem_defrag_threshold=0.05 # trying to catch issue with defrag
# GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1 CTEST_OUTPUT_ON_FAILURE=1 ninja server/test

View file

@ -1,6 +1,8 @@
add_library(dfly_core compact_object.cc dragonfly_core.cc extent_tree.cc
external_alloc.cc interpreter.cc json_object.cc mi_memory_resource.cc
segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc)
segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc
detail/bitpacking.cc)
cxx_link(dfly_core base absl::flat_hash_map absl::str_format redis_lib TRDP::lua lua_modules
Boost::fiber TRDP::jsoncons crypto)

View file

@ -23,19 +23,15 @@ extern "C" {
#include "base/flags.h"
#include "base/logging.h"
#include "base/pod_array.h"
#include "core/detail/bitpacking.h"
#include "core/string_set.h"
#if defined(__aarch64__)
#include "base/sse2neon.h"
#else
#include <emmintrin.h>
#endif
ABSL_FLAG(bool, use_set2, true, "If true use DenseSet for an optimized set data structure");
namespace dfly {
using namespace std;
using absl::GetFlag;
using detail::binpacked_len;
namespace {
@ -154,35 +150,6 @@ inline void FreeObjStream(void* ptr) {
freeStream((stream*)ptr);
}
// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
// See https://github.com/lemire/fastvalidate-utf-8/
// Returns true when every byte of src[0..len) is a 7-bit value (0x00..0x7F),
// and false as soon as any byte has its top bit set.
bool validate_ascii_fast(const char* src, size_t len) {
  // Number of bytes covered by whole 16-byte blocks.
  const size_t vec_bytes = len - (len % 16);

  // OR all full blocks together: a top bit set in any lane marks a non-ascii byte.
  __m128i accum = _mm_setzero_si128();
  size_t pos = 0;
  for (; pos < vec_bytes; pos += 16) {
    const __m128i block = _mm_loadu_si128((const __m128i*)(src + pos));
    accum = _mm_or_si128(accum, block);
  }

  // The remaining (< 16) bytes are folded with a scalar OR.
  char tail_bits = 0;
  for (; pos < len; ++pos) {
    tail_bits |= src[pos];
  }

  // movemask collects the top bit of each of the 16 lanes.
  const int bad = _mm_movemask_epi8(accum) | (tail_bits & 0x80);
  return bad == 0;
}
// Number of bytes needed to store ascii_len 7-bit characters back to back:
// every 8 input bytes shrink to 7, and a partial last byte is rounded up.
inline constexpr size_t binpacked_len(size_t ascii_len) {
  constexpr size_t kBitsPerChar = 7;
  constexpr size_t kBitsPerByte = 8;
  return (ascii_len * kBitsPerChar + (kBitsPerByte - 1)) / kBitsPerByte;
}
// converts 7-bit packed length back to ascii length. Note that this conversion
// is not accurate since it maps 7 bytes to 8 bytes (rounds up), while we may have
// 7 byte strings converted to 7 byte as well.
@ -428,91 +395,6 @@ void RobjWrapper::MakeInnerRoom(size_t current_cap, size_t desired, pmr::memory_
inner_obj_ = newp;
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC push_options
#pragma GCC optimize("Ofast")
#endif
// Packs ascii into bin: each complete group of 8 input bytes becomes 7 output bytes
// (the 7-bit values are laid out back to back, lowest bits first).
// A trailing group shorter than 8 bytes is copied as is.
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
  const char* const last = ascii + len;

  for (; last - ascii >= 8; ascii += 8) {
    for (unsigned shift = 0; shift < 7; ++shift) {
      // Upper bits of the current char joined with the lower bits of the next one.
      *bin++ = (ascii[shift] >> shift) | (ascii[shift + 1] << (7 - shift));
    }
  }

  // epilog - fewer than 8 bytes remain, they are stored unpacked.
  for (; ascii != last; ++ascii) {
    *bin++ = *ascii;
  }
}
// Expands an 8->7 packed blob back into ascii_len ascii characters.
// The destination is 8/7 times larger than the source, so unpacking in place is
// generally impossible. It does work when the packed bytes sit at the right end of
// the ascii buffer with free space on the left, which is why every source byte is
// read into a local before the destination is written.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
  constexpr uint8_t kM = 0x7F;
  uint8_t carry = 0;  // previously consumed packed byte

  for (size_t groups = ascii_len / 8; groups > 0; --groups) {
    for (unsigned bit = 0; bit < 7; ++bit) {
      const uint8_t cur = *bin++;  // read first: the write below may overlap it.
      *ascii++ = (carry >> (8 - bit)) | ((cur << bit) & kM);
      carry = cur;
    }
    // The 8th character lives entirely in the top 7 bits of the last packed byte.
    *ascii++ = carry >> 1;
  }

  ascii_len %= 8;
  DCHECK_LT(ascii_len, 8u);

  // The tail was stored unpacked, copy it byte by byte.
  for (size_t k = 0; k < ascii_len; ++k) {
    *ascii++ = *bin++;
  }
}
// Checks whether `packed` is the 7-bit packed form of ascii[0..ascii_len).
// packed must be of length = binpacked_len(ascii_len).
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
  const char* const last = ascii + ascii_len;

  for (; last - ascii >= 8; ascii += 8, packed += 7) {
    // Accumulate differences over the whole group and branch once per group.
    unsigned diff = 0;
    for (unsigned shift = 0; shift < 7; ++shift) {
      const uint8_t conv = (ascii[shift] >> shift) | (ascii[shift + 1] << (7 - shift));
      diff |= static_cast<unsigned>(conv ^ packed[shift]);
    }
    if (diff != 0)
      return false;
  }

  // The tail (< 8 bytes) is stored unpacked and compared directly.
  for (; ascii < last; ++ascii, ++packed) {
    if (*ascii != *packed)
      return false;
  }
  return true;
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif
} // namespace detail
using namespace std;
@ -777,7 +659,7 @@ void CompactObj::SetString(std::string_view str) {
DCHECK_GT(str.size(), kInlineLen);
string_view encoded = str;
bool is_ascii = kUseAsciiEncoding && validate_ascii_fast(str.data(), str.size());
bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size());
if (is_ascii) {
size_t encode_len = binpacked_len(str.size());
@ -792,7 +674,7 @@ void CompactObj::SetString(std::string_view str) {
}
tl.tmp_buf.resize(encode_len);
detail::ascii_pack(str.data(), str.size(), tl.tmp_buf.data());
detail::ascii_pack_simd(str.data(), str.size(), tl.tmp_buf.data());
encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};
if (encoded.size() <= kInlineLen) {
@ -1125,7 +1007,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
if (u_.r_obj.Size() != encode_len)
return false;
if (!validate_ascii_fast(sv.data(), sv.size()))
if (!detail::validate_ascii_fast(sv.data(), sv.size()))
return false;
return detail::compare_packed(to_byte(u_.r_obj.inner_obj()), sv.data(), sv.size());
@ -1139,7 +1021,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
if (u_.small_str.size() != encode_len)
return false;
if (!validate_ascii_fast(sv.data(), sv.size()))
if (!detail::validate_ascii_fast(sv.data(), sv.size()))
return false;
// We need to compare an unpacked sv with 2 packed parts.

View file

@ -76,16 +76,6 @@ class RobjWrapper {
} __attribute__((packed));
// unpacks 8->7 encoded blob back to ascii.
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
// the source buffer.
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
// left then we can unpack inplace.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
} // namespace detail
class CompactObj {

View file

@ -12,6 +12,7 @@
#include "base/gtest.h"
#include "base/logging.h"
#include "core/detail/bitpacking.h"
#include "core/flat_set.h"
#include "core/json_object.h"
#include "core/mi_memory_resource.h"
@ -189,13 +190,24 @@ TEST_F(CompactObjectTest, AsciiUtil) {
std::string_view data{"aaaaaabb"};
uint8_t buf[32];
char ascii2[] = "xxxxxxxxxxxxxx";
detail::ascii_pack(data.data(), 7, buf);
detail::ascii_unpack(buf, 7, ascii2);
char outbuf[32] = "xxxxxxxxxxxxxx";
detail::ascii_pack_simd(data.data(), 7, buf);
detail::ascii_unpack(buf, 7, outbuf);
ASSERT_EQ('x', ascii2[7]) << ascii2;
std::string_view actual{ascii2, 7};
ASSERT_EQ('x', outbuf[7]) << outbuf;
std::string_view actual{outbuf, 7};
ASSERT_EQ(data.substr(0, 7), actual);
string data3;
for (unsigned i = 0; i < 97; ++i) {
data3.append("12345678910");
}
string act_str(data3.size(), 'y');
std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data());
detail::ascii_unpack(binvec.data(), data3.size(), act_str.data());
ASSERT_EQ(data3, act_str);
}
TEST_F(CompactObjectTest, IntSet) {
@ -453,4 +465,62 @@ TEST_F(CompactObjectTest, JsonTypeWithPathTest) {
}
}
// Reference byte-by-byte packer kept for benchmarking against the optimized versions.
// Every full group of 8 ascii bytes is squeezed into 7 bytes; a shorter trailing
// group is copied without packing.
static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
  size_t left = len;

  while (left >= 8) {
    for (unsigned k = 0; k < 7; ++k) {
      const int low_part = ascii[k] >> k;
      const int high_part = ascii[k + 1] << (7 - k);
      *bin++ = low_part | high_part;
    }
    ascii += 8;
    left -= 8;
  }

  // epilog - we do not pack since we have less than 8 bytes.
  while (left-- > 0) {
    *bin++ = *ascii++;
  }
}
// Benchmarks the naive byte-by-byte packer on a 1KB ascii string.
static void BM_PackNaive(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];

  // ascii_pack_naive is a static function in this translation unit and nothing reads buf,
  // so the optimizer is allowed to discard the packing work altogether. Letting the
  // output pointer escape and clobbering memory after each call keeps the stores alive.
  uint8_t* out = buf;
  benchmark::DoNotOptimize(out);

  while (state.KeepRunning()) {
    ascii_pack_naive(val.data(), val.size(), out);
    benchmark::ClobberMemory();
  }
}
BENCHMARK(BM_PackNaive);
// Benchmarks detail::ascii_pack on a 1KB string made of 'a' characters.
static void BM_Pack(benchmark::State& state) {
  constexpr size_t kInputLen = 1024;
  const string input(kInputLen, 'a');
  uint8_t packed[kInputLen];

  while (state.KeepRunning()) {
    detail::ascii_pack(input.data(), input.size(), packed);
  }
}
BENCHMARK(BM_Pack);
// Benchmarks detail::ascii_pack2 (the mask-and-shift variant) on a 1KB ascii string.
static void BM_Pack2(benchmark::State& state) {
  string val(1024, 'a');
  uint8_t buf[1024];
  while (state.KeepRunning()) {
    // This previously called detail::ascii_pack, which made the benchmark a duplicate
    // of BM_Pack and left ascii_pack2 unmeasured.
    detail::ascii_pack2(val.data(), val.size(), buf);
  }
}
BENCHMARK(BM_Pack2);
// Benchmarks detail::ascii_pack_simd on a 1KB string made of 'a' characters.
static void BM_PackSimd(benchmark::State& state) {
  constexpr size_t kInputLen = 1024;
  const string input(kInputLen, 'a');
  uint8_t packed[kInputLen];

  while (state.KeepRunning()) {
    detail::ascii_pack_simd(input.data(), input.size(), packed);
  }
}
BENCHMARK(BM_PackSimd);
} // namespace dfly

View file

@ -0,0 +1,209 @@
// Copyright 2022, Roman Gershman. All rights reserved.
// See LICENSE for licensing terms.
//
#include "src/core/detail/bitpacking.h"
#include "base/logging.h"
#if defined(__aarch64__)
#include "base/sse2neon.h"
#else
#include <emmintrin.h>
#include <tmmintrin.h>
#endif
#include <absl/base/internal/endian.h>
namespace dfly {
namespace detail {
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC push_options
#pragma GCC optimize("Ofast")
#endif
// Squeezes the low 7 bits of each of the 8 bytes of x into the low 56 bits of the result.
// Works in three rounds, each one closing the gaps between neighbouring fields:
// 8 x 7 bits -> 4 x 14 bits -> 2 x 28 bits -> 1 x 56 bits.
static inline uint64_t Compress8x7bit(uint64_t x) {
  constexpr uint64_t kOddBytes = 0x7F007F007F007F00;
  constexpr uint64_t kEvenBytes = 0x007F007F007F007F;
  const uint64_t pairs = (x & kEvenBytes) | ((x & kOddBytes) >> 1);

  constexpr uint64_t kOddPairs = 0x3FFF00003FFF0000;
  constexpr uint64_t kEvenPairs = 0x00003FFF00003FFF;
  const uint64_t quads = (pairs & kEvenPairs) | ((pairs & kOddPairs) >> 2);

  constexpr uint64_t kHighQuad = 0x0FFFFFFF00000000;
  constexpr uint64_t kLowQuad = 0x000000000FFFFFFF;
  return (quads & kLowQuad) | ((quads & kHighQuad) >> 4);
}
// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
// See https://github.com/lemire/fastvalidate-utf-8/
// Tells whether all chars in src[0..len) are 7-bit values (0x00..0x7F):
// returns true if so, false if at least one byte has its high bit set.
bool validate_ascii_fast(const char* src, size_t len) {
  __m128i seen_bits = _mm_setzero_si128();
  size_t offset = 0;

  // Vector part: OR 16 bytes at a time into seen_bits.
  while (len - offset >= 16) {
    const __m128i chunk = _mm_loadu_si128((const __m128i*)(src + offset));
    seen_bits = _mm_or_si128(seen_bits, chunk);
    offset += 16;
  }

  // Scalar part: fewer than 16 bytes are left.
  char tail_seen = 0;
  while (offset < len) {
    tail_seen |= src[offset++];
  }

  // Any high bit found by either part means the input is not pure ascii.
  int error_mask = _mm_movemask_epi8(seen_bits);
  error_mask |= (tail_seen & 0x80);
  return error_mask == 0;
}
// Packs `len` ascii bytes (not verified) from `ascii` into `bin`.
// Every complete group of 8 input bytes is converted to 7 output bytes holding the
// 7-bit values back to back, lowest bits first. A trailing group of fewer than 8 bytes
// is copied unpacked, so any `len` (including 0) is accepted.
// NOTE(review): the packed word is written with memcpy in host byte order, which
// presumably assumes a little-endian host - confirm before porting.
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
  const char* end = ascii + len;

  // Compare distances instead of forming `ascii + 8`, which may point past the buffer.
  while (end - ascii >= 8) {
    uint64_t val = absl::little_endian::Load64(ascii);
    uint64_t dest = (val & 0xFF);
    for (unsigned i = 1; i <= 7; ++i) {
      // After i single-bit shifts, byte i of the input starts at bit 7 * i.
      val >>= 1;
      // The mask must be 64-bit wide: the shift goes up to 49 bits, which is undefined
      // for `unsigned long` on platforms where that type has 32 bits.
      dest |= (val & (uint64_t{0x7F} << (7 * i)));
    }
    memcpy(bin, &dest, 7);
    bin += 7;
    ascii += 8;
  }

  // epilog - we do not pack since we have less than 8 bytes.
  while (ascii < end) {
    *bin++ = *ascii++;
  }
}
// Variant of ascii_pack that compresses every full 8-byte group with Compress8x7bit.
// A trailing group shorter than 8 bytes is copied unpacked.
void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
  size_t tail = len % 8;

  for (size_t groups = len / 8; groups > 0; --groups) {
    const uint64_t packed = Compress8x7bit(absl::little_endian::Load64(ascii));
    memcpy(bin, &packed, 7);  // only the low 56 bits carry data.
    ascii += 8;
    bin += 7;
  }

  // epilog - we do not pack since we have less than 8 bytes.
  while (tail-- > 0) {
    *bin++ = *ascii++;
  }
}
// The algo - do in parallel what ascii_pack2 does on two uint64_t integers:
// each SSE iteration loads 16 ascii bytes and emits 14 packed bytes.
// Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
//
// Accepts any `len`: inputs shorter than 32 bytes (and the tail of longer ones) are
// delegated to ascii_pack. `bin` must have room for binpacked_len(len) bytes.
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
  // Skips 8th byte (index 7) in the lower 8-byte part, so that the two 7-byte results
  // become 14 contiguous bytes. The two top output bytes (-1 selectors) are zeroed.
  const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);

  // Same masks as in Compress8x7bit, replicated into both 64-bit lanes:
  //   x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
  //   x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
  //   x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);
  const __m128i mask1_lo = _mm_set1_epi64x(0x007F007F007F007F);
  const __m128i mask1_hi = _mm_set1_epi64x(0x7F007F007F007F00);
  const __m128i mask2_lo = _mm_set1_epi64x(0x00003FFF00003FFF);
  const __m128i mask2_hi = _mm_set1_epi64x(0x3FFF00003FFF0000);
  const __m128i mask3_lo = _mm_set1_epi64x(0x000000000FFFFFFF);
  const __m128i mask3_hi = _mm_set1_epi64x(0x0FFFFFFF00000000);

  // We store full 16 bytes into bin instead of 14. To prevent writing past the output
  // we only run the vector loop while at least 32 input bytes remain, i.e. we finish one
  // iteration earlier than the loads alone would require.
  // A byte counter is used on purpose: computing `ascii + len - 32` is out-of-bounds
  // pointer arithmetic whenever len < 32.
  size_t remaining = len;
  while (remaining >= 32) {
    __m128i val = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ascii));

    __m128i rpart = _mm_and_si128(val, mask1_lo);
    __m128i lpart = _mm_and_si128(val, mask1_hi);
    val = _mm_or_si128(_mm_srli_epi64(lpart, 1), rpart);

    rpart = _mm_and_si128(val, mask2_lo);
    lpart = _mm_and_si128(val, mask2_hi);
    val = _mm_or_si128(_mm_srli_epi64(lpart, 2), rpart);

    rpart = _mm_and_si128(val, mask3_lo);
    lpart = _mm_and_si128(val, mask3_hi);
    val = _mm_or_si128(_mm_srli_epi64(lpart, 4), rpart);

    val = _mm_shuffle_epi8(val, control);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bin), val);
    bin += 14;
    ascii += 16;
    remaining -= 16;
  }

  // Fewer than 32 bytes are left (possibly none); the scalar packer finishes the job
  // and overwrites the 2 spill bytes of the last vector store.
  ascii_pack(ascii, remaining, bin);
}
// Restores ascii_len ascii characters from their 8->7 packed representation.
// In general the operation can not run in place, because the ascii (dest) buffer is
// 8/7 bigger than the packed source. It is possible though when the packed data is
// placed at the right end of the ascii buffer with the free space on its left; for
// that reason each packed byte is copied to a local before the output is written.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
  constexpr uint8_t kLow7 = 0x7F;
  uint8_t prev = 0;

  for (; ascii_len >= 8; ascii_len -= 8) {
    for (unsigned shift = 0; shift < 7; ++shift) {
      const uint8_t cur = *bin;  // keep on stack in case we unpack inplace.
      // Leftover top bits of the previous byte plus the low bits of the current one.
      *ascii++ = (prev >> (8 - shift)) | ((cur << shift) & kLow7);
      prev = cur;
      ++bin;
    }
    // The last char of the group occupies the top 7 bits of the 7th packed byte.
    *ascii++ = prev >> 1;
  }

  DCHECK_LT(ascii_len, 8u);

  // Fewer than 8 chars remain; they were stored unpacked.
  while (ascii_len > 0) {
    *ascii++ = *bin++;
    --ascii_len;
  }
}
// Returns true if `packed` equals the packed form of the ascii string, without
// materializing either conversion. packed must be of length = binpacked_len(ascii_len).
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
  size_t left = ascii_len;

  for (; left >= 8; left -= 8, ascii += 8, packed += 7) {
    // Compare all 7 packed bytes of the group without branching, then decide once.
    bool same = true;
    for (unsigned k = 0; k < 7; ++k) {
      const uint8_t expected = (ascii[k] >> k) | (ascii[k + 1] << (7 - k));
      same &= (expected == packed[k]);
    }
    if (!same)
      return false;
  }

  // The remaining (< 8) chars are stored unpacked.
  for (size_t k = 0; k < left; ++k) {
    if (ascii[k] != packed[k])
      return false;
  }
  return true;
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif
} // namespace detail
} // namespace dfly

View file

@ -0,0 +1,36 @@
// Copyright 2022, Roman Gershman. All rights reserved.
// See LICENSE for licensing terms.
//
#pragma once
#include <cstddef>
#include <cstdint>
namespace dfly {
namespace detail {
// returns true if all bytes in src[0..len) are 7-bit values (0x00..0x7F), false otherwise.
bool validate_ascii_fast(const char* src, size_t len);
// unpacks 8->7 encoded blob back to ascii.
// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
// the source buffer.
// however, if binary data is positioned on the right of the ascii buffer with empty space on the
// left then we can unpack inplace.
void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
// variant of ascii_pack that compresses each full 8-byte group with mask-and-shift steps.
void ascii_pack2(const char* ascii, size_t len, uint8_t* bin);
// variant of ascii_pack that packs 16 input bytes per iteration using SSE instructions
// and passes the remaining tail to ascii_pack.
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin);
// compares packed and unpacked strings. packed must be of length = binpacked_len(ascii_len).
// returns true if packed matches the packed form of ascii.
bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len);
// Maps an ascii length to the length of its 7-bit packed form.
// Each 8 bytes are converted to 7 bytes; a partially filled last byte counts as a whole one.
inline constexpr size_t binpacked_len(size_t ascii_len) {
  const size_t payload_bits = ascii_len * 7;
  return (payload_bits + 7) / 8; /* rounded up */
}
} // namespace detail
} // namespace dfly

View file

@ -140,7 +140,7 @@ TEST_F(GenericFamilyTest, Rename) {
int64_t val = CheckedInt({"get", "x"});
ASSERT_EQ(kint64min, val); // does not exist
ASSERT_EQ(Run({"get", "b"}), x_val); // swapped.
ASSERT_EQ(x_val, Run({"get", "b"})); // swapped.
EXPECT_EQ(CheckedInt({"exists", "x", "b"}), 1);