feat: introduce simd algorithm for bitpacking (#568)

My benchmark shows a 3.5x improvement when compressing a 1KB string. Signed-off-by: Roman Gershman <roman@dragonflydb.io>
2024-12-14 11:58:02 +00:00 · 2022-12-17 19:22:40 +02:00 · 2022-12-17 19:22:40 +02:00 · bcafd7e25d
commit bcafd7e25d
parent adc89c7592
8 changed files with 331 additions and 142 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -77,6 +77,6 @@ jobs:
          ccache --show-stats
          echo Run ctest -V -L DFLY
          #GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1
-          GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=2,snapshot=2 ctest -V -L DFLY
+          GLOG_logtostderr=1 GLOG_vmodule=rdb_load=1,rdb_save=1,snapshot=1 ctest -V -L DFLY
          ./dragonfly_test  --mem_defrag_threshold=0.05 # trying to catch issue with defrag
          # GLOG_logtostderr=1 GLOG_vmodule=transaction=1,engine_shard_set=1 CTEST_OUTPUT_ON_FAILURE=1 ninja server/test
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -1,6 +1,8 @@
 add_library(dfly_core compact_object.cc dragonfly_core.cc extent_tree.cc
    external_alloc.cc interpreter.cc json_object.cc mi_memory_resource.cc
-    segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc)
+    segment_allocator.cc small_string.cc tx_queue.cc dense_set.cc string_set.cc
+    detail/bitpacking.cc)
+
 cxx_link(dfly_core base absl::flat_hash_map absl::str_format redis_lib TRDP::lua lua_modules
    Boost::fiber TRDP::jsoncons crypto)

--- a/src/core/compact_object.cc
+++ b/src/core/compact_object.cc
@ -23,19 +23,15 @@ extern "C" {
 #include "base/flags.h"
 #include "base/logging.h"
 #include "base/pod_array.h"
+#include "core/detail/bitpacking.h"
 #include "core/string_set.h"

-#if defined(__aarch64__)
-#include "base/sse2neon.h"
-#else
-#include <emmintrin.h>
-#endif
-
 ABSL_FLAG(bool, use_set2, true, "If true use DenseSet for an optimized set data structure");

 namespace dfly {
 using namespace std;
 using absl::GetFlag;
+using detail::binpacked_len;

 namespace {

@ -154,35 +150,6 @@ inline void FreeObjStream(void* ptr) {
  freeStream((stream*)ptr);
 }

-// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
-// See https://github.com/lemire/fastvalidate-utf-8/
-// The function returns true (1) if all chars passed in src are
-// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
-bool validate_ascii_fast(const char* src, size_t len) {
-  size_t i = 0;
-  __m128i has_error = _mm_setzero_si128();
-  if (len >= 16) {
-    for (; i <= len - 16; i += 16) {
-      __m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i));
-      has_error = _mm_or_si128(has_error, current_bytes);
-    }
-  }
-  int error_mask = _mm_movemask_epi8(has_error);
-
-  char tail_has_error = 0;
-  for (; i < len; i++) {
-    tail_has_error |= src[i];
-  }
-  error_mask |= (tail_has_error & 0x80);
-
-  return !error_mask;
-}
-
-// maps ascii len to 7-bit packed length. Each 8 bytes are converted to 7 bytes.
-inline constexpr size_t binpacked_len(size_t ascii_len) {
-  return (ascii_len * 7 + 7) / 8; /* rounded up */
-}
-
 // converts 7-bit packed length back to ascii length. Note that this conversion
 // is not accurate since it maps 7 bytes to 8 bytes (rounds up), while we may have
 // 7 byte strings converted to 7 byte as well.
@ -428,91 +395,6 @@ void RobjWrapper::MakeInnerRoom(size_t current_cap, size_t desired, pmr::memory_
  inner_obj_ = newp;
 }

-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC push_options
-#pragma GCC optimize("Ofast")
-#endif
-
-// len must be at least 16
-void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
-  const char* end = ascii + len;
-
-  unsigned i = 0;
-  while (ascii + 8 <= end) {
-    for (i = 0; i < 7; ++i) {
-      *bin++ = (ascii[0] >> i) | (ascii[1] << (7 - i));
-      ++ascii;
-    }
-    ++ascii;
-  }
-
-  // epilog - we do not pack since we have less than 8 bytes.
-  while (ascii < end) {
-    *bin++ = *ascii++;
-  }
-}
-
-// unpacks 8->7 encoded blob back to ascii.
-// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
-// the source buffer.
-// however, if binary data is positioned on the right of the ascii buffer with empty space on the
-// left than we can unpack inplace.
-void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
-  constexpr uint8_t kM = 0x7F;
-  uint8_t p = 0;
-  unsigned i = 0;
-
-  while (ascii_len >= 8) {
-    for (i = 0; i < 7; ++i) {
-      uint8_t src = *bin;  // keep on stack in case we unpack inplace.
-      *ascii++ = (p >> (8 - i)) | ((src << i) & kM);
-      p = src;
-      ++bin;
-    }
-
-    ascii_len -= 8;
-    *ascii++ = p >> 1;
-  }
-
-  DCHECK_LT(ascii_len, 8u);
-  for (i = 0; i < ascii_len; ++i) {
-    *ascii++ = *bin++;
-  }
-}
-
-// compares packed and unpacked strings. packed must be of length = binpacked_len(ascii_len).
-bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
-  unsigned i = 0;
-  bool res = true;
-  const char* end = ascii + ascii_len;
-
-  while (ascii + 8 <= end) {
-    for (i = 0; i < 7; ++i) {
-      uint8_t conv = (ascii[0] >> i) | (ascii[1] << (7 - i));
-      res &= (conv == *packed);
-      ++ascii;
-      ++packed;
-    }
-
-    if (!res)
-      return false;
-
-    ++ascii;
-  }
-
-  while (ascii < end) {
-    if (*ascii++ != *packed++) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-#if defined(__GNUC__) && !defined(__clang__)
-#pragma GCC pop_options
-#endif
-
 }  // namespace detail

 using namespace std;
@ -777,7 +659,7 @@ void CompactObj::SetString(std::string_view str) {
  DCHECK_GT(str.size(), kInlineLen);

  string_view encoded = str;
-  bool is_ascii = kUseAsciiEncoding && validate_ascii_fast(str.data(), str.size());
+  bool is_ascii = kUseAsciiEncoding && detail::validate_ascii_fast(str.data(), str.size());

  if (is_ascii) {
    size_t encode_len = binpacked_len(str.size());
@ -792,7 +674,7 @@ void CompactObj::SetString(std::string_view str) {
    }

    tl.tmp_buf.resize(encode_len);
-    detail::ascii_pack(str.data(), str.size(), tl.tmp_buf.data());
+    detail::ascii_pack_simd(str.data(), str.size(), tl.tmp_buf.data());
    encoded = string_view{reinterpret_cast<char*>(tl.tmp_buf.data()), encode_len};

    if (encoded.size() <= kInlineLen) {
@ -1125,7 +1007,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
    if (u_.r_obj.Size() != encode_len)
      return false;

-    if (!validate_ascii_fast(sv.data(), sv.size()))
+    if (!detail::validate_ascii_fast(sv.data(), sv.size()))
      return false;

    return detail::compare_packed(to_byte(u_.r_obj.inner_obj()), sv.data(), sv.size());
@ -1139,7 +1021,7 @@ bool CompactObj::CmpEncoded(string_view sv) const {
    if (u_.small_str.size() != encode_len)
      return false;

-    if (!validate_ascii_fast(sv.data(), sv.size()))
+    if (!detail::validate_ascii_fast(sv.data(), sv.size()))
      return false;

    // We need to compare an unpacked sv with 2 packed parts.
--- a/src/core/compact_object.h
+++ b/src/core/compact_object.h
@ -76,16 +76,6 @@ class RobjWrapper {

 } __attribute__((packed));

-// unpacks 8->7 encoded blob back to ascii.
-// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
-// the source buffer.
-// however, if binary data is positioned on the right of the ascii buffer with empty space on the
-// left than we can unpack inplace.
-void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
-
-// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
-void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
-
 }  // namespace detail

 class CompactObj {
--- a/src/core/compact_object_test.cc
+++ b/src/core/compact_object_test.cc
@ -12,6 +12,7 @@

 #include "base/gtest.h"
 #include "base/logging.h"
+#include "core/detail/bitpacking.h"
 #include "core/flat_set.h"
 #include "core/json_object.h"
 #include "core/mi_memory_resource.h"
@ -189,13 +190,24 @@ TEST_F(CompactObjectTest, AsciiUtil) {
  std::string_view data{"aaaaaabb"};
  uint8_t buf[32];

-  char ascii2[] = "xxxxxxxxxxxxxx";
-  detail::ascii_pack(data.data(), 7, buf);
-  detail::ascii_unpack(buf, 7, ascii2);
+  char outbuf[32] = "xxxxxxxxxxxxxx";
+  detail::ascii_pack_simd(data.data(), 7, buf);
+  detail::ascii_unpack(buf, 7, outbuf);

-  ASSERT_EQ('x', ascii2[7]) << ascii2;
-  std::string_view actual{ascii2, 7};
+  ASSERT_EQ('x', outbuf[7]) << outbuf;
+  std::string_view actual{outbuf, 7};
  ASSERT_EQ(data.substr(0, 7), actual);
+
+  string data3;
+  for (unsigned i = 0; i < 97; ++i) {
+    data3.append("12345678910");
+  }
+  string act_str(data3.size(), 'y');
+  std::vector<uint8_t> binvec(detail::binpacked_len(data3.size()));
+  detail::ascii_pack_simd(data3.data(), data3.size(), binvec.data());
+  detail::ascii_unpack(binvec.data(), data3.size(), act_str.data());
+
+  ASSERT_EQ(data3, act_str);
 }

 TEST_F(CompactObjectTest, IntSet) {
@ -453,4 +465,62 @@ TEST_F(CompactObjectTest, JsonTypeWithPathTest) {
  }
 }

+// Reference packer used as the benchmark baseline: emits 7 bytes per full group of
+// 8 chars, producing one output byte at a time.
+static void ascii_pack_naive(const char* ascii, size_t len, uint8_t* bin) {
+  const char* const stop = ascii + len;
+
+  for (; ascii + 8 <= stop; ascii += 8, bin += 7) {
+    for (unsigned k = 0; k < 7; ++k) {
+      // Output byte k takes the remaining high bits of char k and the low bits of char k + 1.
+      bin[k] = (ascii[k] >> k) | (ascii[k + 1] << (7 - k));
+    }
+  }
+
+  // Fewer than 8 chars are left - they are copied without packing.
+  for (; ascii < stop; ++ascii, ++bin) {
+    *bin = *ascii;
+  }
+}
+
+// Measures the byte-by-byte reference packer on a 1KB string of 'a'.
+static void BM_PackNaive(benchmark::State& state) {
+  const string input(1024, 'a');
+  uint8_t packed[1024];
+  while (state.KeepRunning()) {
+    ascii_pack_naive(input.data(), input.size(), packed);
+  }
+}
+BENCHMARK(BM_PackNaive);
+
+// Measures detail::ascii_pack (the scalar 64-bit packer) on a 1KB string of 'a'.
+static void BM_Pack(benchmark::State& state) {
+  const string input(1024, 'a');
+  uint8_t packed[1024];
+  while (state.KeepRunning()) {
+    detail::ascii_pack(input.data(), input.size(), packed);
+  }
+}
+BENCHMARK(BM_Pack);
+
+// Measures detail::ascii_pack2 (the Compress8x7bit based packer) on a 1KB string of 'a'.
+static void BM_Pack2(benchmark::State& state) {
+  string val(1024, 'a');
+  uint8_t buf[1024];
+  while (state.KeepRunning()) {
+    detail::ascii_pack2(val.data(), val.size(), buf);
+  }
+}
+BENCHMARK(BM_Pack2);
+
+// Measures detail::ascii_pack_simd (the 128-bit vector packer) on a 1KB string of 'a'.
+static void BM_PackSimd(benchmark::State& state) {
+  const string input(1024, 'a');
+  uint8_t packed[1024];
+  while (state.KeepRunning()) {
+    detail::ascii_pack_simd(input.data(), input.size(), packed);
+  }
+}
+BENCHMARK(BM_PackSimd);
+
 }  // namespace dfly
--- a/src/core/detail/bitpacking.cc
+++ b/src/core/detail/bitpacking.cc
@ -0,0 +1,209 @@
+// Copyright 2022, Roman Gershman.  All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#include "src/core/detail/bitpacking.h"
+
+#include "base/logging.h"
+
+#if defined(__aarch64__)
+#include "base/sse2neon.h"
+#else
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#endif
+#include <absl/base/internal/endian.h>
+
+namespace dfly {
+
+namespace detail {
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC push_options
+#pragma GCC optimize("Ofast")
+#endif
+
+// Squeezes the low 7 bits of every byte of x into the low 56 bits (byte 0 first).
+static inline uint64_t Compress8x7bit(uint64_t x) {
+  // Merge adjacent fields pairwise: 8 x 7 bits -> 4 x 14 bits -> 2 x 28 bits -> 56 bits.
+  x = (x & 0x007F007F007F007F) | ((x & 0x7F007F007F007F00) >> 1);
+  x = (x & 0x00003FFF00003FFF) | ((x & 0x3FFF00003FFF0000) >> 2);
+  return (x & 0x000000000FFFFFFF) | ((x & 0x0FFFFFFF00000000) >> 4);
+}
+
+// Daniel Lemire's function validate_ascii_fast() - under Apache/MIT license.
+// See https://github.com/lemire/fastvalidate-utf-8/
+// Returns true if every one of the len bytes at src is a 7-bit value
+// (0x00..0x7F), and false otherwise.
+bool validate_ascii_fast(const char* src, size_t len) {
+  // OR all full 16-byte chunks together; a set high bit in any byte survives the OR.
+  __m128i acc = _mm_setzero_si128();
+  size_t pos = 0;
+  for (; pos + 16 <= len; pos += 16) {
+    const __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + pos));
+    acc = _mm_or_si128(acc, chunk);
+  }
+
+  // Fold the remaining (fewer than 16) bytes the same way, one at a time.
+  char tail_acc = 0;
+  for (; pos < len; ++pos) {
+    tail_acc |= src[pos];
+  }
+
+  // movemask collects the high bit of each of the 16 accumulated bytes.
+  const int high_bits = _mm_movemask_epi8(acc) | (tail_acc & 0x80);
+  return high_bits == 0;
+}
+
+// Packs len ascii bytes (any len, not validated): each full group of 8 bytes becomes 7 bytes,
+// 7 bits per char with the first char in the lowest bits; a tail of < 8 bytes is copied as is.
+void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
+  const char* end = ascii + len;
+  while (ascii + 8 <= end) {
+    uint64_t val = absl::little_endian::Load64(ascii);
+    uint64_t dest = (val & 0xFF);
+    for (unsigned i = 1; i <= 7; ++i) {
+      val >>= 1;
+      // uint64_t, not "UL": unsigned long is 32 bits on LLP64 targets and the shift reaches 49.
+      dest |= (val & (uint64_t{0x7F} << 7 * i));
+    }
+    memcpy(bin, &dest, 7);
+    bin += 7;
+    ascii += 8;
+  }
+
+  // epilog - we do not pack since we have less than 8 bytes.
+  while (ascii < end) {
+    *bin++ = *ascii++;
+  }
+}
+
+// Variant of ascii_pack that squeezes each full group of 8 input bytes into 7 output
+// bytes with Compress8x7bit. A tail shorter than 8 bytes is copied unpacked.
+void ascii_pack2(const char* ascii, size_t len, uint8_t* bin) {
+  const char* const stop = ascii + len;
+
+  for (; ascii + 8 <= stop; ascii += 8, bin += 7) {
+    const uint64_t word = absl::little_endian::Load64(ascii);
+    const uint64_t packed = Compress8x7bit(word);
+    // The compressed value occupies 56 bits, hence 7 bytes are stored.
+    memcpy(bin, &packed, 7);
+  }
+
+  // Fewer than 8 bytes are left - they are not packed.
+  for (; ascii < stop; ++ascii, ++bin) {
+    *bin = *ascii;
+  }
+}
+
+// The algo - do in parallel what ascii_pack2 does, on two uint64_t integers at a time.
+// Any len is accepted (including 0 and len < 32); the input is not validated.
+void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
+  // Skips 8th byte (index 7) in the lower 8-byte part.
+  const __m128i control = _mm_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);
+
+  // Every iteration consumes 16 bytes but stores full 16 bytes into bin instead of 14
+  // (the two extra bytes are overwritten by the next store or by ascii_pack).
+  // To prevent data overwrite the loop runs only while at least 32 input bytes remain
+  // and the rest is left to ascii_pack. The remainder is tracked as a count because
+  // forming "ascii + len - 32" is undefined behavior whenever len < 32.
+  size_t left = len;
+
+  // Based on the question I asked here: https://stackoverflow.com/q/74831843/2280111
+  for (; left >= 32; left -= 16) {
+    __m128i val = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ascii));
+
+    /*
+    x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
+    x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
+    x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);
+    */
+
+    __m128i rpart = _mm_and_si128(val, _mm_set1_epi64x(0x007F007F007F007F));
+    __m128i lpart = _mm_and_si128(val, _mm_set1_epi64x(0x7F007F007F007F00));
+    val = _mm_or_si128(_mm_srli_epi64(lpart, 1), rpart);
+
+    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x00003FFF00003FFF));
+    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x3FFF00003FFF0000));
+    val = _mm_or_si128(_mm_srli_epi64(lpart, 2), rpart);
+
+    rpart = _mm_and_si128(val, _mm_set1_epi64x(0x000000000FFFFFFF));
+    lpart = _mm_and_si128(val, _mm_set1_epi64x(0x0FFFFFFF00000000));
+    val = _mm_or_si128(_mm_srli_epi64(lpart, 4), rpart);
+
+    // Remove the empty top byte of the low half so that the 14 payload bytes are adjacent.
+    val = _mm_shuffle_epi8(val, control);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(bin), val);
+    bin += 14;
+    ascii += 16;
+  }
+
+  // Fewer than 32 bytes are left (possibly none): pack them with the scalar routine.
+  // No assertion on the remainder is needed since ascii_pack handles an empty input.
+  ascii_pack(ascii, left, bin);
+}
+
+// Unpacks a blob produced by ascii_pack back into ascii_len chars: every 7 packed bytes
+// expand to 8 chars and the last ascii_len % 8 bytes are copied as is.
+// Generally, we can not unpack inplace because the ascii (dest) buffer is 8/7 bigger than
+// the source buffer. However, if the binary data is positioned on the right of the ascii
+// buffer with empty space on the left, then we can unpack inplace.
+void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii) {
+  constexpr uint8_t kM = 0x7F;  // keeps the low 7 bits of a byte
+  uint8_t p = 0;  // previously read packed byte
+  unsigned i = 0;
+
+  while (ascii_len >= 8) {
+    for (i = 0; i < 7; ++i) {
+      uint8_t src = *bin;  // keep on stack in case we unpack inplace.
+      *ascii++ = (p >> (8 - i)) | ((src << i) & kM);
+      p = src;
+      ++bin;
+    }
+
+    ascii_len -= 8;
+    *ascii++ = p >> 1;  // 8th char: the top 7 bits of the 7th packed byte
+  }
+
+  DCHECK_LT(ascii_len, 8u);
+  for (i = 0; i < ascii_len; ++i) {
+    *ascii++ = *bin++;
+  }
+}
+
+// Returns true iff packed equals the packed representation of the ascii_len chars at ascii.
+// packed must be of length = binpacked_len(ascii_len).
+bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len) {
+  const char* const stop = ascii + ascii_len;
+
+  // Full groups: re-pack 8 chars into 7 bytes on the fly and match them against the blob.
+  for (; ascii + 8 <= stop; ascii += 8, packed += 7) {
+    bool same = true;
+    for (unsigned k = 0; k < 7; ++k) {
+      const uint8_t expect = (ascii[k] >> k) | (ascii[k + 1] << (7 - k));
+      same &= (expect == packed[k]);
+    }
+
+    // Checked once per group, keeping the inner loop free of branches.
+    if (!same) {
+      return false;
+    }
+  }
+
+  // A tail of fewer than 8 chars is stored unpacked - compare it byte by byte.
+  for (; ascii < stop; ++ascii, ++packed) {
+    if (*ascii != *packed) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+}  // namespace detail
+
+}  // namespace dfly
--- a/src/core/detail/bitpacking.h
+++ b/src/core/detail/bitpacking.h
@ -0,0 +1,36 @@
+// Copyright 2022, Roman Gershman.  All rights reserved.
+// See LICENSE for licensing terms.
+//
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace dfly {
+
+namespace detail {
+
+bool validate_ascii_fast(const char* src, size_t len);  // true iff all len bytes are <= 0x7F
+
+// unpacks 8->7 encoded blob back to ascii.
+// generally, we can not unpack inplace because ascii (dest) buffer is 8/7 bigger than
+// the source buffer.
+// however, if binary data is positioned on the right of the ascii buffer with empty space on the
+// left, then we can unpack inplace.
+void ascii_unpack(const uint8_t* bin, size_t ascii_len, char* ascii);
+
+// packs ascii string (does not verify) into binary form saving 1 bit per byte on average (12.5%).
+void ascii_pack(const char* ascii, size_t len, uint8_t* bin);
+void ascii_pack2(const char* ascii, size_t len, uint8_t* bin);  // same contract, other algorithm
+
+void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin);  // same contract, SSE based
+bool compare_packed(const uint8_t* packed, const char* ascii, size_t ascii_len);
+
+// Number of bytes needed to store ascii_len chars at 7 bits per char, rounded up.
+inline constexpr size_t binpacked_len(size_t ascii_len) {
+  return (7 * ascii_len + 7) / 8;
+}
+
+}  // namespace detail
+}  // namespace dfly
--- a/src/server/generic_family_test.cc
+++ b/src/server/generic_family_test.cc
@ -140,7 +140,7 @@ TEST_F(GenericFamilyTest, Rename) {
  int64_t val = CheckedInt({"get", "x"});
  ASSERT_EQ(kint64min, val);  // does not exist

-  ASSERT_EQ(Run({"get", "b"}), x_val);  // swapped.
+  ASSERT_EQ(x_val, Run({"get", "b"}));  // swapped.

  EXPECT_EQ(CheckedInt({"exists", "x", "b"}), 1);