
chore: prefetch keys during transaction scheduling

To demonstrate the improvement, I ran read-only traffic against a datastore prefilled with "debug populate 10000000 key 1000".
The traffic has a 100% miss rate, in order to zoom in on the flow this PR touches: looking up a key in the dashtable.

For the same reason I used pipelining, both to reduce the impact of networking CPU on the server side and to make the workload more memory-intensive.

This improvement:
1. Reduces the total running time by ~12% (equivalently, increases the average QPS by ~13%).
2. Reduces p99 latency by ~12%, from roughly 1780 to 1580 usec (see the breakdown right after this list).
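
These numbers are derived from the two run summaries quoted further below:

```
Total time:   8m21s (~501s)  ->  7m23s (~443s)   ~11.5% less wall time
Average QPS:  638722         ->  722347          ~13.1% higher
p99 latency:  ~1785 usec     ->  ~1579 usec      ~11.5% lower
```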

Credit for the idea:
https://valkey.io/blog/unlock-one-million-rps/
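
The pattern behind the change is simply "prefetch now, look up later": while a batch of transactions is collected for scheduling, the hash-table bucket of each transaction's key is prefetched, so the cache misses overlap with the rest of the batching work instead of being paid serially inside the lookup. Below is a minimal, self-contained sketch of that pattern; ToyTable and everything in it are illustrative stand-ins, not Dragonfly's DashTable, though the prefetch hint itself uses the same __builtin_prefetch intrinsic as the patch.

```cpp
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Toy open-addressing table, used only to illustrate the prefetch-then-lookup pattern.
struct ToyTable {
  struct Slot {
    uint64_t hash = 0;
    std::string key;
    bool used = false;
  };

  std::vector<Slot> slots = std::vector<Slot>(1u << 20);  // power-of-two size

  std::size_t SlotIndex(uint64_t h) const { return h & (slots.size() - 1); }

  // Step 1: warm the cache line of the slot the key hashes to (read access, low locality).
  void Prefetch(const std::string& key) const {
    uint64_t h = std::hash<std::string>{}(key);
    __builtin_prefetch(&slots[SlotIndex(h)], /*rw=*/0, /*locality=*/1);
  }

  // Step 2: the actual lookup, ideally hitting a line that is already in flight.
  const Slot* Find(const std::string& key) const {
    uint64_t h = std::hash<std::string>{}(key);
    const Slot& s = slots[SlotIndex(h)];
    return (s.used && s.hash == h && s.key == key) ? &s : nullptr;
  }
};

int main() {
  ToyTable table;
  std::vector<std::string> batch = {"k1", "k2", "k3", "k4"};

  // First pass over the batch: issue cheap, non-blocking prefetch hints.
  for (const auto& k : batch)
    table.Prefetch(k);

  // Second pass: perform the lookups; the memory loads overlap with the pass above.
  int misses = 0;
  for (const auto& k : batch)
    if (table.Find(k) == nullptr)
      ++misses;

  std::cout << "misses: " << misses << "\n";  // everything misses in this empty toy table
}
```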

The detailed runs:

Before this change:
```
~/projects/dragonfly/build-opt$ ./dfly_bench -p 6380  --qps=9500 --ratio=0:1 -h 10.142.15.215 -n 4000000 --key_prefix=k --proactor_threads=8 -c 10
Running 8 threads, sending 4000000 requests per each connection, or 320000000 requests overall
At a rate of 9500 rps per connection, i.e. request every 105us
Overall scheduled RPS: 760000
5s: 1.1% done, RPS(now/agg): 710271/710271, errs: 0, hitrate: 0.0%, clients: 80
done_min: 0.96%, done_max: 1.19%, p99_lat(us): 1786, max_pending: 11
10s: 2.2% done, RPS(now/agg): 703190/706730, errs: 0, hitrate: 0.0%, clients: 80
done_min: 1.90%, done_max: 2.38%, p99_lat(us): 1788, max_pending: 11
90s: 20.0% done, RPS(now/agg): 703373/711583, errs: 0, hitrate: 0.0%, clients: 80
done_min: 17.68%, done_max: 21.38%, p99_lat(us): 1778, max_pending: 11
345s: 76.3% done, RPS(now/agg): 734230/707276, errs: 0, hitrate: 0.0%, clients: 80
done_min: 68.83%, done_max: 81.94%, p99_lat(us): 1779, max_pending: 11
350s: 77.3% done, RPS(now/agg): 696489/707122, errs: 0, hitrate: 0.0%, clients: 80
done_min: 69.84%, done_max: 83.13%, p99_lat(us): 1778, max_pending: 11
450s: 97.3% done, RPS(now/agg): 400617/691734, errs: 0, hitrate: 0.0%, clients: 45
done_min: 89.85%, done_max: 100.00%, p99_lat(us): 1779, max_pending: 11
455s: 97.7% done, RPS(now/agg): 250114/686881, errs: 0, hitrate: 0.0%, clients: 24
done_min: 90.80%, done_max: 100.00%, p99_lat(us): 1780, max_pending: 11
460s: 98.0% done, RPS(now/agg): 179637/681368, errs: 0, hitrate: 0.0%, clients: 24
done_min: 91.76%, done_max: 100.00%, p99_lat(us): 1781, max_pending: 11
465s: 98.3% done, RPS(now/agg): 210018/676299, errs: 0, hitrate: 0.0%, clients: 24
done_min: 92.76%, done_max: 100.00%, p99_lat(us): 1781, max_pending: 11
470s: 98.6% done, RPS(now/agg): 184117/671063, errs: 0, hitrate: 0.0%, clients: 24
done_min: 93.72%, done_max: 100.00%, p99_lat(us): 1782, max_pending: 11
475s: 98.8% done, RPS(now/agg): 156475/665647, errs: 0, hitrate: 0.0%, clients: 19
done_min: 94.68%, done_max: 100.00%, p99_lat(us): 1783, max_pending: 11
480s: 99.0% done, RPS(now/agg): 148995/660265, errs: 0, hitrate: 0.0%, clients: 19
done_min: 95.65%, done_max: 100.00%, p99_lat(us): 1783, max_pending: 11
485s: 99.3% done, RPS(now/agg): 148889/654992, errs: 0, hitrate: 0.0%, clients: 19
done_min: 96.60%, done_max: 100.00%, p99_lat(us): 1784, max_pending: 11
490s: 99.5% done, RPS(now/agg): 148289/649822, errs: 0, hitrate: 0.0%, clients: 19
done_min: 97.55%, done_max: 100.00%, p99_lat(us): 1784, max_pending: 11
495s: 99.7% done, RPS(now/agg): 147537/644749, errs: 0, hitrate: 0.0%, clients: 19
done_min: 98.52%, done_max: 100.00%, p99_lat(us): 1785, max_pending: 11
500s: 100.0% done, RPS(now/agg): 145938/639761, errs: 0, hitrate: 0.0%, clients: 11
done_min: 99.51%, done_max: 100.00%, p99_lat(us): 1785, max_pending: 11

Total time: 8m21.153171955s. Overall number of requests: 320000000, QPS: 638722
Latency summary, all times are in usec:
Count: 320000000 Average: 903.4699  StdDev: 2207414.49
Min: 53.0000  Median: 900.5397  Max: 13940.0000
------------------------------------------------------
[      50,      60 ) 98   0.000%   0.000%
[      60,      70 ) 1368   0.000%   0.000%
[      70,      80 ) 6217   0.002%   0.002%
[      80,      90 ) 17120   0.005%   0.008%
[      90,     100 ) 36010   0.011%   0.019%
[     100,     120 ) 168280   0.053%   0.072%
[     120,     140 ) 429397   0.134%   0.206%
[     140,     160 ) 868176   0.271%   0.477%
[     160,     180 ) 1513899   0.473%   0.950%
[     180,     200 ) 2299055   0.718%   1.669%
[     200,     250 ) 8282542   2.588%   4.257% #
[     250,     300 ) 10372276   3.241%   7.498% #
[     300,     350 ) 11892829   3.717%  11.215% #
[     350,     400 ) 12378963   3.868%  15.083% #
[     400,     450 ) 11577678   3.618%  18.701% #
[     450,     500 ) 10591660   3.310%  22.011% #
[     500,     600 ) 20705038   6.470%  28.481% #
[     600,     700 ) 22463042   7.020%  35.501% #
[     700,     800 ) 23769529   7.428%  42.929% #
[     800,     900 ) 22512946   7.035%  49.964% #
[     900,    1000 ) 21098245   6.593%  56.558% #
[    1000,    1200 ) 48858666  15.268%  71.826% ###
[    1200,    1400 ) 49938490  15.606%  87.432% ###
[    1400,    1600 ) 28313693   8.848%  96.280% ##
[    1600,    1800 ) 9371830   2.929%  99.208% #
[    1800,    2000 ) 1656441   0.518%  99.726%
[    2000,    2500 ) 392161   0.123%  99.849%
[    2500,    3000 ) 128840   0.040%  99.889%
[    3000,    3500 ) 121288   0.038%  99.927%
[    3500,    4000 ) 91733   0.029%  99.955%
[    4000,    4500 ) 60773   0.019%  99.974%
[    4500,    5000 ) 36645   0.011%  99.986%
[    5000,    6000 ) 30751   0.010%  99.996%
[    6000,    7000 ) 7415   0.002%  99.998%
[    7000,    8000 ) 1478   0.000%  99.998%
[    8000,    9000 ) 1072   0.000%  99.999%
[    9000,   10000 ) 1199   0.000%  99.999%
[   10000,   12000 ) 1897   0.001% 100.000%
[   12000,   14000 ) 1260   0.000% 100.000%
```

With this change:

```
Running 8 threads, sending 4000000 requests per each connection, or 320000000 requests overall
At a rate of 9500 rps per connection, i.e. request every 105us
Overall scheduled RPS: 760000
5s: 1.2% done, RPS(now/agg): 757514/757514, errs: 0, hitrate: 0.0%, clients: 80
done_min: 1.16%, done_max: 1.19%, p99_lat(us): 1527, max_pending: 11
10s: 2.4% done, RPS(now/agg): 753364/755439, errs: 0, hitrate: 0.0%, clients: 80
done_min: 2.27%, done_max: 2.38%, p99_lat(us): 1560, max_pending: 11
15s: 3.5% done, RPS(now/agg): 753031/754636, errs: 0, hitrate: 0.0%, clients: 80
330s: 77.6% done, RPS(now/agg): 753779/752887, errs: 0, hitrate: 0.0%, clients: 80
done_min: 74.12%, done_max: 78.38%, p99_lat(us): 1578, max_pending: 11
done_min: 96.63%, done_max: 100.00%, p99_lat(us): 1579, max_pending: 11
435s: 99.7% done, RPS(now/agg): 137773/733153, errs: 0, hitrate: 0.0%, clients: 15
done_min: 97.77%, done_max: 100.00%, p99_lat(us): 1579, max_pending: 11
440s: 99.9% done, RPS(now/agg): 134162/726347, errs: 0, hitrate: 0.0%, clients: 15
done_min: 98.88%, done_max: 100.00%, p99_lat(us): 1579, max_pending: 11

Total time: 7m23.464824086s. Overall number of requests: 320000000, QPS: 722347
Latency summary, all times are in usec:
Count: 320000000 Average: 826.7950  StdDev: 2009589.73
Min: 51.0000  Median: 857.0704  Max: 23549.0000
------------------------------------------------------
[      50,      60 ) 95   0.000%   0.000%
[      60,      70 ) 524   0.000%   0.000%
[      70,      80 ) 1715   0.001%   0.001%
[      80,      90 ) 5620   0.002%   0.002%
[      90,     100 ) 14380   0.004%   0.007%
[     100,     120 ) 88375   0.028%   0.035%
[     120,     140 ) 270640   0.085%   0.119%
[     140,     160 ) 610742   0.191%   0.310%
[     160,     180 ) 1182863   0.370%   0.680%
[     180,     200 ) 2054392   0.642%   1.322%
[     200,     250 ) 8804939   2.752%   4.073% #
[     250,     300 ) 12475349   3.899%   7.972% #
[     300,     350 ) 15107581   4.721%  12.693% #
[     350,     400 ) 16456965   5.143%  17.836% #
[     400,     450 ) 15996109   4.999%  22.834% #
[     450,     500 ) 14600129   4.563%  27.397% #
[     500,     600 ) 25648291   8.015%  35.412% ##
[     600,     700 ) 20320301   6.350%  41.762% #
[     700,     800 ) 16566820   5.177%  46.939% #
[     800,     900 ) 17161547   5.363%  52.302% #
[     900,    1000 ) 24021013   7.507%  59.809% ##
[    1000,    1200 ) 69190350  21.622%  81.431% ####
[    1200,    1400 ) 45721447  14.288%  95.719% ###
[    1400,    1600 ) 11667667   3.646%  99.365% #
[    1600,    1800 ) 1247125   0.390%  99.755%
[    1800,    2000 ) 160430   0.050%  99.805%
[    2000,    2500 ) 133001   0.042%  99.846%
[    2500,    3000 ) 129180   0.040%  99.887%
[    3000,    3500 ) 131104   0.041%  99.928%
[    3500,    4000 ) 99134   0.031%  99.959%
[    4000,    4500 ) 60951   0.019%  99.978%
[    4500,    5000 ) 36908   0.012%  99.989%
[    5000,    6000 ) 25643   0.008%  99.997%
[    6000,    7000 ) 3980   0.001%  99.999%
[    7000,    8000 ) 2088   0.001%  99.999%
[    8000,    9000 ) 829   0.000%  99.999%
[    9000,   10000 ) 251   0.000% 100.000%
[   10000,   12000 ) 147   0.000% 100.000%
[   12000,   14000 ) 1129   0.000% 100.000%
[   14000,   16000 ) 80   0.000% 100.000%
[   18000,   20000 ) 9   0.000% 100.000%
[   20000,   25000 ) 157   0.000% 100.000%
```

Signed-off-by: Roman Gershman <roman@dragonflydb.io>

3 changed files with 56 additions and 7 deletions


```diff
@@ -149,6 +149,9 @@ class DashTable : public detail::DashTableBase {
   template <typename U> const_iterator Find(U&& key) const;
   template <typename U> iterator Find(U&& key);
 
+  // Prefetches the memory where the key would reside into the cache.
+  template <typename U> void Prefetch(U&& key) const;
+
   // Find first entry with given key hash that evaluates to true on pred.
   // Pred accepts either (const key&) or (const key&, const value&)
   template <typename Pred> iterator FindFirst(uint64_t key_hash, Pred&& pred);
@@ -699,6 +702,14 @@ auto DashTable<_Key, _Value, Policy>::Find(U&& key) -> iterator {
   return FindFirst(DoHash(key), EqPred(key));
 }
 
+template <typename _Key, typename _Value, typename Policy>
+template <typename U>
+void DashTable<_Key, _Value, Policy>::Prefetch(U&& key) const {
+  uint64_t key_hash = DoHash(key);
+  uint32_t seg_id = SegmentId(key_hash);
+  segment_[seg_id]->Prefetch(key_hash);
+}
+
 template <typename _Key, typename _Value, typename Policy>
 template <typename Pred>
 auto DashTable<_Key, _Value, Policy>::FindFirst(uint64_t key_hash, Pred&& pred) -> iterator {
```


```diff
@@ -502,6 +502,7 @@ template <typename _Key, typename _Value, typename Policy = DefaultSegmentPolicy
   // Find item with given key hash and truthy predicate
   template <typename Pred> Iterator FindIt(Hash_t key_hash, Pred&& pred) const;
 
+  void Prefetch(Hash_t key_hash) const;
 
   // Returns valid iterator if succeeded or invalid if not (it's full).
   // Requires: key should be not present in the segment.
@@ -1188,6 +1189,18 @@ auto Segment<Key, Value, Policy>::FindIt(Hash_t key_hash, Pred&& pred) const ->
   return Iterator{};
 }
 
+template <typename Key, typename Value, typename Policy>
+void Segment<Key, Value, Policy>::Prefetch(Hash_t key_hash) const {
+  uint8_t bidx = BucketIndex(key_hash);
+  const Bucket& target = bucket_[bidx];
+  uint8_t nid = NextBid(bidx);
+  const Bucket& probe = bucket_[nid];
+
+  // Prefetch the buckets that might contain the key with high probability.
+  __builtin_prefetch(&target, 0, 1);
+  __builtin_prefetch(&probe, 0, 1);
+}
+
 template <typename Key, typename Value, typename Policy>
 template <typename Cb>
 void Segment<Key, Value, Policy>::TraverseAll(Cb&& cb) const {
```
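
For reference: in GCC/Clang, __builtin_prefetch(addr, rw, locality) takes a read/write flag (0 = read, 1 = write) and a temporal-locality hint from 0 to 3. The (0, 1) arguments above therefore request a read prefetch with low temporal locality: the bucket is expected to be read once shortly afterwards rather than kept resident in cache.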


```diff
@@ -1171,6 +1171,8 @@ void Transaction::ScheduleBatchInShard() {
   ShardId sid = shard->shard_id();
   auto& sq = schedule_queues[sid];
 
+  array<ScheduleContext*, 32> batch;
+
   for (unsigned j = 0;; ++j) {
     // We pull the items from the queue in a loop until we reach the stop condition.
     // TODO: we may have fairness problem here, where transactions being added up all the time
@@ -1178,16 +1180,39 @@ void Transaction::ScheduleBatchInShard() {
     // because we must ensure that there is another ScheduleBatchInShard callback in the queue.
     // Can be checked with testing sq.armed is true when j == 1.
     while (true) {
-      ScheduleContext* item = sq.queue.Pop();
-      if (!item)
+      unsigned len = 0;
+      for (; len < batch.size(); ++len) {
+        ScheduleContext* item = sq.queue.Pop();
+        if (!item)
+          break;
+        batch[len] = item;
+
+        if (!item->trans->IsGlobal()) {
+          auto shard_args = item->trans->GetShardArgs(sid);
+          // Can be empty if the transaction is not touching any keys and is
+          // NO_KEY_TRANSACTIONAL.
+          if (!shard_args.Empty()) {
+            auto& db_slice = item->trans->GetDbSlice(shard->shard_id());
+            // We could prefetch all the keys but this is enough to test the optimization for
+            // single key operations.
+            db_slice.GetDBTable(item->trans->GetDbIndex())->prime.Prefetch(shard_args.Front());
+          }
+        }
+      }
+
+      if (len == 0)
         break;
 
-      if (!item->trans->ScheduleInShard(shard, item->optimistic_execution)) {
-        item->fail_cnt.fetch_add(1, memory_order_relaxed);
+      stats.tx_batch_scheduled_items_total += len;
+      for (unsigned i = 0; i < len; ++i) {
+        ScheduleContext* item = batch[i];
+        if (!item->trans->ScheduleInShard(shard, item->optimistic_execution)) {
+          item->fail_cnt.fetch_add(1, memory_order_relaxed);
+        }
+        item->trans->FinishHop();
       }
-      item->trans->FinishHop();
-      stats.tx_batch_scheduled_items_total++;
     };
   }
 
   // j==1 means we already signalled that we're done with the current batch.
   if (j == 1)
```
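
A note on the scheduling change itself: the loop is now two-pass. The first pass drains up to 32 ScheduleContext items from the schedule queue into a local batch and issues a prefetch for each transaction's first key; only the second pass calls ScheduleInShard and FinishHop, so by the time a key is actually looked up its bucket has had a chance to arrive in cache. As the in-code comment notes, prefetching only shard_args.Front() is enough to validate the optimization for single-key operations; prefetching all keys of multi-key transactions is left for later.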