update bench report

Signed-off-by: wineway <wangyuweihx@gmail.com>
wineway · Aug 20, 2024 · eb6fb3a · eb6fb3a
1 parent 873f447
commit eb6fb3a
Show file tree

Hide file tree

Showing 6 changed files with 92 additions and 33 deletions.
diff --git a/src/common/ring.hpp b/src/common/ring.hpp
@@ -5,8 +5,8 @@
 
 #include <cstdint>
 
-#include "ring_generic_allocator.hpp"
 #include "log.hpp"
+#include "ring_generic_allocator.hpp"
 
 static_assert(sizeof(std::atomic<uint32_t>) == sizeof(uint32_t), "");
 static_assert(sizeof(std::atomic<uint64_t>) == sizeof(uint64_t), "");
@@ -244,20 +244,38 @@ struct Ring {
     ) {
         bool success {};
         uint32_t max = n;
+        uint32_t cons_tail;
+        const uint32_t capacity = this->capacity;
+
+        old_head = std::atomic_load_explicit(
+            reinterpret_cast<volatile std::atomic<uint32_t>*>(&this->prod_.head_
+            ),
+            std::memory_order_relaxed
+        );
         do {
+            /* Reset n to the initial burst count */
             n = max;
-            old_head = this->prod_.head_;
-            /* add rmb barrier to avoid load/load reorder in weak
-             * memory model. It is noop on x86
+
+            /* Ensure the head is read before tail */
+            std::atomic_thread_fence(std::memory_order_acquire);
+
+            /* This load-acquire synchronizes with the store-release of
+             * ht->tail in update_tail.
              */
-            sqk_smp_rmb();
+            cons_tail = std::atomic_load_explicit(
+                reinterpret_cast<volatile std::atomic<uint32_t>*>(
+                    &this->cons_.tail_
+                ),
+                std::memory_order_acquire
+            );
+
             /*
              *  The subtraction is done between two unsigned 32bits value
              * (the result is always modulo 32 bits even if we have
              * *old_head > cons_tail). So 'free_entries' is always between 0
              * and capacity (which is < size).
              */
-            free_entries = (this->capacity + this->cons_.tail_ - old_head);
+            free_entries = (capacity + cons_tail - old_head);
             /* check that we have enough room in ring */
             if (unlikely(n > free_entries)) {
                 if constexpr (transactional_prod) {
@@ -266,17 +284,21 @@ struct Ring {
                     n = free_entries;
                 }
             }
+
             if (n == 0) {
                 return 0;
             }
+
             new_head = old_head + n;
             if constexpr (prod_sync_type == RingSyncType::SQK_RING_SYNC_MT) {
-                success = std::atomic_compare_exchange_strong(
+                success = std::atomic_compare_exchange_strong_explicit(
                     reinterpret_cast<volatile std::atomic<uint32_t>*>(
                         &this->prod_.head_
                     ),
                     &old_head,
-                    new_head
+                    new_head,
+                    std::memory_order_relaxed,
+                    std::memory_order_relaxed
                 );
             } else {
                 this->prod_.head_ = new_head, success = 1;
@@ -607,26 +629,40 @@ struct Ring {
         uint32_t& entries
     ) {
         unsigned int max = n;
+        uint32_t prod_tail;
         int success;
 
         /* move cons.head atomically */
+        old_head = std::atomic_load_explicit(
+            reinterpret_cast<volatile std::atomic<uint32_t>*>(&this->cons_.head_
+            ),
+            std::memory_order_relaxed
+        );
         do {
             /* Restore n as it may change every loop */
             n = max;
 
             old_head = this->cons_.head_;
 
-            /* add rmb barrier to avoid load/load reorder in weak
-             * memory model. It is noop on x86
+            /* Ensure the head is read before tail */
+            std::atomic_thread_fence(std::memory_order_acquire);
+
+            /* This load-acquire synchronizes with the store-release of
+             * ht->tail in update_tail.
              */
-            sqk_smp_rmb();
+            prod_tail = std::atomic_load_explicit(
+                reinterpret_cast<volatile std::atomic<uint32_t>*>(
+                    &this->prod_.tail_
+                ),
+                std::memory_order_acquire
+            );
 
             /* The subtraction is done between two unsigned 32bits value
              * (the result is always modulo 32 bits even if we have
              * cons_head > prod_tail). So 'entries' is always between 0
              * and size(ring)-1.
              */
-            entries = (this->prod_.tail_ - old_head);
+            entries = (prod_tail - old_head);
 
             /* Set the actual entries for dequeue */
             if (n > entries) {
@@ -643,7 +679,6 @@ struct Ring {
             new_head = old_head + n;
             if constexpr (cons_sync_type == RingSyncType::SQK_RING_SYNC_ST) {
                 this->cons_.head_ = new_head;
-                sqk_smp_rmb();
                 success = 1;
             } else {
                 success = std::atomic_compare_exchange_strong_explicit(
@@ -830,7 +865,13 @@ struct Ring {
                 return 0;
             }
             this->enqueue_elements(prod_head, &entry, n);
-            this->update_tail(this->prod_, prod_head, prod_next, prod_sync_type == RingSyncType::SQK_RING_SYNC_ST, 1);
+            this->update_tail(
+                this->prod_,
+                prod_head,
+                prod_next,
+                prod_sync_type == RingSyncType::SQK_RING_SYNC_ST,
+                1
+            );
             return n;
         } else if constexpr (prod_sync_type
                              == RingSyncType::SQK_RING_SYNC_MT_HTS) {
@@ -859,7 +900,13 @@ struct Ring {
                 return 0;
             }
             this->dequeue_elements(cons_head, &entry, n);
-            this->update_tail(this->cons_, cons_head, cons_next, cons_sync_type == RingSyncType::SQK_RING_SYNC_ST, 0);
+            this->update_tail(
+                this->cons_,
+                cons_head,
+                cons_next,
+                cons_sync_type == RingSyncType::SQK_RING_SYNC_ST,
+                0
+            );
             return n;
         } else if constexpr (cons_sync_type
                              == RingSyncType::SQK_RING_SYNC_MT_HTS) {
@@ -882,7 +929,8 @@ struct Ring {
 };
 
 template<typename T>
-using MpscRing = Ring<T, RingSyncType::SQK_RING_SYNC_MT, RingSyncType::SQK_RING_SYNC_ST>;
+using MpscRing =
+    Ring<T, RingSyncType::SQK_RING_SYNC_MT, RingSyncType::SQK_RING_SYNC_ST>;
 
 template<typename RingType>
 struct RingGuard {

diff --git a/src/tests/common/report.md b/src/tests/common/report.md
@@ -0,0 +1,13 @@
+|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
+|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
+|               14.39 |       69,492,842.98 |    0.0% |           48.02 |           30.19 |  1.591 |           7.00 |    0.0% |      1.72 | `mpsc_ring enqueue`
+|                4.36 |      229,421,611.95 |    0.0% |           41.02 |            9.14 |  4.487 |           4.00 |    0.0% |      0.52 | `spsc_ring enqueue`
+|               16.88 |       59,254,109.36 |    0.0% |           51.02 |           35.40 |  1.441 |           7.00 |    0.0% |      2.02 | `hts mpsc_ring enqueue`
+|                2.80 |      357,013,044.65 |    0.6% |           23.65 |            5.88 |  4.025 |           4.30 |    0.2% |      0.33 | `deque enqueue`
+|               17.90 |       55,876,668.16 |    0.1% |          186.00 |           37.54 |  4.955 |          42.00 |    0.0% |      2.14 | `list enqueue`
+|               14.35 |       69,682,196.62 |    0.0% |           48.02 |           30.10 |  1.595 |           7.00 |    0.0% |      1.71 | `mpsc_ring enqueue`
+|                4.33 |      231,032,774.43 |    0.0% |           41.02 |            9.08 |  4.518 |           4.00 |    0.0% |      0.52 | `spsc_ring enqueue`
+|               16.86 |       59,303,030.30 |    0.0% |           51.02 |           35.37 |  1.442 |           7.00 |    0.0% |      2.01 | `hts mpsc_ring enqueue`
+|                2.76 |      362,186,470.36 |    0.2% |           25.30 |            5.79 |  4.368 |           4.61 |    0.1% |      0.33 | `deque enqueue`
+|               17.14 |       58,357,336.08 |    0.1% |          186.00 |           35.94 |  5.175 |          42.00 |    0.0% |      2.05 | `list enqueue`
+
diff --git a/src/tests/common/report.txt b/src/tests/common/report.txt
diff --git a/src/tests/common/ring_bench.cc b/src/tests/common/ring_bench.cc
@@ -17,6 +17,7 @@ int main(int argc, char* argv[]) {
                 guard->enqueue(1);
                 int i;
                 guard->dequeue(i);
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -31,6 +32,7 @@ int main(int argc, char* argv[]) {
                 guard->enqueue(1);
                 int i;
                 guard->dequeue(i);
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -45,6 +47,7 @@ int main(int argc, char* argv[]) {
                 guard->enqueue(1);
                 int i;
                 guard->dequeue(i);
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -55,6 +58,7 @@ int main(int argc, char* argv[]) {
                 deq.push_back(1);
                 int i = deq.front();
                 deq.pop_front();
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -65,6 +69,7 @@ int main(int argc, char* argv[]) {
                 deq.push_back(1);
                 int i = deq.front();
                 deq.pop_front();
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -75,6 +80,7 @@ int main(int argc, char* argv[]) {
                 guard->enqueue(1);
                 uint64_t i;
                 guard->dequeue(i);
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -89,6 +95,7 @@ int main(int argc, char* argv[]) {
                 guard->enqueue(1);
                 uint64_t i;
                 guard->dequeue(i);
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -103,6 +110,7 @@ int main(int argc, char* argv[]) {
                 guard->enqueue(1);
                 uint64_t i;
                 guard->dequeue(i);
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -113,6 +121,7 @@ int main(int argc, char* argv[]) {
                 deq.push_back(1);
                 uint64_t i = deq.front();
                 deq.pop_front();
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
     {
@@ -123,6 +132,7 @@ int main(int argc, char* argv[]) {
                 deq.push_back(1);
                 uint64_t i = deq.front();
                 deq.pop_front();
+                ankerl::nanobench::doNotOptimizeAway(i);
             });
     }
 

diff --git a/src/tests/core/report.md b/src/tests/core/report.md
@@ -0,0 +1,5 @@
+|               ns/op |                op/s |    err% |          ins/op |          cyc/op |    IPC |         bra/op |   miss% |     total | benchmark
+|--------------------:|--------------------:|--------:|----------------:|----------------:|-------:|---------------:|--------:|----------:|:----------
+|               22.56 |       44,317,419.84 |    0.2% |          242.02 |           47.33 |  5.113 |          49.01 |    0.0% |      0.27 | `sqk::scheduler benchmark`
+|                2.89 |      346,185,355.98 |    0.0% |            2.00 |            6.06 |  0.330 |           0.00 |   47.8% |      0.03 | `function benchmark`
+|           32,159.95 |           31,094.57 |    0.6% |        2,031.68 |        2,782.61 |  0.730 |         474.31 |    1.0% |      0.39 | `thread benchmark`
diff --git a/src/tests/core/report.txt b/src/tests/core/report.txt