Commit 6f26b29

Spinlock versus mutex queue lock tests
1 parent 24a8084

5 files changed: +67 -11 lines changed

README.md (+1 -1)
@@ -268,7 +268,7 @@ All measurement and calibration code instrumentation is non blocking and the tri
 There are no heap allocations during runtime, except for lazy registrations and A2L generation.

 build.rs automatically builds a minimal static C library from individually preconfigured core XCPlite sources.
-On C level, there is a synchronisation mutex or spinlock for the mpsc transmit queue.
+On C level, there is a synchronisation mutex for the mpsc transmit queue.
 The C code has the option to start the server with 2 normal threads for rx and tx socket handling.

 The generated A2L file is finalized on XCP connect and provided for upload via XCP.
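As a reading aid, here is a minimal sketch of the pattern the README describes: a multi-producer, single-consumer (mpsc) queue whose producer side is serialized by a mutex while the single consumer advances an atomic tail. tMpscQueue, mpsc_acquire and the sizes are illustrative stand-ins, not xcplib's actual types.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

#define SLOTS 1024              /* illustrative queue depth */
#define SLOT_SIZE 256           /* illustrative max packet size */

typedef struct {
    uint8_t data[SLOTS][SLOT_SIZE];
    uint32_t head;              /* written by producers, only under the mutex */
    atomic_uint tail;           /* advanced lock-free by the single consumer */
    pthread_mutex_t mutex;      /* the producer lock discussed above */
} tMpscQueue;

/* Producer: reserve one slot, or return NULL on overrun (queue full). */
static uint8_t* mpsc_acquire(tMpscQueue* q) {
    uint8_t* slot = NULL;
    pthread_mutex_lock(&q->mutex);
    if (q->head - atomic_load(&q->tail) < SLOTS)
        slot = q->data[q->head++ % SLOTS];
    pthread_mutex_unlock(&q->mutex);
    return slot;
}

/* Consumer: release the oldest slot after it has been transmitted. */
static void mpsc_release(tMpscQueue* q) {
    atomic_fetch_add(&q->tail, 1);
}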

tests/test_multi_thread.rs (+1 -1)
@@ -2,7 +2,7 @@
 // Integration test for XCP in a multi threaded application
 // Uses the test XCP client in xcp_client

-// cargo test --features=json --features=auto_reg -- --test-threads=1 --nocapture --test test_multi_thread
+// cargo test --features=json --features=auto_reg --features=a2l_reader -- --test-threads=1 --nocapture --test test_multi_thread

 #![allow(unused_assignments)]

tests/test_single_thread.rs (+1 -2)
@@ -2,8 +2,7 @@
 // Integration test for XCP in a single thread application
 // Uses the test XCP client in module xcp_client

-// cargo test --features=json --features=auto_reg -- --test-threads=1 --nocapture --test test_single_thread
-
+// cargo test --features=json --features=auto_reg --features=a2l_reader -- --test-threads=1 --nocapture --test test_single_thread
 use xcp::*;
 use xcp_type_description::prelude::*;

xcplib/src/platform.h (+3 -2)
@@ -81,9 +81,10 @@ typedef HANDLE tXcpThread;
 #elif defined(_LINUX) // Linux

 typedef pthread_t tXcpThread;
-#define create_thread(h,t) pthread_create(h, NULL, t, NULL);
-#define join_thread(h) pthread_join(h,NULL);
+#define create_thread(h,t) pthread_create(h, NULL, t, NULL)
+#define join_thread(h) pthread_join(h,NULL)
 #define cancel_thread(h) { pthread_detach(h); pthread_cancel(h); }
+#define yield_thread() sched_yield()

 #endif
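The dropped trailing semicolons are a macro hygiene fix worth spelling out: with the semicolon baked into the macro, a call site that adds its own ";" expands to two statements, which breaks an if/else without braces. A hypothetical illustration (example, worker and worker_running are made up for this sketch):

#include <pthread.h>
#include <sched.h>

#define join_thread_old(h) pthread_join(h, NULL);   /* old: semicolon baked in */
#define join_thread_new(h) pthread_join(h, NULL)    /* new: caller supplies it */

void example(pthread_t worker, int worker_running) {
    if (worker_running)
        join_thread_new(worker);   /* fine: expands to one statement */
    else
        sched_yield();             /* e.g. via the new yield_thread() macro */
    /* With join_thread_old the same if/else does not compile: the call
       expands to "pthread_join(worker, NULL);;" and the empty statement
       after the if body orphans the else. */
}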

xcplib/src/xcpTlQueue.c (+61 -5)
@@ -16,21 +16,57 @@
 #include "dbg_print.h"
 #include "xcpLite.h"

+// Experimental
+// Use a spinlock instead of a mutex for the producer lock
+// This naive approach is usually not faster than a mutex and can produce higher latencies and a hard to predict impact on other threads
+// It might be a better solution for non-preemptive tasks
+//#define USE_SPINLOCK
+//#define USE_YIELD
+//#define TEST_LOCK_TIMING
+
+/*
+Test results from test_multi_thread with 32 tasks and 200us sleep time:
+maxLock and avgLock time in ns
+
+SPINLOCK+YIELD
+lockCount=501170, maxLock=296000, avgLock=768
+lockCount=501019, maxLock=195000, avgLock=744
+lockCount=500966, maxLock=210000, avgLock=724
+
+SPINLOCK without cache friendly lock check
+lockCount=492952, maxLock=10115000, avgLock=1541
+
+SPINLOCK
+lockCount=497254, maxLock=9935000, avgLock=512
+lockCount=494866, maxLock=11935000, avgLock=1322
+lockCount=490923, maxLock=10019000, avgLock=2073
+lockCount=489831, maxLock=10024000, avgLock=1980
+
+MUTEX
+lockCount=499798, maxLock=114000, avgLock=840
+lockCount=500202, maxLock=135000, avgLock=806
+lockCount=499972, maxLock=130000, avgLock=790
+lockCount=500703, maxLock=124000, avgLock=755
+lockCount=500773, maxLock=126000, avgLock=669
+*/
+
+#ifdef TEST_LOCK_TIMING
+static uint64_t lockTimeMax = 0;
+static uint64_t lockTimeSum = 0;
+static uint64_t lockCount = 0;
+#endif

 #ifndef _WIN

 #include <stdatomic.h>

-// Use spinlock instead of mutex for producer lock
-#define USE_SPINLOCK
-
 #else

 #ifdef _WIN32_
 #error "Windows32 not implemented yet"
 #else

-
+#undef USE_SPINLOCK
 #define atomic_uint_fast64_t uint64_t
 #define atomic_store(a,b) (*a)=(b)
 #define atomic_load(a) (*a)
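The "cache friendly lock check" named in the test results is the test-and-test-and-set pattern implemented in the last hunk below: producers spin on a cheap relaxed load, which keeps the lock's cache line in a shared state, and only attempt the exclusive test-and-set once the lock looks free. The commit peeks at atomic_flag's internal _Value because C11 atomic_flag has no portable load; the sketch below uses an atomic_bool to show the same pattern portably (spin_lock/spin_unlock are illustrative names, not the commit's code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <sched.h>

static atomic_bool lock_flag = false;

static void spin_lock(void) {
    for (uint32_t n = 1;; n++) {
        /* Cheap relaxed read first: spins read-only on a shared cache line
           instead of hammering it with read-modify-write cycles. */
        if (!atomic_load_explicit(&lock_flag, memory_order_relaxed) &&
            !atomic_exchange_explicit(&lock_flag, true, memory_order_acquire))
            break;  /* acquired */
        /* Optional back-off, corresponding to USE_YIELD in the commit. */
        if (n % 16 == 0) sched_yield();
    }
}

static void spin_unlock(void) {
    atomic_store_explicit(&lock_flag, false, memory_order_release);
}

The numbers above show why this matters: the 10 ms-class maxLock values in the plain SPINLOCK rows, versus roughly 130 us for the MUTEX, are the signature of a preempted lock holder while other producers keep spinning; USE_YIELD mitigates this by handing the CPU back to the scheduler.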
@@ -100,6 +136,10 @@ void XcpTlFreeTransmitQueue() {
 #ifndef USE_SPINLOCK
     mutexDestroy(&gXcpTlQueue.mutex);
 #endif
+
+#ifdef TEST_LOCK_TIMING
+    DBG_PRINTF3("XcpTlFreeTransmitQueue: overruns=%u, lockCount=%llu, maxLock=%llu, avgLock=%llu\n", gXcpTlQueue.overruns, lockCount, lockTimeMax, lockTimeSum/lockCount);
+#endif
 }

@@ -127,11 +167,27 @@ uint8_t* XcpTlGetTransmitBuffer(void** handle, uint16_t packet_len) {
     DBG_PRINTF5("XcpTlGetTransmitBuffer: len=%d\n", packet_len);

     // Producer lock
+#ifdef TEST_LOCK_TIMING
+    uint64_t c = clockGet();
+#endif
 #ifdef USE_SPINLOCK
-    while (atomic_flag_test_and_set_explicit(&lock, memory_order_acquire));
+    for (uint32_t n = 1;1;n++) {
+        BOOL locked = atomic_load_explicit(&lock._Value, memory_order_relaxed);
+        if (!locked && !atomic_flag_test_and_set_explicit(&lock, memory_order_acquire)) break;
+        //if ( !atomic_flag_test_and_set_explicit(&lock, memory_order_acquire)) break;
+#ifdef USE_YIELD
+        if (n%16==0) yield_thread();
+#endif
+    }
 #else
     mutexLock(&gXcpTlQueue.mutex);
 #endif
+#ifdef TEST_LOCK_TIMING
+    uint64_t d = clockGet() - c;
+    if (d>lockTimeMax) lockTimeMax = d;
+    lockTimeSum += d;
+    lockCount++;
+#endif

     uint64_t head = atomic_load(&gXcpTlQueue.head);
     uint64_t tail = atomic_load_explicit(&gXcpTlQueue.tail,memory_order_relaxed);
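The TEST_LOCK_TIMING instrumentation is a plain max/sum/count accumulator around lock acquisition; the printed avgLock is lockTimeSum/lockCount. A self-contained sketch of the same measurement pattern, using clock_gettime() as a stand-in for xcplib's clockGet() (clock_ns, measure_locked and report are hypothetical names for this sketch):

#include <stdint.h>
#include <stdio.h>
#include <pthread.h>
#include <time.h>

static uint64_t lockTimeMax = 0, lockTimeSum = 0, lockCount = 0;

/* Stand-in for clockGet(): monotonic time in nanoseconds. */
static uint64_t clock_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Time one lock acquisition and update the accumulators. As in the
   commit, the updates run while the lock is held, so the counters are
   protected by the very lock being measured. */
static void measure_locked(pthread_mutex_t* m) {
    uint64_t c = clock_ns();
    pthread_mutex_lock(m);
    uint64_t d = clock_ns() - c;
    if (d > lockTimeMax) lockTimeMax = d;
    lockTimeSum += d;
    lockCount++;
}

/* At shutdown, mirroring the printout in XcpTlFreeTransmitQueue: */
static void report(void) {
    if (lockCount > 0)
        printf("lockCount=%llu, maxLock=%llu, avgLock=%llu\n",
               (unsigned long long)lockCount,
               (unsigned long long)lockTimeMax,
               (unsigned long long)(lockTimeSum / lockCount));
}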
