Skip to content

Commit 8ad8180

Browse files
Fix timer cancellation to be reliable in LayerImplSelect.
To minimize risk, the changes here keep the "grab all the timers we should fire, then fire them" setup instead of switching to the "fire the timers one at a time" approach LayerImplFreeRTOS uses. The fix consists of the following parts: 1) Store the list of timers to fire in a member, so we can cancel things from that list as needed. 2) To avoid canceling things scheduled by ScheduleWork, remove the CancelTimer call in ScheduleWork. This does mean we now allow multiple timers for the same callback+appState in the timer list, if they were created by ScheduleWork, but that should be OK, since the only reason that pair needs to be unique is to allow cancellation and we never want to cancel the things ScheduleWork schedules. As a followup we should stop using the timer list for ScheduleWork altogether and use ScheduleLambda like LayerImplFreeRTOS does, but that involves fixing the unit tests that call ScheduleWork without actually running the platfor manager event loop and expect it to work somehow. TestRead was failing the sanity assert for not losing track of timers to fire, because it was spinning a nested event loop. The changes to that test stop it from doing that. Fixes project-chip#19387 Fixes project-chip#22160
1 parent 8d3f0cc commit 8ad8180

8 files changed

+183
-34
lines changed

src/controller/tests/data_model/TestRead.cpp

+33-15
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,11 @@ class TestReadInteraction : public app::ReadHandler::ApplicationCallback
298298
// succeed.
299299
static void MultipleReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount);
300300

301+
// Helper for MultipleReadHelper that does not spin the event loop, so we
302+
// don't end up with nested event loops.
303+
static void MultipleReadHelperGuts(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount, uint32_t & aNumSuccessCalls,
304+
uint32_t & aNumFailureCalls);
305+
301306
// Establish the given number of subscriptions, then issue the given number
302307
// of reads in parallel and wait for them all to succeed.
303308
static void SubscribeThenReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aSubscribeCount, size_t aReadCount);
@@ -2484,6 +2489,9 @@ void TestReadInteraction::SubscribeThenReadHelper(nlTestSuite * apSuite, TestCon
24842489
uint32_t numSuccessCalls = 0;
24852490
uint32_t numSubscriptionEstablishedCalls = 0;
24862491

2492+
uint32_t numReadSuccessCalls = 0;
2493+
uint32_t numReadFailureCalls = 0;
2494+
24872495
responseDirective = kSendDataResponse;
24882496

24892497
// Passing of stack variables by reference is only safe because of synchronous completion of the interaction. Otherwise, it's
@@ -2501,12 +2509,12 @@ void TestReadInteraction::SubscribeThenReadHelper(nlTestSuite * apSuite, TestCon
25012509
NL_TEST_ASSERT(apSuite, false);
25022510
};
25032511

2504-
auto onSubscriptionEstablishedCb = [&numSubscriptionEstablishedCalls, &apSuite, &aCtx, aSubscribeCount,
2505-
aReadCount](const app::ReadClient & readClient) {
2512+
auto onSubscriptionEstablishedCb = [&numSubscriptionEstablishedCalls, &apSuite, &aCtx, aSubscribeCount, aReadCount,
2513+
&numReadSuccessCalls, &numReadFailureCalls](const app::ReadClient & readClient) {
25062514
numSubscriptionEstablishedCalls++;
25072515
if (numSubscriptionEstablishedCalls == aSubscribeCount)
25082516
{
2509-
MultipleReadHelper(apSuite, aCtx, aReadCount);
2517+
MultipleReadHelperGuts(apSuite, aCtx, aReadCount, numReadSuccessCalls, numReadFailureCalls);
25102518
}
25112519
};
25122520

@@ -2522,40 +2530,50 @@ void TestReadInteraction::SubscribeThenReadHelper(nlTestSuite * apSuite, TestCon
25222530

25232531
NL_TEST_ASSERT(apSuite, numSuccessCalls == aSubscribeCount);
25242532
NL_TEST_ASSERT(apSuite, numSubscriptionEstablishedCalls == aSubscribeCount);
2533+
NL_TEST_ASSERT(apSuite, numReadSuccessCalls == aReadCount);
2534+
NL_TEST_ASSERT(apSuite, numReadFailureCalls == 0);
25252535
}
25262536

2527-
void TestReadInteraction::MultipleReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount)
2537+
// The guts of MultipleReadHelper which take references to the success/failure
2538+
// counts to modify and assume the consumer will be spinning the event loop.
2539+
void TestReadInteraction::MultipleReadHelperGuts(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount,
2540+
uint32_t & aNumSuccessCalls, uint32_t & aNumFailureCalls)
25282541
{
2529-
auto sessionHandle = aCtx.GetSessionBobToAlice();
2530-
uint32_t numSuccessCalls = 0;
2531-
uint32_t numFailureCalls = 0;
2542+
NL_TEST_ASSERT(apSuite, aNumSuccessCalls == 0);
2543+
NL_TEST_ASSERT(apSuite, aNumFailureCalls == 0);
2544+
2545+
auto sessionHandle = aCtx.GetSessionBobToAlice();
25322546

25332547
responseDirective = kSendDataResponse;
25342548

25352549
uint16_t firstExpectedResponse = totalReadCount + 1;
25362550

2537-
// Passing of stack variables by reference is only safe because of synchronous completion of the interaction. Otherwise, it's
2538-
// not safe to do so.
2539-
auto onFailureCb = [&apSuite, &numFailureCalls](const app::ConcreteDataAttributePath * attributePath, CHIP_ERROR aError) {
2540-
numFailureCalls++;
2551+
auto onFailureCb = [apSuite, &aNumFailureCalls](const app::ConcreteDataAttributePath * attributePath, CHIP_ERROR aError) {
2552+
aNumFailureCalls++;
25412553

25422554
NL_TEST_ASSERT(apSuite, attributePath == nullptr);
25432555
};
25442556

25452557
for (size_t i = 0; i < aReadCount; ++i)
25462558
{
2547-
// Passing of stack variables by reference is only safe because of synchronous completion of the interaction. Otherwise,
2548-
// it's not safe to do so.
2549-
auto onSuccessCb = [&numSuccessCalls, &apSuite, firstExpectedResponse,
2559+
auto onSuccessCb = [&aNumSuccessCalls, apSuite, firstExpectedResponse,
25502560
i](const app::ConcreteDataAttributePath & attributePath, const auto & dataResponse) {
25512561
NL_TEST_ASSERT(apSuite, dataResponse == firstExpectedResponse + i);
2552-
numSuccessCalls++;
2562+
aNumSuccessCalls++;
25532563
};
25542564

25552565
NL_TEST_ASSERT(apSuite,
25562566
Controller::ReadAttribute<TestCluster::Attributes::Int16u::TypeInfo>(
25572567
&aCtx.GetExchangeManager(), sessionHandle, kTestEndpointId, onSuccessCb, onFailureCb) == CHIP_NO_ERROR);
25582568
}
2569+
}
2570+
2571+
void TestReadInteraction::MultipleReadHelper(nlTestSuite * apSuite, TestContext & aCtx, size_t aReadCount)
2572+
{
2573+
uint32_t numSuccessCalls = 0;
2574+
uint32_t numFailureCalls = 0;
2575+
2576+
MultipleReadHelperGuts(apSuite, aCtx, aReadCount, numSuccessCalls, numFailureCalls);
25592577

25602578
aCtx.DrainAndServiceIO();
25612579

src/system/SystemLayerImplFreeRTOS.cpp

+23-1
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,32 @@ CHIP_ERROR LayerImplFreeRTOS::ScheduleWork(TimerCompleteCallback onComplete, voi
8888
{
8989
VerifyOrReturnError(mLayerState.IsInitialized(), CHIP_ERROR_INCORRECT_STATE);
9090

91+
// Ideally we would not use a timer here at all, but if we try to just
92+
// ScheduleLambda the lambda needs to capture the following:
93+
// 1) onComplete
94+
// 2) appState
95+
// 3) The `this` pointer, because onComplete needs to be passed a pointer to
96+
// the System::Layer.
97+
//
98+
// On a 64-bit system that's 24 bytes, but lambdas passed to ScheduleLambda
99+
// are capped at CHIP_CONFIG_LAMBDA_EVENT_SIZE which is 16 bytes.
100+
//
101+
// So for now use a timer as a poor-man's closure that captures `this` and
102+
// onComplete and appState in a single pointer, so we fit inside the size
103+
// limit.
104+
//
105+
// TODO: We could do something here where we compile-time condition on the
106+
// sizes of things and use a direct ScheduleLambda if it would fit and this
107+
// setup otherwise.
91108
TimerList::Node * timer = mTimerPool.Create(*this, SystemClock().GetMonotonicTimestamp(), onComplete, appState);
92109
VerifyOrReturnError(timer != nullptr, CHIP_ERROR_NO_MEMORY);
93110

94-
return ScheduleLambda([this, timer] { this->mTimerPool.Invoke(timer); });
111+
CHIP_ERROR err = ScheduleLambda([this, timer] { this->mTimerPool.Invoke(timer); });
112+
if (err != CHIP_NO_ERROR)
113+
{
114+
mTimerPool.Release(timer);
115+
}
116+
return err;
95117
}
96118

97119
/**

src/system/SystemLayerImplSelect.cpp

+32-4
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,10 @@ void LayerImplSelect::CancelTimer(TimerCompleteCallback onComplete, void * appSt
170170
VerifyOrReturn(mLayerState.IsInitialized());
171171

172172
TimerList::Node * timer = mTimerList.Remove(onComplete, appState);
173+
if (timer == nullptr)
174+
{
175+
timer = mExpiredTimers.Remove(onComplete, appState);
176+
}
173177
VerifyOrReturn(timer != nullptr);
174178

175179
#if CHIP_SYSTEM_CONFIG_USE_DISPATCH
@@ -199,8 +203,31 @@ CHIP_ERROR LayerImplSelect::ScheduleWork(TimerCompleteCallback onComplete, void
199203
}
200204
#endif // CHIP_SYSTEM_CONFIG_USE_DISPATCH
201205

202-
CancelTimer(onComplete, appState);
203-
206+
// Ideally we would not use a timer here at all, but if we try to just
207+
// ScheduleLambda the lambda needs to capture the following:
208+
// 1) onComplete
209+
// 2) appState
210+
// 3) The `this` pointer, because onComplete needs to be passed a pointer to
211+
// the System::Layer.
212+
//
213+
// On a 64-bit system that's 24 bytes, but lambdas passed to ScheduleLambda
214+
// are capped at CHIP_CONFIG_LAMBDA_EVENT_SIZE which is 16 bytes.
215+
//
216+
// So for now use a timer as a poor-man's closure that captures `this` and
217+
// onComplete and appState in a single pointer, so we fit inside the size
218+
// limit.
219+
//
220+
// TODO: We could do something here where we compile-time condition on the
221+
// sizes of things and use a direct ScheduleLambda if it would fit and this
222+
// setup otherwise.
223+
//
224+
// TODO: But also, unit tests seem to do SystemLayer::ScheduleWork without
225+
// actually running a useful event loop (in the PlatformManager sense),
226+
// which breaks if we use ScheduleLambda here, since that does rely on the
227+
// PlatformManager event loop. So for now, keep scheduling an expires-ASAP
228+
// timer, but just make sure we don't cancel existing timers with the same
229+
// callback and appState, so ScheduleWork invocations don't stomp on each
230+
// other.
204231
TimerList::Node * timer = mTimerPool.Create(*this, SystemClock().GetMonotonicTimestamp(), onComplete, appState);
205232
VerifyOrReturnError(timer != nullptr, CHIP_ERROR_NO_MEMORY);
206233

@@ -469,9 +496,10 @@ void LayerImplSelect::HandleEvents()
469496

470497
// Obtain the list of currently expired timers. Any new timers added by timer callback are NOT handled on this pass,
471498
// since that could result in infinite handling of new timers blocking any other progress.
472-
TimerList expiredTimers = mTimerList.ExtractEarlier(Clock::Timeout(1) + SystemClock().GetMonotonicTimestamp());
499+
VerifyOrDieWithMsg(mExpiredTimers.Empty(), DeviceLayer, "Re-entry into HandleEvents from a timer callback?");
500+
mExpiredTimers = mTimerList.ExtractEarlier(Clock::Timeout(1) + SystemClock().GetMonotonicTimestamp());
473501
TimerList::Node * timer = nullptr;
474-
while ((timer = expiredTimers.PopEarliest()) != nullptr)
502+
while ((timer = mExpiredTimers.PopEarliest()) != nullptr)
475503
{
476504
mTimerPool.Invoke(timer);
477505
}

src/system/SystemLayerImplSelect.h

+3
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ class LayerImplSelect : public LayerSocketsLoop
101101

102102
TimerPool<TimerList::Node> mTimerPool;
103103
TimerList mTimerList;
104+
// List of expired times being processed right now. Stored in a member so
105+
// we can cancel them.
106+
TimerList mExpiredTimers;
104107
timeval mNextTimeout;
105108

106109
// Members for select loop

src/system/tests/BUILD.gn

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ chip_test_suite("tests") {
2626
"TestSystemErrorStr.cpp",
2727
"TestSystemPacketBuffer.cpp",
2828
"TestSystemScheduleLambda.cpp",
29+
"TestSystemScheduleWork.cpp",
2930
"TestSystemTimer.cpp",
3031
"TestSystemWakeEvent.cpp",
3132
"TestTimeSource.cpp",

src/system/tests/TestSystemClock.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ static const nlTest sTests[] =
118118
int TestSystemClock(void)
119119
{
120120
nlTestSuite theSuite = {
121-
"chip-timesource", &sTests[0], nullptr /* setup */, nullptr /* teardown */
121+
"chip-systemclock", &sTests[0], nullptr /* setup */, nullptr /* teardown */
122122
};
123123

124124
// Run test suit againt one context.

src/system/tests/TestSystemScheduleWork.cpp

+26-13
Original file line numberDiff line numberDiff line change
@@ -22,18 +22,25 @@
2222
#include <nlunit-test.h>
2323
#include <platform/CHIPDeviceLayer.h>
2424

25-
// Test input data.
25+
static void IncrementIntCounter(chip::System::Layer *, void * state)
26+
{
27+
++(*static_cast<int *>(state));
28+
}
2629

27-
static void CheckScheduleLambda(nlTestSuite * inSuite, void * aContext)
30+
static void StopEventLoop(chip::System::Layer *, void *)
2831
{
29-
bool * called = new bool(false);
30-
chip::DeviceLayer::SystemLayer().ScheduleLambda([called] {
31-
*called = true;
32-
chip::DeviceLayer::PlatformMgr().StopEventLoopTask();
33-
});
32+
chip::DeviceLayer::PlatformMgr().StopEventLoopTask();
33+
}
34+
35+
static void CheckScheduleWorkTwice(nlTestSuite * inSuite, void * aContext)
36+
{
37+
int * callCount = new int(0);
38+
NL_TEST_ASSERT(inSuite, chip::DeviceLayer::SystemLayer().ScheduleWork(IncrementIntCounter, callCount) == CHIP_NO_ERROR);
39+
NL_TEST_ASSERT(inSuite, chip::DeviceLayer::SystemLayer().ScheduleWork(IncrementIntCounter, callCount) == CHIP_NO_ERROR);
40+
NL_TEST_ASSERT(inSuite, chip::DeviceLayer::SystemLayer().ScheduleWork(StopEventLoop, nullptr) == CHIP_NO_ERROR);
3441
chip::DeviceLayer::PlatformMgr().RunEventLoop();
35-
NL_TEST_ASSERT(inSuite, *called);
36-
delete called;
42+
NL_TEST_ASSERT(inSuite, *callCount == 2);
43+
delete callCount;
3744
}
3845

3946
// Test Suite
@@ -44,7 +51,7 @@ static void CheckScheduleLambda(nlTestSuite * inSuite, void * aContext)
4451
// clang-format off
4552
static const nlTest sTests[] =
4653
{
47-
NL_TEST_DEF("System::TestScheduleLambda", CheckScheduleLambda),
54+
NL_TEST_DEF("System::TestScheduleWorkTwice", CheckScheduleWorkTwice),
4855
NL_TEST_SENTINEL()
4956
};
5057
// clang-format on
@@ -55,7 +62,7 @@ static int TestTeardown(void * aContext);
5562
// clang-format off
5663
static nlTestSuite kTheSuite =
5764
{
58-
"chip-system-schedule-lambda",
65+
"chip-system-schedule-work",
5966
&sTests[0],
6067
TestSetup,
6168
TestTeardown
@@ -67,6 +74,11 @@ static nlTestSuite kTheSuite =
6774
*/
6875
static int TestSetup(void * aContext)
6976
{
77+
if (chip::Platform::MemoryInit() != CHIP_NO_ERROR)
78+
{
79+
return FAILURE;
80+
}
81+
7082
if (chip::DeviceLayer::PlatformMgr().InitChipStack() != CHIP_NO_ERROR)
7183
return FAILURE;
7284

@@ -80,15 +92,16 @@ static int TestSetup(void * aContext)
8092
static int TestTeardown(void * aContext)
8193
{
8294
chip::DeviceLayer::PlatformMgr().Shutdown();
95+
chip::Platform::MemoryShutdown();
8396
return (SUCCESS);
8497
}
8598

86-
int TestSystemScheduleLambda(void)
99+
int TestSystemScheduleWork(void)
87100
{
88101
// Run test suit againt one lContext.
89102
nlTestRunner(&kTheSuite, nullptr);
90103

91104
return nlTestRunnerStats(&kTheSuite);
92105
}
93106

94-
CHIP_REGISTER_TEST_SUITE(TestSystemScheduleLambda)
107+
CHIP_REGISTER_TEST_SUITE(TestSystemScheduleWork)

src/system/tests/TestSystemTimer.cpp

+64
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,69 @@ void CheckOrder(nlTestSuite * inSuite, void * aContext)
257257
Clock::Internal::SetSystemClockForTesting(savedClock);
258258
}
259259

260+
void CheckCancellation(nlTestSuite * inSuite, void * aContext)
261+
{
262+
if (!LayerEvents<LayerImpl>::HasServiceEvents())
263+
return;
264+
265+
TestContext & testContext = *static_cast<TestContext *>(aContext);
266+
Layer & systemLayer = *testContext.mLayer;
267+
nlTestSuite * const suite = testContext.mTestSuite;
268+
269+
struct TestState
270+
{
271+
TestState(Layer & systemLayer) : systemLayer(systemLayer) {}
272+
273+
void Record(char c)
274+
{
275+
size_t n = strlen(record);
276+
if (n + 1 < sizeof(record))
277+
{
278+
record[n++] = c;
279+
record[n] = 0;
280+
}
281+
}
282+
static void A(Layer * layer, void * state)
283+
{
284+
auto self = static_cast<TestState *>(state);
285+
self->Record('A');
286+
self->systemLayer.CancelTimer(B, state);
287+
self->systemLayer.CancelTimer(D, state);
288+
}
289+
static void B(Layer * layer, void * state) { static_cast<TestState *>(state)->Record('B'); }
290+
static void C(Layer * layer, void * state)
291+
{
292+
auto self = static_cast<TestState *>(state);
293+
self->Record('C');
294+
self->systemLayer.CancelTimer(E, state);
295+
}
296+
static void D(Layer * layer, void * state) { static_cast<TestState *>(state)->Record('D'); }
297+
static void E(Layer * layer, void * state) { static_cast<TestState *>(state)->Record('E'); }
298+
char record[6] = { 0 };
299+
300+
Layer & systemLayer;
301+
};
302+
TestState testState(systemLayer);
303+
NL_TEST_ASSERT(suite, testState.record[0] == 0);
304+
305+
Clock::ClockBase * const savedClock = &SystemClock();
306+
Clock::Internal::MockClock mockClock;
307+
Clock::Internal::SetSystemClockForTesting(&mockClock);
308+
309+
using namespace Clock::Literals;
310+
systemLayer.StartTimer(0_ms, TestState::A, &testState);
311+
systemLayer.StartTimer(0_ms, TestState::B, &testState);
312+
systemLayer.StartTimer(20_ms, TestState::C, &testState);
313+
systemLayer.StartTimer(30_ms, TestState::D, &testState);
314+
systemLayer.StartTimer(50_ms, TestState::E, &testState);
315+
316+
mockClock.AdvanceMonotonic(100_ms);
317+
LayerEvents<LayerImpl>::ServiceEvents(systemLayer);
318+
NL_TEST_ASSERT(suite, strcmp(testState.record, "AC") == 0);
319+
320+
Clock::Internal::SetSystemClockForTesting(savedClock);
321+
}
322+
260323
// Test the implementation helper classes TimerPool, TimerList, and TimerData.
261324
namespace chip {
262325
namespace System {
@@ -416,6 +479,7 @@ static const nlTest sTests[] =
416479
NL_TEST_DEF("Timer::TestOverflow", CheckOverflow),
417480
NL_TEST_DEF("Timer::TestTimerStarvation", CheckStarvation),
418481
NL_TEST_DEF("Timer::TestTimerOrder", CheckOrder),
482+
NL_TEST_DEF("Timer::TestTimerCancellation", CheckCancellation),
419483
NL_TEST_DEF("Timer::TestTimerPool", chip::System::TestTimer::CheckTimerPool),
420484
NL_TEST_SENTINEL()
421485
};

0 commit comments

Comments
 (0)