Skip to content
This repository was archived by the owner on Mar 4, 2024. It is now read-only.

Commit f727712

Browse files
committed
tick: Don't convert to candidate while a snapshot is installing
Signed-off-by: Cole Miller <cole.miller@canonical.com>
1 parent 3c69054 commit f727712

File tree

3 files changed

+50
-3
lines changed

3 files changed

+50
-3
lines changed

src/replication.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -1201,6 +1201,9 @@ static void installSnapshotCb(struct raft_io_snapshot_put *req, int status)
12011201
struct raft_append_entries_result result;
12021202
int rv;
12031203

1204+
/* We avoid converting to candidate state while installing a snapshot. */
1205+
assert(r->state == RAFT_FOLLOWER || r->state == RAFT_UNAVAILABLE);
1206+
12041207
r->snapshot.put.data = NULL;
12051208

12061209
result.term = r->current_term;
@@ -1246,9 +1249,6 @@ static void installSnapshotCb(struct raft_io_snapshot_put *req, int status)
12461249
raft_configuration_close(&snapshot->configuration);
12471250

12481251
respond:
1249-
/* TODO Investigate when and if a RAFT_FOLLOWER moves to RAFT_CANDIDATE
1250-
* during the installation of a snapshot.
1251-
* See https://github.com/canonical/raft/issues/343 */
12521252
if (r->state == RAFT_FOLLOWER) {
12531253
result.last_log_index = r->last_stored;
12541254
sendAppendEntriesResult(r, &result);

src/tick.c

+5
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ static int tickFollower(struct raft *r)
4141
* current leader or granting vote to candidate, convert to candidate.
4242
*/
4343
if (electionTimerExpired(r) && server->role == RAFT_VOTER) {
44+
if (r->snapshot.put.data != NULL) {
45+
tracef("installing snapshot -> don't convert to candidate");
46+
electionResetTimer(r);
47+
return 0;
48+
}
4449
tracef("convert to candidate and start new election");
4550
rv = convertToCandidate(r, false /* disrupt leader */);
4651
if (rv != 0) {

test/integration/test_snapshot.c

+42
Original file line numberDiff line numberDiff line change
@@ -717,3 +717,45 @@ TEST(snapshot, takeSnapshotFail, setUp, tearDown, 0, fsm_snapshot_async_params)
717717
/* No crash or leaks have occurred */
718718
return MUNIT_OK;
719719
}
720+
721+
/* A follower doesn't convert to candidate state while it's installing a snapshot. */
722+
TEST(snapshot, snapshotBlocksCandidate, setUp, tearDown, 0, NULL)
723+
{
724+
struct fixture *f = data;
725+
(void)params;
726+
727+
/* Set a very low snapshot threshold and number of trailing entries */
728+
SET_SNAPSHOT_THRESHOLD(3);
729+
SET_SNAPSHOT_TRAILING(1);
730+
SET_SNAPSHOT_TIMEOUT(200);
731+
732+
raft_set_election_timeout(CLUSTER_RAFT(2), 600);
733+
734+
/* Apply a few entries to force a snapshot to be taken. Drop all network
735+
* traffic between servers 0 and 2 in order for AppendEntries RPCs to not be
736+
* replicated */
737+
fprintf(stderr, "saturating\n");
738+
CLUSTER_SATURATE_BOTHWAYS(0, 2);
739+
CLUSTER_MAKE_PROGRESS;
740+
CLUSTER_MAKE_PROGRESS;
741+
CLUSTER_MAKE_PROGRESS;
742+
743+
/* Reconnect both servers and set a high disk latency on server 2 */
744+
CLUSTER_SET_DISK_LATENCY(2, 2000);
745+
CLUSTER_DESATURATE_BOTHWAYS(0, 2);
746+
fprintf(stderr, "desaturating\n");
747+
748+
/* Wait a while and check that the leader has sent a snapshot */
749+
CLUSTER_STEP_UNTIL_ELAPSED(500);
750+
munit_assert_int(CLUSTER_N_SEND(0, RAFT_IO_INSTALL_SNAPSHOT), ==, 1);
751+
munit_assert_int(CLUSTER_N_RECV(2, RAFT_IO_INSTALL_SNAPSHOT), ==, 1);
752+
753+
/* Disconnect the servers again so that heartbeats, etc. won't arrive */
754+
fprintf(stderr, "resaturating\n");
755+
CLUSTER_SATURATE_BOTHWAYS(0, 2);
756+
munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER);
757+
munit_assert_ptr(CLUSTER_RAFT(2)->snapshot.put.data, !=, NULL);
758+
CLUSTER_STEP_UNTIL_ELAPSED(1500);
759+
munit_assert_int(CLUSTER_STATE(2), ==, RAFT_FOLLOWER);
760+
return MUNIT_OK;
761+
}

0 commit comments

Comments
 (0)