44
44
import org .opensearch .common .Randomness ;
45
45
import org .opensearch .common .settings .Settings ;
46
46
import org .opensearch .repositories .RepositoriesService ;
47
+ import org .opensearch .repositories .Repository ;
48
+ import org .opensearch .repositories .RepositoryMissingException ;
47
49
import org .opensearch .repositories .fs .ReloadableFsRepository ;
48
50
import org .opensearch .test .OpenSearchIntegTestCase ;
49
51
import org .opensearch .test .disruption .NetworkDisruption ;
57
59
import java .util .Arrays ;
58
60
import java .util .HashSet ;
59
61
import java .util .List ;
62
+ import java .util .Objects ;
60
63
import java .util .Set ;
61
64
import java .util .concurrent .CountDownLatch ;
62
65
import java .util .stream .Collectors ;
@@ -261,7 +264,9 @@ public void testNodeNotReachableFromClusterManager() throws Exception {
261
264
}
262
265
263
266
/**
264
- * Test Repositories Configured Node Join Commit failures.
267
+ * Tests the scenario wherein a cluster-state containing new repository meta-data as part of a node-join from a
268
+ * repository-configured node fails on a commit stage and has a master switch. This would lead to master nodes
269
+ * doing another round of node-joins with the new cluster-state as the previous attempt had a successful publish.
265
270
*/
266
271
public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitFails () throws Exception {
267
272
final String remoteStateRepoName = "remote-state-repo" ;
@@ -288,20 +293,32 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF
288
293
clusterManagerNode
289
294
);
290
295
logger .info ("Blocking Cluster Manager Commit Request on all nodes" );
296
+ // This is to allow the new node to have commit failures on the nodes in the send path itself. This will lead to the
297
+ // nodes having a successful publish operation but a failed commit operation. This will come into play once the new node joins
291
298
nonClusterManagerNodes .forEach (node -> {
292
299
TransportService targetTransportService = internalCluster ().getInstance (TransportService .class , node );
293
- clusterManagerTransportService .addOpenSearchFailureException (
294
- targetTransportService ,
295
- new FailedToCommitClusterStateException ("Blocking Commit" ),
296
- PublicationTransportHandler .COMMIT_STATE_ACTION_NAME
297
- );
300
+ clusterManagerTransportService .addSendBehavior (targetTransportService , (connection , requestId , action , request , options ) -> {
301
+ if (action .equals (PublicationTransportHandler .COMMIT_STATE_ACTION_NAME )) {
302
+ logger .info ("--> preventing {} request" , PublicationTransportHandler .COMMIT_STATE_ACTION_NAME );
303
+ throw new FailedToCommitClusterStateException ("Blocking Commit" );
304
+ }
305
+ connection .sendRequest (requestId , action , request , options );
306
+ });
298
307
});
299
308
300
309
logger .info ("Starting Node with remote publication settings" );
310
+ // Start a node with remote-publication repositories configured. This will lead to the active cluster-manager creating
311
+ // a new cluster-state event with the new node-join along with new repositories setup in the cluster meta-data.
301
312
internalCluster ().startDataOnlyNodes (1 , remotePublicationSettings , Boolean .TRUE );
302
313
303
314
logger .info ("Stopping current Cluster Manager" );
315
+ // We stop the current cluster-manager whose outbound paths were blocked. This is to force a new election onto nodes
316
+ // where the new cluster-state was published but not committed.
304
317
internalCluster ().stopCurrentClusterManagerNode ();
318
+
319
+ // We expect that the repository validations are skipped in this case and node-joins succeed as expected. The
320
+ // repository validations are skipped because even though the cluster-state is updated in the persisted registry,
321
+ // the repository service will not be updated as the commit attempt failed.
305
322
ensureStableCluster (6 );
306
323
307
324
String randomNode = nonClusterManagerNodes .get (Randomness .get ().nextInt (nonClusterManagerNodes .size ()));
@@ -330,11 +347,22 @@ public void testElectClusterManagerRemotePublicationConfigurationNodeJoinCommitF
330
347
331
348
RepositoriesService repositoriesService = internalCluster ().getInstance (RepositoriesService .class , randomNode );
332
349
333
- if (repositoriesService .isRepositoryPresent (remoteStateRepoName )) {
334
- isRemoteStateRepoConfigured = Boolean .TRUE ;
350
+ try {
351
+ Repository remoteStateRepo = repositoriesService .repository (remoteStateRepoName );
352
+ if (Objects .nonNull (remoteStateRepo )) {
353
+ isRemoteStateRepoConfigured = Boolean .TRUE ;
354
+ }
355
+ } catch (RepositoryMissingException e ) {
356
+ isRemoteStateRepoConfigured = Boolean .FALSE ;
335
357
}
336
- if (repositoriesService .isRepositoryPresent (remoteRoutingTableRepoName )) {
337
- isRemoteRoutingTableRepoConfigured = Boolean .TRUE ;
358
+
359
+ try {
360
+ Repository routingTableRepo = repositoriesService .repository (remoteRoutingTableRepoName );
361
+ if (Objects .nonNull (routingTableRepo )) {
362
+ isRemoteRoutingTableRepoConfigured = Boolean .TRUE ;
363
+ }
364
+ } catch (RepositoryMissingException e ) {
365
+ isRemoteRoutingTableRepoConfigured = Boolean .FALSE ;
338
366
}
339
367
340
368
Assert .assertTrue ("RemoteState Repo is not set in RepositoryService" , isRemoteStateRepoConfigured );
0 commit comments