Skip to content

Commit 43561e9

Browse files
committed
Awareness attribute decommission backports (opensearch-project#4970)
* Add DecommissionService and helper to execute awareness attribute decommissioning opensearch-project#4084 * Add APIs (GET/PUT) to decommission awareness attribute opensearch-project#4261 * Controlling discovery for decommissioned nodes opensearch-project#4590 * Fix decommission status update to non leader nodes opensearch-project#4800 * Remove redundant field from GetDecommissionStateResponse opensearch-project#4751 * Service Layer changes for Recommission API opensearch-project#4320 * Recommission api level support opensearch-project#4604 * Fix bug in AwarenessAttributeDecommissionIT opensearch-project#4822 Signed-off-by: Rishab Nahata <rnnahata@amazon.com>
1 parent 017f76b commit 43561e9

File tree

67 files changed

+4418
-11
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+4418
-11
lines changed

CHANGELOG.md

+8
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
2525
- Added missing no-jdk distributions ([#4722](https://github.com/opensearch-project/OpenSearch/pull/4722))
2626
- Copy `build.sh` over from opensearch-build ([#4887](https://github.com/opensearch-project/OpenSearch/pull/4887))
2727
- Update GeoGrid base class access modifier to support extensibility ([#4921](https://github.com/opensearch-project/OpenSearch/pull/4921))
28+
- Recommission API changes for service layer ([#4320](https://github.com/opensearch-project/OpenSearch/pull/4320))
29+
- Recommissioning of zone. REST layer support. ([#4624](https://github.com/opensearch-project/OpenSearch/pull/4604))
2830
- Build no-jdk distributions as part of release build ([#4902](https://github.com/opensearch-project/OpenSearch/pull/4902))
2931
- Use getParameterCount instead of getParameterTypes ([#4821](https://github.com/opensearch-project/OpenSearch/pull/4821))
3032
- Added in-flight cancellation of SearchShardTask based on resource consumption ([#4565](https://github.com/opensearch-project/OpenSearch/pull/4565))
@@ -64,6 +66,9 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
6466
- GET api for weighted shard routing([#4275](https://github.com/opensearch-project/OpenSearch/pull/4275/))
6567
- Delete api for weighted shard routing([#4400](https://github.com/opensearch-project/OpenSearch/pull/4400/))
6668
- Fix weighted routing metadata deserialization error on process restart ([#4691](https://github.com/opensearch-project/OpenSearch/pull/4691))
69+
- Add DecommissionService and helper to execute awareness attribute decommissioning ([#4084](https://github.com/opensearch-project/OpenSearch/pull/4084))
70+
- Add APIs (GET/PUT) to decommission awareness attribute ([#4261](https://github.com/opensearch-project/OpenSearch/pull/4261))
71+
- Controlling discovery for decommissioned nodes ([#4590](https://github.com/opensearch-project/OpenSearch/pull/4590))
6772

6873
### Deprecated
6974
### Removed
@@ -89,9 +94,12 @@ Inspired from [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
8994
- [Bug]: Fixed invalid location of JDK dependency for arm64 architecture([#4613](https://github.com/opensearch-project/OpenSearch/pull/4613))
9095
- [Bug]: Alias filter lost after rollover ([#4499](https://github.com/opensearch-project/OpenSearch/pull/4499))
9196
- Fixing Gradle warnings associated with publishPluginZipPublicationToXxx tasks ([#4696](https://github.com/opensearch-project/OpenSearch/pull/4696))
97+
- [BUG]: Remove redundant field from GetDecommissionStateResponse ([#4751](https://github.com/opensearch-project/OpenSearch/pull/4751))
9298
- Fixed randomly failing test ([4774](https://github.com/opensearch-project/OpenSearch/pull/4774))
9399
- Fix recovery path for searchable snapshots ([4813](https://github.com/opensearch-project/OpenSearch/pull/4813))
94100
- Fix a bug on handling an invalid array value for point type field #4900([#4900](https://github.com/opensearch-project/OpenSearch/pull/4900))
101+
- Fix decommission status update to non leader nodes ([4800](https://github.com/opensearch-project/OpenSearch/pull/4800))
102+
- Fix bug in AwarenessAttributeDecommissionIT([4822](https://github.com/opensearch-project/OpenSearch/pull/4822))
95103
- Fix for failing checkExtraction, checkLicense and checkNotice tasks for windows gradle check ([#4941](https://github.com/opensearch-project/OpenSearch/pull/4941))
96104
### Security
97105
- CVE-2022-25857 org.yaml:snakeyaml DOS vulnerability ([#4341](https://github.com/opensearch-project/OpenSearch/pull/4341))

client/rest-high-level/src/test/java/org/opensearch/client/RestHighLevelClientTests.java

+4-1
Original file line numberDiff line numberDiff line change
@@ -891,7 +891,10 @@ public void testApiNamingConventions() throws Exception {
891891
"remote_store.restore",
892892
"cluster.put_weighted_routing",
893893
"cluster.get_weighted_routing",
894-
"cluster.delete_weighted_routing", };
894+
"cluster.delete_weighted_routing",
895+
"cluster.put_decommission_awareness",
896+
"cluster.get_decommission_awareness",
897+
"cluster.delete_decommission_awareness", };
895898
List<String> booleanReturnMethods = Arrays.asList("security.enable_user", "security.disable_user", "security.change_password");
896899
Set<String> deprecatedMethods = new HashSet<>();
897900
deprecatedMethods.add("indices.force_merge");
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
{
2+
"cluster.delete_decommission_awareness": {
3+
"documentation": {
4+
"url": "https://opensearch.org/docs/latest/opensearch/rest-api/decommission/",
5+
"description": "Delete any existing decommission."
6+
},
7+
"stability": "experimental",
8+
"url": {
9+
"paths": [
10+
{
11+
"path": "/_cluster/decommission/awareness/",
12+
"methods": [
13+
"DELETE"
14+
]
15+
}
16+
]
17+
}
18+
}
19+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{
2+
"cluster.get_decommission_awareness": {
3+
"documentation": {
4+
"url": "https://opensearch.org/docs/latest/opensearch/rest-api/decommission/",
5+
"description": "Get details and status of decommissioned attribute"
6+
},
7+
"stability": "experimental",
8+
"url": {
9+
"paths": [
10+
{
11+
"path":"/_cluster/decommission/awareness/{awareness_attribute_name}/_status",
12+
"methods":[
13+
"GET"
14+
],
15+
"parts":{
16+
"awareness_attribute_name":{
17+
"type":"string",
18+
"description":"Awareness attribute name"
19+
}
20+
}
21+
}
22+
]
23+
}
24+
}
25+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"cluster.put_decommission_awareness": {
3+
"documentation": {
4+
"url": "https://opensearch.org/docs/latest/opensearch/rest-api/decommission/",
5+
"description": "Decommissions an awareness attribute"
6+
},
7+
"stability": "experimental",
8+
"url": {
9+
"paths": [
10+
{
11+
"path": "/_cluster/decommission/awareness/{awareness_attribute_name}/{awareness_attribute_value}",
12+
"methods": [
13+
"PUT"
14+
],
15+
"parts": {
16+
"awareness_attribute_name": {
17+
"type": "string",
18+
"description": "Awareness attribute name"
19+
},
20+
"awareness_attribute_value": {
21+
"type": "string",
22+
"description": "Awareness attribute value"
23+
}
24+
}
25+
}
26+
]
27+
}
28+
}
29+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package org.opensearch.cluster.coordination;
10+
11+
import org.apache.logging.log4j.LogManager;
12+
import org.apache.logging.log4j.Logger;
13+
import org.junit.After;
14+
import org.opensearch.action.admin.cluster.decommission.awareness.delete.DeleteDecommissionStateAction;
15+
import org.opensearch.action.admin.cluster.decommission.awareness.delete.DeleteDecommissionStateRequest;
16+
import org.opensearch.action.admin.cluster.decommission.awareness.delete.DeleteDecommissionStateResponse;
17+
import org.opensearch.action.admin.cluster.decommission.awareness.get.GetDecommissionStateAction;
18+
import org.opensearch.action.admin.cluster.decommission.awareness.get.GetDecommissionStateRequest;
19+
import org.opensearch.action.admin.cluster.decommission.awareness.get.GetDecommissionStateResponse;
20+
import org.opensearch.action.admin.cluster.decommission.awareness.put.DecommissionAction;
21+
import org.opensearch.action.admin.cluster.decommission.awareness.put.DecommissionRequest;
22+
import org.opensearch.action.admin.cluster.decommission.awareness.put.DecommissionResponse;
23+
import org.opensearch.cluster.ClusterState;
24+
import org.opensearch.cluster.decommission.DecommissionAttribute;
25+
import org.opensearch.cluster.decommission.DecommissionStatus;
26+
import org.opensearch.cluster.node.DiscoveryNode;
27+
import org.opensearch.cluster.node.DiscoveryNodeRole;
28+
import org.opensearch.cluster.service.ClusterService;
29+
import org.opensearch.common.Priority;
30+
import org.opensearch.common.settings.Settings;
31+
import org.opensearch.common.unit.TimeValue;
32+
import org.opensearch.plugins.Plugin;
33+
import org.opensearch.test.OpenSearchIntegTestCase;
34+
import org.opensearch.test.transport.MockTransportService;
35+
36+
import java.util.Collection;
37+
import java.util.Collections;
38+
import java.util.Iterator;
39+
import java.util.List;
40+
import java.util.concurrent.ExecutionException;
41+
42+
import static org.opensearch.test.NodeRoles.onlyRole;
43+
import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertNoTimeout;
44+
45+
@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.TEST, numDataNodes = 0)
46+
public class AwarenessAttributeDecommissionIT extends OpenSearchIntegTestCase {
47+
private final Logger logger = LogManager.getLogger(AwarenessAttributeDecommissionIT.class);
48+
49+
@Override
50+
protected Collection<Class<? extends Plugin>> nodePlugins() {
51+
return Collections.singletonList(MockTransportService.TestPlugin.class);
52+
}
53+
54+
@After
55+
public void cleanup() throws Exception {
56+
assertNoTimeout(client().admin().cluster().prepareHealth().get());
57+
}
58+
59+
public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionException, InterruptedException {
60+
Settings commonSettings = Settings.builder()
61+
.put("cluster.routing.allocation.awareness.attributes", "zone")
62+
.put("cluster.routing.allocation.awareness.force.zone.values", "a,b,c")
63+
.build();
64+
65+
logger.info("--> start 3 cluster manager nodes on zones 'a' & 'b' & 'c'");
66+
List<String> clusterManagerNodes = internalCluster().startNodes(
67+
Settings.builder()
68+
.put(commonSettings)
69+
.put("node.attr.zone", "a")
70+
.put(onlyRole(commonSettings, DiscoveryNodeRole.CLUSTER_MANAGER_ROLE))
71+
.build(),
72+
Settings.builder()
73+
.put(commonSettings)
74+
.put("node.attr.zone", "b")
75+
.put(onlyRole(commonSettings, DiscoveryNodeRole.CLUSTER_MANAGER_ROLE))
76+
.build(),
77+
Settings.builder()
78+
.put(commonSettings)
79+
.put("node.attr.zone", "c")
80+
.put(onlyRole(commonSettings, DiscoveryNodeRole.CLUSTER_MANAGER_ROLE))
81+
.build()
82+
);
83+
84+
logger.info("--> start 3 data nodes on zones 'a' & 'b' & 'c'");
85+
List<String> dataNodes = internalCluster().startNodes(
86+
Settings.builder()
87+
.put(commonSettings)
88+
.put("node.attr.zone", "a")
89+
.put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE))
90+
.build(),
91+
Settings.builder()
92+
.put(commonSettings)
93+
.put("node.attr.zone", "b")
94+
.put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE))
95+
.build(),
96+
Settings.builder()
97+
.put(commonSettings)
98+
.put("node.attr.zone", "c")
99+
.put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE))
100+
.build()
101+
);
102+
103+
ensureStableCluster(6);
104+
105+
logger.info("--> starting decommissioning nodes in zone {}", 'c');
106+
DecommissionAttribute decommissionAttribute = new DecommissionAttribute("zone", "c");
107+
DecommissionRequest decommissionRequest = new DecommissionRequest(decommissionAttribute);
108+
DecommissionResponse decommissionResponse = client().execute(DecommissionAction.INSTANCE, decommissionRequest).get();
109+
assertTrue(decommissionResponse.isAcknowledged());
110+
111+
// Will wait for all events to complete
112+
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
113+
114+
// assert that decommission status is successful
115+
GetDecommissionStateResponse response = client().execute(
116+
GetDecommissionStateAction.INSTANCE,
117+
new GetDecommissionStateRequest(decommissionAttribute.attributeName())
118+
).get();
119+
assertEquals(response.getAttributeValue(), decommissionAttribute.attributeValue());
120+
assertEquals(response.getDecommissionStatus(), DecommissionStatus.SUCCESSFUL);
121+
122+
ClusterState clusterState = client(clusterManagerNodes.get(0)).admin().cluster().prepareState().execute().actionGet().getState();
123+
assertEquals(4, clusterState.nodes().getSize());
124+
125+
// assert status on nodes that are part of cluster currently
126+
Iterator<DiscoveryNode> discoveryNodeIterator = clusterState.nodes().getNodes().valuesIt();
127+
while (discoveryNodeIterator.hasNext()) {
128+
// assert no node has decommissioned attribute
129+
DiscoveryNode node = discoveryNodeIterator.next();
130+
assertNotEquals(node.getAttributes().get("zone"), "c");
131+
132+
// assert all the nodes has status as SUCCESSFUL
133+
ClusterService localNodeClusterService = internalCluster().getInstance(ClusterService.class, node.getName());
134+
assertEquals(
135+
localNodeClusterService.state().metadata().decommissionAttributeMetadata().status(),
136+
DecommissionStatus.SUCCESSFUL
137+
);
138+
}
139+
140+
// assert status on decommissioned node
141+
// Here we will verify that until it got kicked out, it received appropriate status updates
142+
// decommissioned nodes hence will have status as IN_PROGRESS as it will be kicked out later after this
143+
// and won't receive status update to SUCCESSFUL
144+
String randomDecommissionedNode = randomFrom(clusterManagerNodes.get(2), dataNodes.get(2));
145+
ClusterService decommissionedNodeClusterService = internalCluster().getInstance(ClusterService.class, randomDecommissionedNode);
146+
assertEquals(
147+
decommissionedNodeClusterService.state().metadata().decommissionAttributeMetadata().status(),
148+
DecommissionStatus.IN_PROGRESS
149+
);
150+
151+
// Will wait for all events to complete
152+
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
153+
154+
// Recommissioning the zone back to gracefully succeed the test once above tests succeeds
155+
DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(clusterManagerNodes.get(0)).execute(
156+
DeleteDecommissionStateAction.INSTANCE,
157+
new DeleteDecommissionStateRequest()
158+
).get();
159+
assertTrue(deleteDecommissionStateResponse.isAcknowledged());
160+
161+
// will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes)
162+
// as by then all nodes should have joined the cluster
163+
ensureStableCluster(6, TimeValue.timeValueMinutes(2));
164+
}
165+
}

server/src/main/java/org/opensearch/OpenSearchException.java

+13
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@
6969
import static java.util.Collections.unmodifiableMap;
7070
import static org.opensearch.Version.V_2_1_0;
7171
import static org.opensearch.Version.V_2_3_0;
72+
import static org.opensearch.Version.V_2_4_0;
7273
import static org.opensearch.cluster.metadata.IndexMetadata.INDEX_UUID_NA_VALUE;
7374
import static org.opensearch.common.xcontent.XContentParserUtils.ensureExpectedToken;
7475
import static org.opensearch.common.xcontent.XContentParserUtils.ensureFieldName;
@@ -1608,6 +1609,18 @@ private enum OpenSearchExceptionHandle {
16081609
org.opensearch.index.shard.PrimaryShardClosedException::new,
16091610
162,
16101611
V_2_3_0
1612+
),
1613+
DECOMMISSIONING_FAILED_EXCEPTION(
1614+
org.opensearch.cluster.decommission.DecommissioningFailedException.class,
1615+
org.opensearch.cluster.decommission.DecommissioningFailedException::new,
1616+
163,
1617+
V_2_4_0
1618+
),
1619+
NODE_DECOMMISSIONED_EXCEPTION(
1620+
org.opensearch.cluster.decommission.NodeDecommissionedException.class,
1621+
org.opensearch.cluster.decommission.NodeDecommissionedException::new,
1622+
164,
1623+
V_2_4_0
16111624
);
16121625

16131626
final Class<? extends OpenSearchException> exceptionClass;

server/src/main/java/org/opensearch/action/ActionModule.java

+16
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@
4040
import org.opensearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsAction;
4141
import org.opensearch.action.admin.cluster.configuration.TransportAddVotingConfigExclusionsAction;
4242
import org.opensearch.action.admin.cluster.configuration.TransportClearVotingConfigExclusionsAction;
43+
import org.opensearch.action.admin.cluster.decommission.awareness.get.GetDecommissionStateAction;
44+
import org.opensearch.action.admin.cluster.decommission.awareness.get.TransportGetDecommissionStateAction;
45+
import org.opensearch.action.admin.cluster.decommission.awareness.delete.DeleteDecommissionStateAction;
46+
import org.opensearch.action.admin.cluster.decommission.awareness.delete.TransportDeleteDecommissionStateAction;
47+
import org.opensearch.action.admin.cluster.decommission.awareness.put.DecommissionAction;
48+
import org.opensearch.action.admin.cluster.decommission.awareness.put.TransportDecommissionAction;
4349
import org.opensearch.action.admin.cluster.health.ClusterHealthAction;
4450
import org.opensearch.action.admin.cluster.health.TransportClusterHealthAction;
4551
import org.opensearch.action.admin.cluster.node.hotthreads.NodesHotThreadsAction;
@@ -309,9 +315,11 @@
309315
import org.opensearch.rest.action.admin.cluster.RestClusterStatsAction;
310316
import org.opensearch.rest.action.admin.cluster.RestClusterUpdateSettingsAction;
311317
import org.opensearch.rest.action.admin.cluster.RestCreateSnapshotAction;
318+
import org.opensearch.rest.action.admin.cluster.RestDeleteDecommissionStateAction;
312319
import org.opensearch.rest.action.admin.cluster.RestDeleteRepositoryAction;
313320
import org.opensearch.rest.action.admin.cluster.RestDeleteSnapshotAction;
314321
import org.opensearch.rest.action.admin.cluster.RestDeleteStoredScriptAction;
322+
import org.opensearch.rest.action.admin.cluster.RestGetDecommissionStateAction;
315323
import org.opensearch.rest.action.admin.cluster.RestGetRepositoriesAction;
316324
import org.opensearch.rest.action.admin.cluster.RestGetScriptContextAction;
317325
import org.opensearch.rest.action.admin.cluster.RestGetScriptLanguageAction;
@@ -324,6 +332,7 @@
324332
import org.opensearch.rest.action.admin.cluster.RestNodesStatsAction;
325333
import org.opensearch.rest.action.admin.cluster.RestNodesUsageAction;
326334
import org.opensearch.rest.action.admin.cluster.RestPendingClusterTasksAction;
335+
import org.opensearch.rest.action.admin.cluster.RestDecommissionAction;
327336
import org.opensearch.rest.action.admin.cluster.RestPutRepositoryAction;
328337
import org.opensearch.rest.action.admin.cluster.RestPutStoredScriptAction;
329338
import org.opensearch.rest.action.admin.cluster.RestReloadSecureSettingsAction;
@@ -694,6 +703,10 @@ public <Request extends ActionRequest, Response extends ActionResponse> void reg
694703
actions.register(PitSegmentsAction.INSTANCE, TransportPitSegmentsAction.class);
695704
actions.register(GetAllPitsAction.INSTANCE, TransportGetAllPitsAction.class);
696705

706+
// Decommission actions
707+
actions.register(DecommissionAction.INSTANCE, TransportDecommissionAction.class);
708+
actions.register(GetDecommissionStateAction.INSTANCE, TransportGetDecommissionStateAction.class);
709+
actions.register(DeleteDecommissionStateAction.INSTANCE, TransportDeleteDecommissionStateAction.class);
697710
return unmodifiableMap(actions.getRegistry());
698711
}
699712

@@ -875,6 +888,7 @@ public void initRestHandlers(Supplier<DiscoveryNodes> nodesInCluster) {
875888
registerHandler.accept(new RestDeletePitAction());
876889
registerHandler.accept(new RestGetAllPitsAction(nodesInCluster));
877890
registerHandler.accept(new RestPitSegmentsAction(nodesInCluster));
891+
registerHandler.accept(new RestDeleteDecommissionStateAction());
878892

879893
for (ActionPlugin plugin : actionPlugins) {
880894
for (RestHandler handler : plugin.getRestHandlers(
@@ -890,6 +904,8 @@ public void initRestHandlers(Supplier<DiscoveryNodes> nodesInCluster) {
890904
}
891905
}
892906
registerHandler.accept(new RestCatAction(catActions));
907+
registerHandler.accept(new RestDecommissionAction());
908+
registerHandler.accept(new RestGetDecommissionStateAction());
893909

894910
// Remote Store APIs
895911
if (FeatureFlags.isEnabled(FeatureFlags.REMOTE_STORE)) {

0 commit comments

Comments
 (0)