Skip to content

Commit d588ccd

Browse files
committed
Chaos Monkey now restarts nodes
Two new modes in chaos monkey: - `node_restart` occasionally restarts one of the nodes that are initially running (but always just one) - `node_set` maintains an extra set of nodes (which are initially shut down), and periodically restarts them arbitrarily, occasionally also wiping out their data folders.
1 parent 63c9aae commit d588ccd

File tree

5 files changed

+143
-63
lines changed

5 files changed

+143
-63
lines changed

Cargo.lock

+3-3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pytest/lib/cluster.py

+15-10
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import multiprocessing
12
import threading
23
import subprocess
34
import json
45
import os
6+
import signal
57
import atexit
68
import shutil
79
import requests
@@ -60,22 +62,22 @@ def _get_command_line(self, near_root, node_dir, boot_key, boot_node_addr):
6062
def wait_for_rpc(self, timeout=1):
6163
retry.retry(lambda: self.get_status(), timeout=timeout)
6264

63-
def json_rpc(self, method, params):
65+
def json_rpc(self, method, params, timeout=2):
6466
j = {
6567
'method': method,
6668
'params': params,
6769
'id': 'dontcare',
6870
'jsonrpc': '2.0'
6971
}
70-
r = requests.post("http://%s:%s" % self.rpc_addr(), json=j, timeout=1)
72+
r = requests.post("http://%s:%s" % self.rpc_addr(), json=j, timeout=timeout)
7173
r.raise_for_status()
7274
return json.loads(r.content)
7375

7476
def send_tx(self, signed_tx):
7577
return self.json_rpc('broadcast_tx_async', [base64.b64encode(signed_tx).decode('utf8')])
7678

7779
def get_status(self):
78-
r = requests.get("http://%s:%s/status" % self.rpc_addr(), timeout=1)
80+
r = requests.get("http://%s:%s/status" % self.rpc_addr(), timeout=2)
7981
r.raise_for_status()
8082
return json.loads(r.content)
8183

@@ -106,7 +108,7 @@ def __init__(self, port, rpc_port, near_root, node_dir):
106108
self.node_key = Key.from_json_file(os.path.join(node_dir, "node_key.json"))
107109
self.signer_key = Key.from_json_file(os.path.join(node_dir, "validator_key.json"))
108110

109-
self.handle = None
111+
self.pid = multiprocessing.Value('i', 0)
110112

111113
atexit.register(atexit_cleanup, self)
112114

@@ -124,14 +126,17 @@ def start(self, boot_key, boot_node_addr):
124126
self.stderr = open(os.path.join(self.node_dir, 'stderr'), 'a')
125127
cmd = self._get_command_line(
126128
self.near_root, self.node_dir, boot_key, boot_node_addr)
127-
self.handle = subprocess.Popen(
128-
cmd, stdout=self.stdout, stderr=self.stderr, env=env)
129-
self.wait_for_rpc()
129+
self.pid.value = subprocess.Popen(
130+
cmd, stdout=self.stdout, stderr=self.stderr, env=env).pid
131+
self.wait_for_rpc(5)
130132

131133
def kill(self):
132-
if self.handle is not None:
133-
self.handle.kill()
134-
self.handle = None
134+
if self.pid.value != 0:
135+
os.kill(self.pid.value, signal.SIGKILL)
136+
self.pid.value = 0
137+
138+
def reset_data(self):
139+
shutil.rmtree(os.path.join(self.node_dir, "data"))
135140

136141
def cleanup(self):
137142
self.kill()

pytest/lib/network.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,12 @@ def stop_network(pid):
2525
f.write(str(pid))
2626

2727
def resume_network(pid):
28-
with open('/sys/fs/cgroup/net_cls/tasks', 'w') as f:
29-
f.write(str(pid))
28+
try:
29+
with open('/sys/fs/cgroup/net_cls/tasks', 'w') as f:
30+
f.write(str(pid))
31+
except ProcessLookupError:
32+
# the process was killed in the meantime
33+
pass
3034

3135
if __name__ == "__main__":
3236
import time

pytest/tests/stress/network_stress.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def monkey_network_hammering(stopped, error, nodes, nonces):
3838
s = [False for x in nodes]
3939
while stopped.value == 0:
4040
node_idx = random.randint(0, len(nodes) - 2)
41-
pid = nodes[node_idx].handle.pid
41+
pid = nodes[node_idx].pid.value
4242
if s[node_idx]:
4343
print("Resuming network for process %s" % pid)
4444
resume_network(pid)
@@ -50,7 +50,7 @@ def monkey_network_hammering(stopped, error, nodes, nonces):
5050
time.sleep(0.5)
5151
for i, x in enumerate(s):
5252
if x:
53-
pid = nodes[i].handle.pid
53+
pid = nodes[i].pid.value
5454
print("Resuming network for process %s" % pid)
5555
resume_network(pid)
5656

0 commit comments

Comments
 (0)