Skip to content

Commit 8ca56a6

Browse files
tcdowney and cwlbraa committed
move monit-driven http healthchecking into custom script
* Ancient BOSH monit will continuously restart us at 5+1 monit cycles whenever CC has a hard time coming up within 10 seconds. Moving the check into a custom script produces more predictable behavior: CC is restarted only after 5 consecutive failed curls, and the check no longer interferes with initial startup. [fixes #125] [fixes #164263056] Co-authored-by: Connor Braa <cbraa@pivotal.io> Co-authored-by: Tim Downey <tdowney@pivotal.io>
1 parent 2068958 commit 8ca56a6

File tree

5 files changed

+70
-35
lines changed

5 files changed

+70
-35
lines changed

jobs/cloud_controller_ng/monit

+8-35
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,22 @@
1-
<%
2-
def discover_external_ip
3-
networks = spec.networks.marshal_dump
4-
5-
_, network = networks.find do |_name, network_spec|
6-
network_spec.default
7-
end
8-
9-
if !network
10-
_, network = networks.first
11-
end
12-
13-
if !network
14-
raise "Could not determine IP via network spec: #{networks}"
15-
end
16-
17-
network.ip
18-
end
19-
%>
201

212
<% if p("bpm.enabled") %>
223

234
check process cloud_controller_ng
245
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/cloud_controller_ng.pid
256
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng"
267
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng"
8+
depends on ccng_monit_http_healthcheck
279
group vcap
2810
if totalmem > <%= p("cc.thresholds.api.alert_if_above_mb") %> Mb for 3 cycles then alert
2911
if totalmem > <%= p("cc.thresholds.api.restart_if_consistently_above_mb") %> Mb for 15 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
3012
if totalmem > <%= p("cc.thresholds.api.restart_if_above_mb") %> Mb for 3 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
31-
<% if p('cc.nginx.ip').empty? %>
32-
if failed host <%= discover_external_ip %> port <%= p("cc.external_port") %> protocol http
33-
<% else %>
34-
if failed host <%= p('cc.nginx.ip') %> port <%= p("cc.external_port") %> protocol http
35-
<% end %>
36-
and request '/v2/info'
37-
with timeout 60 seconds for 5 cycles
38-
then restart
13+
14+
15+
check process ccng_monit_http_healthcheck
16+
with pidfile /var/vcap/sys/run/bpm/cloud_controller_ng/ccng_monit_http_healthcheck.pid
17+
start program "/var/vcap/jobs/bpm/bin/bpm start cloud_controller_ng -p ccng_monit_http_healthcheck"
18+
stop program "/var/vcap/jobs/bpm/bin/bpm stop cloud_controller_ng -p ccng_monit_http_healthcheck"
19+
group vcap
3920

4021
<% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
4122
check process cloud_controller_worker_local_<%= index %>
@@ -75,14 +56,6 @@ check process cloud_controller_ng
7556
if totalmem > <%= p("cc.thresholds.api.alert_if_above_mb") %> Mb for 3 cycles then alert
7657
if totalmem > <%= p("cc.thresholds.api.restart_if_consistently_above_mb") %> Mb for 15 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
7758
if totalmem > <%= p("cc.thresholds.api.restart_if_above_mb") %> Mb for 3 cycles then exec "/var/vcap/jobs/cloud_controller_ng/bin/restart_drain"
78-
<% if p('cc.nginx.ip').empty? %>
79-
if failed host <%= discover_external_ip %> port <%= p("cc.external_port") %> protocol http
80-
<% else %>
81-
if failed host <%= p('cc.nginx.ip') %> port <%= p("cc.external_port") %> protocol http
82-
<% end %>
83-
and request '/v2/info'
84-
with timeout 60 seconds for 5 cycles
85-
then restart
8659

8760
<% (1..(p("cc.jobs.local.number_of_workers"))).each do |index| %>
8861
check process cloud_controller_worker_local_<%= index %>

jobs/cloud_controller_ng/spec

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ templates:
2020
cloud_controller_api_ctl.erb: bin/cloud_controller_ng_ctl
2121
cloud_controller_api_health_check.erb: bin/cloud_controller_ng_health_check
2222
cloud_controller_api_worker_ctl.erb: bin/cloud_controller_worker_ctl
23+
ccng_monit_http_healthcheck.sh.erb: bin/ccng_monit_http_healthcheck
2324
console.erb: bin/console
2425
dns_health_check.erb: bin/dns_health_check
2526
drain.sh.erb: bin/drain

jobs/cloud_controller_ng/templates/bpm.yml.erb

+6
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,18 @@ nginx_maintenance_config = {
7070
"ephemeral_disk" => true,
7171
}
7272

73+
ccng_monit_http_healthcheck_config = {
74+
"name" => "ccng_monit_http_healthcheck",
75+
"executable" => "/var/vcap/jobs/cloud_controller_ng/bin/ccng_monit_http_healthcheck",
76+
}
77+
7378
config = {
7479
"processes" => [
7580
cloud_controller_ng_config,
7681
nginx_config,
7782
nginx_newrelic_plugin_config,
7883
nginx_maintenance_config,
84+
ccng_monit_http_healthcheck_config,
7985
]
8086
}
8187

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env bash
2+
set -e
3+
4+
<%
5+
def discover_external_ip
6+
networks = spec.networks.marshal_dump
7+
8+
_, network = networks.find do |_name, network_spec|
9+
network_spec.default
10+
end
11+
12+
if !network
13+
_, network = networks.first
14+
end
15+
16+
if !network
17+
raise "Could not determine IP via network spec: #{networks}"
18+
end
19+
20+
network.ip
21+
end
22+
%>
23+
24+
function log_failure {
25+
echo "$(date --rfc-3339=seconds) :: Healthcheck failed consistently, restarting CC"
26+
}
27+
28+
HOST=<%= p('cc.nginx.ip').empty? ? discover_external_ip : p('cc.nginx.ip') %>
29+
PORT=<%= p("cc.external_port") %>
30+
URL="http://${HOST}:${PORT}/v2/info"
31+
32+
source /var/vcap/packages/capi_utils/monit_utils.sh
33+
34+
echo 'Waiting for Cloud Controller to initially become healthy'
35+
36+
wait_for_server_to_become_healthy "${URL}" "<%= p("cc.api_post_start_healthcheck_timeout_in_seconds") %>"
37+
38+
echo 'Initial check passed, will now restart CC over on repeated failures'
39+
40+
trap log_failure EXIT
41+
42+
# if we fail to curl it 5 times in a row across 50 seconds, die so monit will restart us
43+
set -e
44+
while true; do
45+
curl \
46+
-sS \
47+
--max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
48+
--retry 5 \
49+
--retry-delay 10 \
50+
-A "ccng_monit_http_healthcheck" \
51+
"${URL}" > /dev/null
52+
sleep 10
53+
done

jobs/cloud_controller_ng/templates/cloud_controller_api_health_check.erb

+2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/usr/bin/env bash
22

3+
# this script is run by the route-registrar
4+
35
curl \
46
--max-time <%= p('cc.api_health_check_timeout_per_retry') %> \
57
--retry-max-time <%= p('cc.api_health_check_total_timeout') %> \

0 commit comments

Comments
 (0)