Fix swarm bay failure reporting

The old method of detecting failures was unreliable because it
relied on all bay services being started by the time cloud-init
finished. That assumption often does not hold: the bay services
are started asynchronously and can take quite a while to start.

The new method relies on systemd's OnFailure directive to kick off
specific service units when a failure is detected. Both the swarm
agent and manager have their own failure service so that we are
not overloading a single wait condition with multiple potential
failures.

Change-Id: I7ce4be567517fe948dde0ac7225996967196c9e8
Closes-bug: #1502329
This commit is contained in:
Andrew Melton
2015-10-02 20:49:30 +00:00
parent 9dd29bbb48
commit 156e315e98
6 changed files with 77 additions and 33 deletions

View File

@@ -7,22 +7,6 @@ echo "notifying heat"
STATUS="SUCCESS"
REASON="Setup complete"
DATA="OK"
FAILED_SERVICE=""
for service in $NODE_SERVICES; do
echo "checking service status for $service"
systemctl status $service
if [[ $? -ne 0 ]]; then
echo "$service is not active, the cluster is not valid"
FAILED_SERVICE="$FAILED_SERVICE $service"
fi
done
if [[ -n $FAILED_SERVICE ]]; then
STATUS="FAILURE"
REASON="Setup failed, $FAILED_SERVICE not start up correctly."
DATA="Failed"
fi
data=$(echo '{"Status": "'${STATUS}'", "Reason": "'$REASON'", "Data": "'${DATA}'", "UniqueId": "00000"}')

View File

@@ -0,0 +1,16 @@
#cloud-config
# Installs a systemd unit named "<service>-failure.service" that notifies
# Heat when the corresponding bay service fails.
#
# The unit is intended to be started through another unit's OnFailure=
# setting. When it runs, curl PUTs a JSON FAILURE signal to the wait handle
# URL. The service name and the wait handle URL are dollar-prefixed
# placeholders (SERVICE, WAIT_HANDLE) that must be substituted before this
# file is handed to cloud-init.
#
# The "Data" field carries "Failed" so the payload is consistent with its
# FAILURE status.
merge_how: dict(recurse_array)+list(append)
write_files:
  - path: /etc/systemd/system/$SERVICE-failure.service
    owner: "root:root"
    permissions: "0644"
    content: |
      [Unit]
      Description=$SERVICE Failure Notifier
      [Service]
      Type=simple
      TimeoutStartSec=0
      ExecStart=/usr/bin/curl -sf -X PUT -H 'Content-Type: application/json' \
        --data-binary '{"Status": "FAILURE", "Reason": "$SERVICE service failed to start.", "Data": "Failed", "UniqueId": "00000"}' \
        "$WAIT_HANDLE"

View File

@@ -9,12 +9,13 @@ write_files:
Description=Swarm Agent
After=docker.service
Requires=docker.service
OnFailure=swarm-agent-failure.service
[Service]
TimeoutStartSec=0
ExecStartPre=-/usr/bin/docker kill swarm-agent
ExecStartPre=-/usr/bin/docker rm swarm-agent
ExecStartPre=/usr/bin/docker pull swarm:0.2.0
ExecStartPre=-/usr/bin/docker pull swarm:0.2.0
#TODO: roll-back from swarm:0.2.0 to swarm if atomic image can work with latest swarm image
ExecStart=/usr/bin/docker run -e http_proxy=$HTTP_PROXY -e https_proxy=$HTTPS_PROXY -e no_proxy=$NO_PROXY --name swarm-agent swarm:0.2.0 join --addr $NODE_IP:2375 $DISCOVERY_URL
ExecStop=/usr/bin/docker stop swarm-agent

View File

@@ -5,12 +5,13 @@ cat > /etc/systemd/system/swarm-manager.service << END_SERVICE_TOP
Description=Swarm Manager
After=docker.service
Requires=docker.service
OnFailure=swarm-manager-failure.service
[Service]
TimeoutStartSec=0
ExecStartPre=-/usr/bin/docker kill swarm-manager
ExecStartPre=-/usr/bin/docker rm swarm-manager
ExecStartPre=/usr/bin/docker pull swarm:0.2.0
ExecStartPre=-/usr/bin/docker pull swarm:0.2.0
#TODO: roll-back from swarm:0.2.0 to swarm if atomic image can work with latest swarm image
ExecStart=/usr/bin/docker run --name swarm-manager \\
-v /etc/docker:/etc/docker \\

View File

@@ -87,6 +87,18 @@ parameters:
resources:
cloud_init_wait_handle:
type: "AWS::CloudFormation::WaitConditionHandle"
cloud_init_wait_condition:
type: "AWS::CloudFormation::WaitCondition"
depends_on:
- swarm_master
properties:
Handle:
get_resource: cloud_init_wait_handle
Timeout: 6000
master_wait_handle:
type: "AWS::CloudFormation::WaitConditionHandle"
@@ -179,7 +191,7 @@ resources:
str_replace:
template: {get_file: fragments/write-heat-params.yaml}
params:
"$WAIT_HANDLE": {get_resource: master_wait_handle}
"$WAIT_HANDLE": {get_resource: cloud_init_wait_handle}
"$HTTP_PROXY": {get_param: http_proxy}
"$HTTPS_PROXY": {get_param: https_proxy}
"$NO_PROXY": {get_param: no_proxy}
@@ -214,6 +226,28 @@ resources:
group: ungrouped
config: {get_file: fragments/write-docker-service.sh}
# Renders fragments/write-bay-failure-service.yaml for the swarm-agent
# service, pointing its failure signal at agent_wait_handle.
write_swarm_agent_failure_service:
  type: 'OS::Heat::SoftwareConfig'
  properties:
    group: ungrouped
    config:
      str_replace:
        template:
          get_file: fragments/write-bay-failure-service.yaml
        params:
          '$SERVICE': swarm-agent
          '$WAIT_HANDLE':
            get_resource: agent_wait_handle
# Renders fragments/write-bay-failure-service.yaml for the swarm-manager
# service, pointing its failure signal at master_wait_handle.
write_swarm_manager_failure_service:
  type: 'OS::Heat::SoftwareConfig'
  properties:
    group: ungrouped
    config:
      str_replace:
        template:
          get_file: fragments/write-bay-failure-service.yaml
        params:
          '$SERVICE': swarm-manager
          '$WAIT_HANDLE':
            get_resource: master_wait_handle
write_docker_socket:
type: "OS::Heat::SoftwareConfig"
properties:
@@ -257,17 +291,14 @@ resources:
config:
str_replace:
template: {get_file: fragments/enable-services.sh}
params: &node_services
params:
"$NODE_SERVICES": "docker.socket swarm-agent swarm-manager"
cfn_signal:
type: "OS::Heat::SoftwareConfig"
properties:
group: ungrouped
config:
str_replace:
template: {get_file: fragments/cfn-signal.sh}
params: *node_services
config: {get_file: fragments/cfn-signal.sh}
disable_selinux:
type: "OS::Heat::SoftwareConfig"
@@ -289,6 +320,8 @@ resources:
- config: {get_resource: remove_docker_key}
- config: {get_resource: write_heat_params}
- config: {get_resource: make_cert}
- config: {get_resource: write_swarm_agent_failure_service}
- config: {get_resource: write_swarm_manager_failure_service}
- config: {get_resource: write_docker_service}
- config: {get_resource: write_docker_socket}
- config: {get_resource: write_swarm_agent_service}

View File

@@ -75,16 +75,16 @@ parameters:
resources:
node_wait_handle:
node_cloud_init_wait_handle:
type: "AWS::CloudFormation::WaitConditionHandle"
node_wait_condition:
node_cloud_init_wait_condition:
type: "AWS::CloudFormation::WaitCondition"
depends_on:
- swarm_node
properties:
Handle:
get_resource: node_wait_handle
get_resource: node_cloud_init_wait_handle
Timeout: 6000
node_agent_wait_handle:
@@ -126,7 +126,7 @@ resources:
str_replace:
template: {get_file: fragments/write-heat-params.yaml}
params:
"$WAIT_HANDLE": {get_resource: node_wait_handle}
"$WAIT_HANDLE": {get_resource: node_cloud_init_wait_handle}
"$HTTP_PROXY": {get_param: http_proxy}
"$HTTPS_PROXY": {get_param: https_proxy}
"$NO_PROXY": {get_param: no_proxy}
@@ -167,6 +167,17 @@ resources:
group: ungrouped
config: {get_file: fragments/write-docker-socket.yaml}
# Renders fragments/write-bay-failure-service.yaml for the swarm-agent
# service on a node, pointing its failure signal at node_agent_wait_handle.
write_swarm_agent_failure_service:
  type: 'OS::Heat::SoftwareConfig'
  properties:
    group: ungrouped
    config:
      str_replace:
        template:
          get_file: fragments/write-bay-failure-service.yaml
        params:
          '$SERVICE': swarm-agent
          '$WAIT_HANDLE':
            get_resource: node_agent_wait_handle
write_swarm_agent_service:
type: "OS::Heat::SoftwareConfig"
properties:
@@ -189,17 +200,14 @@ resources:
config:
str_replace:
template: {get_file: fragments/enable-services.sh}
params: &node_services
params:
"$NODE_SERVICES": "docker.socket swarm-agent"
cfn_signal:
type: "OS::Heat::SoftwareConfig"
properties:
group: ungrouped
config:
str_replace:
template: {get_file: fragments/cfn-signal.sh}
params: *node_services
config: {get_file: fragments/cfn-signal.sh}
disable_selinux:
type: "OS::Heat::SoftwareConfig"
@@ -221,6 +229,7 @@ resources:
- config: {get_resource: remove_docker_key}
- config: {get_resource: write_heat_params}
- config: {get_resource: make_cert}
- config: {get_resource: write_swarm_agent_failure_service}
- config: {get_resource: write_swarm_agent_service}
- config: {get_resource: write_docker_service}
- config: {get_resource: write_docker_socket}