Fix swarm bay failure reporting
The old method of detecting failures was likely to fail in many cases because it relied on all bay services having started by the time cloud-init finished. This is a problem because the bay services are started asynchronously and can take quite a while to start.

The new method relies on systemd's OnFailure directive to start a dedicated failure-notification service unit when a service fails. The swarm agent and the swarm manager each have their own failure service, so that a single wait condition is not overloaded with multiple potential failures.

Change-Id: I7ce4be567517fe948dde0ac7225996967196c9e8
Closes-bug: #1502329
This commit is contained in:
@@ -7,22 +7,6 @@ echo "notifying heat"
|
||||
STATUS="SUCCESS"
|
||||
REASON="Setup complete"
|
||||
DATA="OK"
|
||||
FAILED_SERVICE=""
|
||||
|
||||
for service in $NODE_SERVICES; do
|
||||
echo "checking service status for $service"
|
||||
systemctl status $service
|
||||
if [[ $? -ne 0 ]]; then
|
||||
echo "$service is not active, the cluster is not valid"
|
||||
FAILED_SERVICE="$FAILED_SERVICE $service"
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ -n $FAILED_SERVICE ]]; then
|
||||
STATUS="FAILURE"
|
||||
REASON="Setup failed, $FAILED_SERVICE not start up correctly."
|
||||
DATA="Failed"
|
||||
fi
|
||||
|
||||
data=$(echo '{"Status": "'${STATUS}'", "Reason": "'$REASON'", "Data": "'${DATA}'", "UniqueId": "00000"}')
|
||||
|
||||
|
||||
@@ -0,0 +1,16 @@
|
||||
#cloud-config
# Installs a systemd "failure notifier" unit for one bay service.
#
# Placeholders (substituted via str_replace params in the Heat templates
# before this fragment reaches cloud-init):
#   $SERVICE      - name of the monitored service (swarm-agent or
#                   swarm-manager); also forms the unit file name
#                   "$SERVICE-failure.service".
#   $WAIT_HANDLE  - wait condition handle that receives the signal.
#
# The monitored units reference this unit with
# "OnFailure=$SERVICE-failure.service"; when it runs, it sends a FAILURE
# status to the wait handle with curl.
#
# NOTE(review): the payload reports "Data": "OK" together with
# "Status": "FAILURE", whereas the shell check it replaces used
# DATA="Failed" - confirm whether "OK" is intended.
merge_how: dict(recurse_array)+list(append)
write_files:
  - path: /etc/systemd/system/$SERVICE-failure.service
    owner: "root:root"
    permissions: "0644"
    content: |
      [Unit]
      Description=$SERVICE Failure Notifier

      [Service]
      Type=simple
      TimeoutStartSec=0
      ExecStart=/usr/bin/curl -sf -X PUT -H 'Content-Type: application/json' \
        --data-binary '{"Status": "FAILURE", "Reason": "$SERVICE service failed to start.", "Data": "OK", "UniqueId": "00000"}' \
        "$WAIT_HANDLE"
|
||||
@@ -9,12 +9,13 @@ write_files:
|
||||
Description=Swarm Agent
|
||||
After=docker.service
|
||||
Requires=docker.service
|
||||
OnFailure=swarm-agent-failure.service
|
||||
|
||||
[Service]
|
||||
TimeoutStartSec=0
|
||||
ExecStartPre=-/usr/bin/docker kill swarm-agent
|
||||
ExecStartPre=-/usr/bin/docker rm swarm-agent
|
||||
ExecStartPre=/usr/bin/docker pull swarm:0.2.0
|
||||
ExecStartPre=-/usr/bin/docker pull swarm:0.2.0
|
||||
#TODO: roll-back from swarm:0.2.0 to swarm if atomic image can work with latest swarm image
|
||||
ExecStart=/usr/bin/docker run -e http_proxy=$HTTP_PROXY -e https_proxy=$HTTPS_PROXY -e no_proxy=$NO_PROXY --name swarm-agent swarm:0.2.0 join --addr $NODE_IP:2375 $DISCOVERY_URL
|
||||
ExecStop=/usr/bin/docker stop swarm-agent
|
||||
|
||||
@@ -5,12 +5,13 @@ cat > /etc/systemd/system/swarm-manager.service << END_SERVICE_TOP
|
||||
Description=Swarm Manager
|
||||
After=docker.service
|
||||
Requires=docker.service
|
||||
OnFailure=swarm-manager-failure.service
|
||||
|
||||
[Service]
|
||||
TimeoutStartSec=0
|
||||
ExecStartPre=-/usr/bin/docker kill swarm-manager
|
||||
ExecStartPre=-/usr/bin/docker rm swarm-manager
|
||||
ExecStartPre=/usr/bin/docker pull swarm:0.2.0
|
||||
ExecStartPre=-/usr/bin/docker pull swarm:0.2.0
|
||||
#TODO: roll-back from swarm:0.2.0 to swarm if atomic image can work with latest swarm image
|
||||
ExecStart=/usr/bin/docker run --name swarm-manager \\
|
||||
-v /etc/docker:/etc/docker \\
|
||||
|
||||
@@ -87,6 +87,18 @@ parameters:
|
||||
|
||||
resources:
|
||||
|
||||
cloud_init_wait_handle:
|
||||
type: "AWS::CloudFormation::WaitConditionHandle"
|
||||
|
||||
cloud_init_wait_condition:
|
||||
type: "AWS::CloudFormation::WaitCondition"
|
||||
depends_on:
|
||||
- swarm_master
|
||||
properties:
|
||||
Handle:
|
||||
get_resource: cloud_init_wait_handle
|
||||
Timeout: 6000
|
||||
|
||||
master_wait_handle:
|
||||
type: "AWS::CloudFormation::WaitConditionHandle"
|
||||
|
||||
@@ -179,7 +191,7 @@ resources:
|
||||
str_replace:
|
||||
template: {get_file: fragments/write-heat-params.yaml}
|
||||
params:
|
||||
"$WAIT_HANDLE": {get_resource: master_wait_handle}
|
||||
"$WAIT_HANDLE": {get_resource: cloud_init_wait_handle}
|
||||
"$HTTP_PROXY": {get_param: http_proxy}
|
||||
"$HTTPS_PROXY": {get_param: https_proxy}
|
||||
"$NO_PROXY": {get_param: no_proxy}
|
||||
@@ -214,6 +226,28 @@ resources:
|
||||
group: ungrouped
|
||||
config: {get_file: fragments/write-docker-service.sh}
|
||||
|
||||
write_swarm_agent_failure_service:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
group: ungrouped
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/write-bay-failure-service.yaml}
|
||||
params:
|
||||
"$SERVICE": swarm-agent
|
||||
"$WAIT_HANDLE": {get_resource: agent_wait_handle}
|
||||
|
||||
write_swarm_manager_failure_service:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
group: ungrouped
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/write-bay-failure-service.yaml}
|
||||
params:
|
||||
"$SERVICE": swarm-manager
|
||||
"$WAIT_HANDLE": {get_resource: master_wait_handle}
|
||||
|
||||
write_docker_socket:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
@@ -257,17 +291,14 @@ resources:
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/enable-services.sh}
|
||||
params: &node_services
|
||||
params:
|
||||
"$NODE_SERVICES": "docker.socket swarm-agent swarm-manager"
|
||||
|
||||
cfn_signal:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
group: ungrouped
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/cfn-signal.sh}
|
||||
params: *node_services
|
||||
config: {get_file: fragments/cfn-signal.sh}
|
||||
|
||||
disable_selinux:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
@@ -289,6 +320,8 @@ resources:
|
||||
- config: {get_resource: remove_docker_key}
|
||||
- config: {get_resource: write_heat_params}
|
||||
- config: {get_resource: make_cert}
|
||||
- config: {get_resource: write_swarm_agent_failure_service}
|
||||
- config: {get_resource: write_swarm_manager_failure_service}
|
||||
- config: {get_resource: write_docker_service}
|
||||
- config: {get_resource: write_docker_socket}
|
||||
- config: {get_resource: write_swarm_agent_service}
|
||||
|
||||
@@ -75,16 +75,16 @@ parameters:
|
||||
|
||||
resources:
|
||||
|
||||
node_wait_handle:
|
||||
node_cloud_init_wait_handle:
|
||||
type: "AWS::CloudFormation::WaitConditionHandle"
|
||||
|
||||
node_wait_condition:
|
||||
node_cloud_init_wait_condition:
|
||||
type: "AWS::CloudFormation::WaitCondition"
|
||||
depends_on:
|
||||
- swarm_node
|
||||
properties:
|
||||
Handle:
|
||||
get_resource: node_wait_handle
|
||||
get_resource: node_cloud_init_wait_handle
|
||||
Timeout: 6000
|
||||
|
||||
node_agent_wait_handle:
|
||||
@@ -126,7 +126,7 @@ resources:
|
||||
str_replace:
|
||||
template: {get_file: fragments/write-heat-params.yaml}
|
||||
params:
|
||||
"$WAIT_HANDLE": {get_resource: node_wait_handle}
|
||||
"$WAIT_HANDLE": {get_resource: node_cloud_init_wait_handle}
|
||||
"$HTTP_PROXY": {get_param: http_proxy}
|
||||
"$HTTPS_PROXY": {get_param: https_proxy}
|
||||
"$NO_PROXY": {get_param: no_proxy}
|
||||
@@ -167,6 +167,17 @@ resources:
|
||||
group: ungrouped
|
||||
config: {get_file: fragments/write-docker-socket.yaml}
|
||||
|
||||
write_swarm_agent_failure_service:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
group: ungrouped
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/write-bay-failure-service.yaml}
|
||||
params:
|
||||
"$SERVICE": swarm-agent
|
||||
"$WAIT_HANDLE": {get_resource: node_agent_wait_handle}
|
||||
|
||||
write_swarm_agent_service:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
@@ -189,17 +200,14 @@ resources:
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/enable-services.sh}
|
||||
params: &node_services
|
||||
params:
|
||||
"$NODE_SERVICES": "docker.socket swarm-agent"
|
||||
|
||||
cfn_signal:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
properties:
|
||||
group: ungrouped
|
||||
config:
|
||||
str_replace:
|
||||
template: {get_file: fragments/cfn-signal.sh}
|
||||
params: *node_services
|
||||
config: {get_file: fragments/cfn-signal.sh}
|
||||
|
||||
disable_selinux:
|
||||
type: "OS::Heat::SoftwareConfig"
|
||||
@@ -221,6 +229,7 @@ resources:
|
||||
- config: {get_resource: remove_docker_key}
|
||||
- config: {get_resource: write_heat_params}
|
||||
- config: {get_resource: make_cert}
|
||||
- config: {get_resource: write_swarm_agent_failure_service}
|
||||
- config: {get_resource: write_swarm_agent_service}
|
||||
- config: {get_resource: write_docker_service}
|
||||
- config: {get_resource: write_docker_socket}
|
||||
|
||||
Reference in New Issue
Block a user