From 156e315e98dfa7a0ebe759bb78639c66eb0d8b77 Mon Sep 17 00:00:00 2001 From: Andrew Melton Date: Fri, 2 Oct 2015 20:49:30 +0000 Subject: [PATCH] Fix swarm bay failure reporting The old method of detecting failures was very likely to fail in many cases because it relied on all bay services being started by the time cloud-init finished. This is a problem because the bay services are started asynchronously and can take quite a while to start. The new method relies on systemd's OnFailure directive to kick off specific service units when a failure is detected. Both the swarm agent and manager have their own failure service so that we are not overloading a single wait condition with multiple potential failures. Change-Id: I7ce4be567517fe948dde0ac7225996967196c9e8 Closes-bug: #1502329 --- .../docker-swarm/fragments/cfn-signal.sh | 16 ------- .../fragments/write-bay-failure-service.yaml | 16 +++++++ .../fragments/write-swarm-agent-service.yaml | 3 +- .../fragments/write-swarm-master-service.sh | 3 +- magnum/templates/docker-swarm/swarm.yaml | 45 ++++++++++++++++--- magnum/templates/docker-swarm/swarmnode.yaml | 27 +++++++---- 6 files changed, 77 insertions(+), 33 deletions(-) create mode 100644 magnum/templates/docker-swarm/fragments/write-bay-failure-service.yaml diff --git a/magnum/templates/docker-swarm/fragments/cfn-signal.sh b/magnum/templates/docker-swarm/fragments/cfn-signal.sh index 19993235d3..18b6e9a48c 100644 --- a/magnum/templates/docker-swarm/fragments/cfn-signal.sh +++ b/magnum/templates/docker-swarm/fragments/cfn-signal.sh @@ -7,22 +7,6 @@ echo "notifying heat" STATUS="SUCCESS" REASON="Setup complete" DATA="OK" -FAILED_SERVICE="" - -for service in $NODE_SERVICES; do - echo "checking service status for $service" - systemctl status $service - if [[ $? -ne 0 ]]; then - echo "$service is not active, the cluster is not valid" - FAILED_SERVICE="$FAILED_SERVICE $service" - fi -done - -if [[ -n $FAILED_SERVICE ]]; then - STATUS="FAILURE" - REASON="Setup failed, $FAILED_SERVICE not start up correctly." - DATA="Failed" -fi data=$(echo '{"Status": "'${STATUS}'", "Reason": "'$REASON'", "Data": "'${DATA}'", "UniqueId": "00000"}') diff --git a/magnum/templates/docker-swarm/fragments/write-bay-failure-service.yaml b/magnum/templates/docker-swarm/fragments/write-bay-failure-service.yaml new file mode 100644 index 0000000000..6dfde5eccc --- /dev/null +++ b/magnum/templates/docker-swarm/fragments/write-bay-failure-service.yaml @@ -0,0 +1,16 @@ +#cloud-config +merge_how: dict(recurse_array)+list(append) +write_files: + - path: /etc/systemd/system/$SERVICE-failure.service + owner: "root:root" + permissions: "0644" + content: | + [Unit] + Description=$SERVICE Failure Notifier + + [Service] + Type=simple + TimeoutStartSec=0 + ExecStart=/usr/bin/curl -sf -X PUT -H 'Content-Type: application/json' \ + --data-binary '{"Status": "FAILURE", "Reason": "$SERVICE service failed to start.", "Data": "OK", "UniqueId": "00000"}' \ + "$WAIT_HANDLE" diff --git a/magnum/templates/docker-swarm/fragments/write-swarm-agent-service.yaml b/magnum/templates/docker-swarm/fragments/write-swarm-agent-service.yaml index 85ca284f77..d459242e1c 100644 --- a/magnum/templates/docker-swarm/fragments/write-swarm-agent-service.yaml +++ b/magnum/templates/docker-swarm/fragments/write-swarm-agent-service.yaml @@ -9,12 +9,13 @@ write_files: Description=Swarm Agent After=docker.service Requires=docker.service + OnFailure=swarm-agent-failure.service [Service] TimeoutStartSec=0 ExecStartPre=-/usr/bin/docker kill swarm-agent ExecStartPre=-/usr/bin/docker rm swarm-agent - ExecStartPre=/usr/bin/docker pull swarm:0.2.0 + ExecStartPre=-/usr/bin/docker pull swarm:0.2.0 #TODO: roll-back from swarm:0.2.0 to swarm if atomic image can work with latest swarm image ExecStart=/usr/bin/docker run -e http_proxy=$HTTP_PROXY -e https_proxy=$HTTPS_PROXY -e no_proxy=$NO_PROXY --name swarm-agent swarm:0.2.0 join --addr $NODE_IP:2375 $DISCOVERY_URL ExecStop=/usr/bin/docker stop swarm-agent diff --git a/magnum/templates/docker-swarm/fragments/write-swarm-master-service.sh b/magnum/templates/docker-swarm/fragments/write-swarm-master-service.sh index bd3e99a16c..27956786d8 100644 --- a/magnum/templates/docker-swarm/fragments/write-swarm-master-service.sh +++ b/magnum/templates/docker-swarm/fragments/write-swarm-master-service.sh @@ -5,12 +5,13 @@ cat > /etc/systemd/system/swarm-manager.service << END_SERVICE_TOP Description=Swarm Manager After=docker.service Requires=docker.service +OnFailure=swarm-manager-failure.service [Service] TimeoutStartSec=0 ExecStartPre=-/usr/bin/docker kill swarm-manager ExecStartPre=-/usr/bin/docker rm swarm-manager -ExecStartPre=/usr/bin/docker pull swarm:0.2.0 +ExecStartPre=-/usr/bin/docker pull swarm:0.2.0 #TODO: roll-back from swarm:0.2.0 to swarm if atomic image can work with latest swarm image ExecStart=/usr/bin/docker run --name swarm-manager \\ -v /etc/docker:/etc/docker \\ diff --git a/magnum/templates/docker-swarm/swarm.yaml b/magnum/templates/docker-swarm/swarm.yaml index 11a579e3bc..62d2e16012 100644 --- a/magnum/templates/docker-swarm/swarm.yaml +++ b/magnum/templates/docker-swarm/swarm.yaml @@ -87,6 +87,18 @@ parameters: resources: + cloud_init_wait_handle: + type: "AWS::CloudFormation::WaitConditionHandle" + + cloud_init_wait_condition: + type: "AWS::CloudFormation::WaitCondition" + depends_on: + - swarm_master + properties: + Handle: + get_resource: cloud_init_wait_handle + Timeout: 6000 + master_wait_handle: type: "AWS::CloudFormation::WaitConditionHandle" @@ -179,7 +191,7 @@ resources: str_replace: template: {get_file: fragments/write-heat-params.yaml} params: - "$WAIT_HANDLE": {get_resource: master_wait_handle} + "$WAIT_HANDLE": {get_resource: cloud_init_wait_handle} "$HTTP_PROXY": {get_param: http_proxy} "$HTTPS_PROXY": {get_param: https_proxy} "$NO_PROXY": {get_param: no_proxy} @@ -214,6 +226,28 @@ resources: group: ungrouped config: {get_file: fragments/write-docker-service.sh} + write_swarm_agent_failure_service: + type: "OS::Heat::SoftwareConfig" + properties: + group: ungrouped + config: + str_replace: + template: {get_file: fragments/write-bay-failure-service.yaml} + params: + "$SERVICE": swarm-agent + "$WAIT_HANDLE": {get_resource: agent_wait_handle} + + write_swarm_manager_failure_service: + type: "OS::Heat::SoftwareConfig" + properties: + group: ungrouped + config: + str_replace: + template: {get_file: fragments/write-bay-failure-service.yaml} + params: + "$SERVICE": swarm-manager + "$WAIT_HANDLE": {get_resource: master_wait_handle} + write_docker_socket: type: "OS::Heat::SoftwareConfig" properties: @@ -257,17 +291,14 @@ resources: config: str_replace: template: {get_file: fragments/enable-services.sh} - params: &node_services + params: "$NODE_SERVICES": "docker.socket swarm-agent swarm-manager" cfn_signal: type: "OS::Heat::SoftwareConfig" properties: group: ungrouped - config: - str_replace: - template: {get_file: fragments/cfn-signal.sh} - params: *node_services + config: {get_file: fragments/cfn-signal.sh} disable_selinux: type: "OS::Heat::SoftwareConfig" @@ -289,6 +320,8 @@ resources: - config: {get_resource: remove_docker_key} - config: {get_resource: write_heat_params} - config: {get_resource: make_cert} + - config: {get_resource: write_swarm_agent_failure_service} + - config: {get_resource: write_swarm_manager_failure_service} - config: {get_resource: write_docker_service} - config: {get_resource: write_docker_socket} - config: {get_resource: write_swarm_agent_service} diff --git a/magnum/templates/docker-swarm/swarmnode.yaml b/magnum/templates/docker-swarm/swarmnode.yaml index 4deb30cd8a..229b164335 100644 --- a/magnum/templates/docker-swarm/swarmnode.yaml +++ b/magnum/templates/docker-swarm/swarmnode.yaml @@ -75,16 +75,16 @@ parameters: resources: - node_wait_handle: + node_cloud_init_wait_handle: type: "AWS::CloudFormation::WaitConditionHandle" - node_wait_condition: + node_cloud_init_wait_condition: type: "AWS::CloudFormation::WaitCondition" depends_on: - swarm_node properties: Handle: - get_resource: node_wait_handle + get_resource: node_cloud_init_wait_handle Timeout: 6000 node_agent_wait_handle: @@ -126,7 +126,7 @@ resources: str_replace: template: {get_file: fragments/write-heat-params.yaml} params: - "$WAIT_HANDLE": {get_resource: node_wait_handle} + "$WAIT_HANDLE": {get_resource: node_cloud_init_wait_handle} "$HTTP_PROXY": {get_param: http_proxy} "$HTTPS_PROXY": {get_param: https_proxy} "$NO_PROXY": {get_param: no_proxy} @@ -167,6 +167,17 @@ resources: group: ungrouped config: {get_file: fragments/write-docker-socket.yaml} + write_swarm_agent_failure_service: + type: "OS::Heat::SoftwareConfig" + properties: + group: ungrouped + config: + str_replace: + template: {get_file: fragments/write-bay-failure-service.yaml} + params: + "$SERVICE": swarm-agent + "$WAIT_HANDLE": {get_resource: node_agent_wait_handle} + write_swarm_agent_service: type: "OS::Heat::SoftwareConfig" properties: @@ -189,17 +200,14 @@ resources: config: str_replace: template: {get_file: fragments/enable-services.sh} - params: &node_services + params: "$NODE_SERVICES": "docker.socket swarm-agent" cfn_signal: type: "OS::Heat::SoftwareConfig" properties: group: ungrouped - config: - str_replace: - template: {get_file: fragments/cfn-signal.sh} - params: *node_services + config: {get_file: fragments/cfn-signal.sh} disable_selinux: type: "OS::Heat::SoftwareConfig" @@ -221,6 +229,7 @@ resources: - config: {get_resource: remove_docker_key} - config: {get_resource: write_heat_params} - config: {get_resource: make_cert} + - config: {get_resource: write_swarm_agent_failure_service} - config: {get_resource: write_swarm_agent_service} - config: {get_resource: write_docker_service} - config: {get_resource: write_docker_socket}