Pacemaker maintenance mode for the duration of Puppet run on update

This enables pacemaker maintenance mode when running Puppet on stack
update. Puppet can try to restart some overcloud services, which
pacemaker tries to prevent, and this can result in a failed Puppet run.

At the end of the puppet run, certain pacemaker resources are restarted
in an additional SoftwareDeployment to make sure that any config changes
have been fully applied. This is only done on stack updates (when
UpdateIdentifier is set to something), because the assumption is that on
stack create services already come up with the correct config.

(Change I9556085424fa3008d7f596578b58e7c33a336f75 has been squashed into
this one.)

Change-Id: I4d40358c511fc1f95b78a859e943082aaea17899
Co-Authored-By: Jiri Stransky <jistr@redhat.com>
Co-Authored-By: James Slagle <jslagle@redhat.com>
This commit is contained in:
Steven Hardy 2015-11-13 11:18:50 +00:00 committed by Jiri Stransky
parent 5a6907f2b8
commit ea1294fe9b
9 changed files with 186 additions and 1 deletions

View File

@ -2,3 +2,5 @@
# Overcloud controller with Pacemaker.
resource_registry:
OS::TripleO::ControllerConfig: ../puppet/controller-config-pacemaker.yaml
# Point the controller pre/post Puppet task hooks at the Pacemaker-specific
# task templates.
OS::TripleO::Tasks::ControllerPrePuppet: ../extraconfig/tasks/pre_puppet_pacemaker.yaml
OS::TripleO::Tasks::ControllerPostPuppet: ../extraconfig/tasks/post_puppet_pacemaker.yaml

View File

@ -0,0 +1,10 @@
heat_template_version: 2014-10-16

description: No-op task

# This template declares parameters only; it defines no resources, so
# instantiating it performs no work on the servers.
parameters:
  # Accepted but not used by this template.
  servers:
    type: json
  # Defaults to an empty map so callers may omit it.
  input_values:
    type: json
    default: {}
    description: input values for the software deployments

View File

@ -0,0 +1,63 @@
#!/bin/bash

# Abort on the first failing command (-e), treat unset variables as errors
# (-u) and trace every command (-x).
set -eux

# `systemctl is-active` exits non-zero whenever the unit is not active.
# Under `set -e` a bare assignment from it would abort the whole script
# before the "is pacemaker running?" guard further down is evaluated, so
# tolerate the non-zero status and keep only the printed state.
pacemaker_status=$(systemctl is-active pacemaker || :)

# Seconds to sleep between two polls of the cluster status.
check_interval=3
# Wait until a pacemaker resource has reached the requested state.
#
# Arguments:
#   $1  resource name to look for in `pcs status --full`
#   $2  target state; "stopped" waits for "Started" to disappear, any other
#       value waits for "Stopped" to disappear
#   $3  timeout in seconds
#
# Returns once no matching status line shows the opposite state. If the
# timeout elapses first, a message is written to stdout and stderr and the
# whole script exits with status 1.
function check_resource {
    service=$1
    state=$2
    timeout=$3
    tstart=$(date +%s)
    tend=$(( tstart + timeout ))

    # Pick the status word whose presence means "not done yet".
    case "$state" in
        stopped) match_for_incomplete='Started' ;;
        *)       match_for_incomplete='Stopped' ;;
    esac

    while (( $(date +%s) < tend )); do
        # Status lines mentioning the resource, minus any line containing
        # "Clone".
        node_states=$(pcs status --full | grep "$service" | grep -v Clone)
        if ! echo "$node_states" | grep -q "$match_for_incomplete"; then
            echo "$service has $state"
            return
        fi
        echo "$service not yet $state, sleeping $check_interval seconds."
        sleep $check_interval
    done

    echo "$service never $state after $timeout seconds" | tee /dev/fd/2
    exit 1
}
# Run if pacemaker is running, we're the bootstrap node,
# and we're updating the deployment (not creating).
#
# The three conditions are chained with && instead of the obsolescent
# `-a` operator of `[`, which can be parsed ambiguously depending on the
# operand values.
if [ "$pacemaker_status" = "active" ] && \
   [ "$(hiera bootstrap_nodeid)" = "$(facter hostname)" ] && \
   [ "$(hiera update_identifier)" != "nil" ]; then

    # Stop httpd first, then keystone, waiting for each to be reported
    # stopped (timeouts are in seconds).
    pcs resource disable httpd
    check_resource httpd stopped 300
    pcs resource disable openstack-keystone
    check_resource openstack-keystone stopped 1200

    # haproxy-clone is only restarted when it shows up in the cluster
    # status; -q keeps the matched status lines out of the script output.
    if pcs status | grep -q haproxy-clone; then
        pcs resource restart haproxy-clone
    fi
    pcs resource restart redis-master
    pcs resource restart mongod-clone
    pcs resource restart rabbitmq-clone
    pcs resource restart memcached-clone
    pcs resource restart galera-master

    # Bring the services back in the reverse order of the shutdown above.
    pcs resource enable openstack-keystone
    check_resource openstack-keystone started 300
    pcs resource enable httpd
    check_resource httpd started 800
fi

View File

@ -0,0 +1,44 @@
heat_template_version: 2014-10-16

description: Post-Puppet Config for Pacemaker deployments

parameters:
  servers:
    type: json
  input_values:
    type: json
    description: input values for the software deployments

resources:

  # Script that switches the cluster's maintenance-mode property off,
  # guarded so it only calls pcs where the pacemaker unit is active.
  ControllerPostPuppetMaintenanceModeConfig:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
      config: |
        #!/bin/bash
        pacemaker_status=$(systemctl is-active pacemaker)
        if [ "$pacemaker_status" = "active" ]; then
            pcs property set maintenance-mode=false
        fi

  # Applies the maintenance-mode script to the given servers.
  ControllerPostPuppetMaintenanceModeDeployment:
    type: OS::Heat::SoftwareDeployments
    properties:
      servers:
        get_param: servers
      config:
        get_resource: ControllerPostPuppetMaintenanceModeConfig
      input_values:
        get_param: input_values

  # Wraps the external restart script as a software config.
  ControllerPostPuppetRestartConfig:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
      config:
        get_file: pacemaker_resource_restart.sh

  # Runs the restart script only after the maintenance-mode deployment
  # above has been created.
  ControllerPostPuppetRestartDeployment:
    type: OS::Heat::SoftwareDeployments
    depends_on: ControllerPostPuppetMaintenanceModeDeployment
    properties:
      servers:
        get_param: servers
      config:
        get_resource: ControllerPostPuppetRestartConfig
      input_values:
        get_param: input_values

View File

@ -0,0 +1,30 @@
heat_template_version: 2014-10-16

description: Pre-Puppet Config for Pacemaker deployments

parameters:
  servers:
    type: json
  input_values:
    type: json
    description: input values for the software deployments

resources:

  # Script that switches the cluster's maintenance-mode property on,
  # guarded so it only calls pcs where the pacemaker unit is active.
  ControllerPrePuppetMaintenanceModeConfig:
    type: OS::Heat::SoftwareConfig
    properties:
      group: script
      config: |
        #!/bin/bash
        pacemaker_status=$(systemctl is-active pacemaker)
        if [ "$pacemaker_status" = "active" ]; then
            pcs property set maintenance-mode=true
        fi

  # Applies the maintenance-mode script to the given servers.
  ControllerPrePuppetMaintenanceModeDeployment:
    type: OS::Heat::SoftwareDeployments
    properties:
      servers:
        get_param: servers
      config:
        get_resource: ControllerPrePuppetMaintenanceModeConfig
      input_values:
        get_param: input_values

View File

@ -21,7 +21,11 @@ resource_registry:
OS::TripleO::CephClusterConfig::SoftwareConfig: puppet/ceph-cluster-config.yaml
OS::TripleO::AllNodes::SoftwareConfig: puppet/all-nodes-config.yaml
OS::TripleO::BootstrapNode::SoftwareConfig: puppet/bootstrap-config.yaml
# Tasks (for internal TripleO usage)
OS::TripleO::Tasks::PackageUpdate: extraconfig/tasks/yum_update.yaml
OS::TripleO::Tasks::ControllerPrePuppet: extraconfig/tasks/noop.yaml
OS::TripleO::Tasks::ControllerPostPuppet: extraconfig/tasks/noop.yaml
# This creates the "heat-admin" user for all OS images by default
# To disable, replace with firstboot/userdata_default.yaml

View File

@ -1127,6 +1127,8 @@ resources:
neutron_api_node_ips: {get_attr: [ControllerIpListMap, net_ip_map, {get_param: [ServiceNetMap, NeutronApiNetwork]}]}
keystone_public_api_node_ips: {get_attr: [ControllerIpListMap, net_ip_map, {get_param: [ServiceNetMap, KeystonePublicApiNetwork]}]}
keystone_admin_api_node_ips: {get_attr: [ControllerIpListMap, net_ip_map, {get_param: [ServiceNetMap, KeystoneAdminApiNetwork]}]}
DeployIdentifier: {get_param: DeployIdentifier}
UpdateIdentifier: {get_param: UpdateIdentifier}
MysqlRootPassword:
type: OS::Heat::RandomString

View File

@ -51,6 +51,17 @@ parameters:
keystone_admin_api_node_ips:
type: comma_delimited_list
DeployIdentifier:
type: string
description: >
Setting this to a unique value will re-run any deployment tasks which
perform configuration on a Heat stack-update.
UpdateIdentifier:
type: string
description: >
Setting to a previously unused value during stack-update will trigger
package update on all nodes
resources:
allNodesConfigImpl:
@ -240,6 +251,9 @@ resources:
nova::rabbit_hosts: *rabbit_nodes_array
keystone::rabbit_hosts: *rabbit_nodes_array
deploy_identifier: {get_param: DeployIdentifier}
update_identifier: {get_param: UpdateIdentifier}
outputs:
config_id:
description: The ID of the allNodesConfigImpl resource.

View File

@ -17,6 +17,13 @@ parameters:
resources:
# Pre-Puppet task hook; the concrete template is whatever the resource
# registry maps OS::TripleO::Tasks::ControllerPrePuppet to.
ControllerPrePuppet:
type: OS::TripleO::Tasks::ControllerPrePuppet
properties:
servers: {get_param: servers}
input_values:
# Passed through from this template's NodeConfigIdentifiers parameter.
update_identifier: {get_param: NodeConfigIdentifiers}
ControllerPuppetConfig:
type: OS::TripleO::ControllerConfig
@ -26,6 +33,7 @@ resources:
# e.g all Deployment resources should have a *Deployment_StepN suffix
ControllerLoadBalancerDeployment_Step1:
type: OS::Heat::StructuredDeployments
depends_on: ControllerPrePuppet
properties:
servers: {get_param: servers}
config: {get_resource: ControllerPuppetConfig}
@ -98,10 +106,18 @@ resources:
step: 5
update_identifier: {get_param: NodeConfigIdentifiers}
# Post-Puppet task hook; the concrete template is whatever the resource
# registry maps OS::TripleO::Tasks::ControllerPostPuppet to. It is created
# only after ControllerOvercloudServicesDeployment_Step6.
ControllerPostPuppet:
type: OS::TripleO::Tasks::ControllerPostPuppet
depends_on: ControllerOvercloudServicesDeployment_Step6
properties:
servers: {get_param: servers}
input_values:
# Passed through from this template's NodeConfigIdentifiers parameter.
update_identifier: {get_param: NodeConfigIdentifiers}
# Note, this should come last, so use depends_on to ensure
# this is created after any other resources.
ExtraConfig:
# A mapping may carry only one depends_on key (most parsers silently keep
# the last duplicate), so only the dependency on the post-Puppet task is
# kept here.
depends_on: ControllerPostPuppet
type: OS::TripleO::NodeExtraConfigPost
properties:
servers: {get_param: servers}