HA: fix <service>_restart_bundle with minor update workflow

For each HA service we have a paunch container <service>_restart_bundle
which is started by paunch whenever config files change during stack
deploy/update. This container runs a pcs command on a single node to
restart all the service's containers (e.g. all galera on all controllers).
By design, when it is run, configs have already been regenerated by the
deploy tasks on all nodes.

For minor updates, the workflow runs differently: all the steps of the
deploy tasks are run one node after the other, so when
<service>_restart_bundle is called, there is no guarantee that the
service's configs have been regenerated on all the nodes yet.

To fix the wrong restart behaviour, only restart local containers when
running during a minor update, and run the restart once per node. When the
minor update workflow calls <service>_restart_bundle, we still have the
guarantee that the config files are already regenerated locally.

Co-Authored-By: Michele Baldessari <michele@acksyn.org>
Co-Authored-By: Luca Miccini <lmiccini@redhat.com>

Change-Id: I92d4ddf2feeac06ce14468ae928c283f3fd04f45
Closes-Bug: #1841629
This commit is contained in:
Damien Ciabrini 2019-08-28 18:25:43 +02:00
parent 759df5a6be
commit 7f785e8757
12 changed files with 93 additions and 200 deletions

View File

@ -0,0 +1,40 @@
#!/bin/bash
set -u

# Restart a pacemaker-managed bundle so that it picks up regenerated config.
#
# Usage:   ./pacemaker_restart_bundle.sh <resource> <tripleo_service>
# Example: ./pacemaker_restart_bundle.sh galera-bundle mysql
#
# Arguments:
#   $1  name of the pacemaker resource to restart
#   $2  service name passed to bootstrap_host_exec for the global restart
#
# Environment:
#   TRIPLEO_MINOR_UPDATE  "true" (case-insensitive) selects the per-node
#                         restart; any other value, or an unset variable,
#                         selects the global restart.
#
# The value given to --wait below is a template placeholder; it is expected
# to be substituted with the restart timeout before this script is run.

# Fail early with a usage message when an argument is missing or empty.
RESOURCE=${1:?"usage: $0 <resource> <tripleo_service>"}
TRIPLEO_SERVICE=${2:?"usage: $0 <resource> <tripleo_service>"}

# Default to an empty string so that "set -u" does not abort the script when
# the variable is not exported; an unset value means "not a minor update".
MINOR_UPDATE=${TRIPLEO_MINOR_UPDATE:-}

# try to restart only if resource has been created already
if /usr/sbin/pcs resource show "$RESOURCE"; then
    if [ x"${MINOR_UPDATE,,}" != x"true" ]; then
        # During a stack update, this script is called in parallel on
        # every node the resource runs on, after the service's configs
        # have been updated on all nodes. So we need to run pcs only
        # once (e.g. on the service's bootstrap node).
        echo "$(date -u): Restarting ${RESOURCE} globally"
        /usr/bin/bootstrap_host_exec "$TRIPLEO_SERVICE" /sbin/pcs resource restart --wait=__PCMKTIMEOUT__ "$RESOURCE"
    else
        # During a minor update workflow however, a host gets fully
        # updated before updating the next one. So unlike stack
        # update, at the time this script is called, the service's
        # configs aren't updated on all nodes yet. So only restart the
        # resource locally, where it's guaranteed that the config is
        # up to date.
        HOST=$(facter hostname)
        # XPath rationale: as long as there is a bundle running
        # locally and it is managed by pacemaker, no matter the state
        # of any inner pcmk_remote or ocf resource, we should restart
        # it to give it a chance to read the new config.
        # XPath rationale 2: if the resource is being stopped, the
        # attribute "target_role" will be present in the output of
        # crm_mon. Do not restart the resource if that is the case.
        # The pipeline's status is xmllint's, which is non-zero when the
        # XPath expression selects nothing.
        if crm_mon -r --as-xml | xmllint --format --xpath "//bundle[@id='${RESOURCE}']/replica/resource[@managed='true' and (not(boolean(@target_role)) or (boolean(@target_role) and @target_role!='Stopped'))]/node[@name='${HOST}']/../.." - &>/dev/null; then
            echo "$(date -u): Restarting ${RESOURCE} locally on '${HOST}'"
            /sbin/pcs resource restart --wait=__PCMKTIMEOUT__ "$RESOURCE" "${HOST}"
        else
            echo "$(date -u): Resource ${RESOURCE} currently not running on '${HOST}', no restart needed"
        fi
    fi
fi

View File

@ -52,11 +52,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -187,24 +182,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'cinder_backup'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show openstack-cinder-backup; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT openstack-cinder-backup; echo "openstack-cinder-backup restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh openstack-cinder-backup cinder_backup
image: {get_param: ContainerCinderBackupImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/cinder/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/cinder/:/var/lib/kolla/config_files/src:ro
cinder_backup_init_bundle:
start_order: 1
detach: false

View File

@ -40,11 +40,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -172,24 +167,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'cinder_volume'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show openstack-cinder-volume; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT openstack-cinder-volume; echo "openstack-cinder-volume restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh openstack-cinder-volume cinder_volume
image: {get_param: ContainerCinderVolumeImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/cinder/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/cinder/:/var/lib/kolla/config_files/src:ro
cinder_volume_init_bundle:
start_order: 1
detach: false

View File

@ -47,6 +47,12 @@ parameters:
description: The network port for messaging backend
type: number
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pacemaker resource to restart when
a config change is detected and the resource is being restarted
type: number
conditions:
internal_tls_enabled: {equals: [{get_param: EnableInternalTLS}, true]}
@ -90,6 +96,13 @@ outputs:
pyshim.sh:
mode: "0755"
content: { get_file: ../container_config_scripts/pyshim.sh }
pacemaker_restart_bundle.sh:
mode: "0755"
content:
str_replace:
template: { get_file: ../container_config_scripts/pacemaker_restart_bundle.sh }
params:
__PCMKTIMEOUT__: {get_param: PcmkConfigRestartTimeout}
volumes_base:
description: Base volume list
@ -123,6 +136,17 @@ outputs:
# required for bootstrap_host_exec
- /etc/puppet:/etc/puppet:ro
pacemaker_restart_volumes:
description: Common volumes for the pacemaker restart containers.
value:
list_concat:
- *volumes_base
- - /var/lib/container-config-scripts/pacemaker_restart_bundle.sh:/pacemaker_restart_bundle.sh:ro
- /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- /dev/shm:/dev/shm:rw
# required for bootstrap_host_exec, facter
- /etc/puppet:/etc/puppet:ro
container_puppet_apply_volumes:
description: Common volumes needed to run the container_puppet_apply.sh from container_config_scripts
value:

View File

@ -67,11 +67,6 @@ parameters:
description: >
Setting this to a unique value will re-run any deployment tasks which
perform configuration on a Heat stack-update.
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -271,24 +266,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'mysql'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show galera-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT galera-bundle; echo "galera-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh galera-bundle mysql
image: {get_param: ContainerMysqlImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/mysql/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/mysql/:/var/lib/kolla/config_files/src:ro
mysql_init_bundle:
start_order: 1
detach: false

View File

@ -47,11 +47,6 @@ parameters:
default: false
description: Enable IPv6 in Redis
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -219,24 +214,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'redis'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show redis-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT redis-bundle; echo "redis-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh redis-bundle redis
image: {get_param: ContainerRedisConfigImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/redis/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/redis/:/var/lib/kolla/config_files/src:ro
- redis_init_bundle:
start_order: 2
detach: false

View File

@ -82,11 +82,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -238,24 +233,12 @@ outputs:
config_volume: haproxy
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'haproxy'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show haproxy-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT haproxy-bundle; echo "haproxy-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh haproxy-bundle haproxy
image: {get_param: ContainerHAProxyImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/haproxy/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/haproxy/:/var/lib/kolla/config_files/src:ro
haproxy_init_bundle:
start_order: 3
detach: false

View File

@ -40,11 +40,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -153,24 +148,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'manila_share'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show openstack-manila-share; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT openstack-manila-share; echo "openstack-manila-share restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh openstack-manila-share manila_share
image: {get_param: ContainerManilaShareImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/manila/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/manila/:/var/lib/kolla/config_files/src:ro
manila_share_init_bundle:
start_order: 1
detach: false

View File

@ -48,11 +48,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -146,22 +141,9 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'ovn_dbs'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show ovn-dbs-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT ovn-dbs-bundle; echo "ovn-dbs-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh ovn-dbs-bundle ovn_dbs
image: {get_param: ContainerOvnDbsConfigImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
volumes: {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
ovn_dbs_init_bundle:
start_order: 1
detach: false

View File

@ -44,11 +44,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -194,24 +189,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'oslo_messaging_notify'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show rabbitmq-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT rabbitmq-bundle; echo "rabbitmq-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh rabbitmq-bundle oslo_messaging_notify
image: {get_param: ContainerRabbitmqImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
rabbitmq_init_bundle:
start_order: 1
detach: false

View File

@ -44,11 +44,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -194,24 +189,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'rabbitmq'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show rabbitmq-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT rabbitmq-bundle; echo "rabbitmq-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh rabbitmq-bundle rabbitmq
image: {get_param: ContainerRabbitmqImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
rabbitmq_init_bundle:
start_order: 1
detach: false

View File

@ -44,11 +44,6 @@ parameters:
default: false
description: Whether to run config management (e.g. Puppet) in debug mode.
type: boolean
PcmkConfigRestartTimeout:
default: 600
description: Time in seconds to wait for a pcmk resource to restart when
a config change is detected and the resource is being restarted
type: number
ContainerCli:
type: string
default: 'podman'
@ -194,24 +189,12 @@ outputs:
user: root
environment:
- TRIPLEO_MINOR_UPDATE
command:
- '/usr/bin/bootstrap_host_exec'
- 'oslo_messaging_rpc'
- str_replace:
template:
'if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ] && /usr/sbin/pcs resource show rabbitmq-bundle; then /usr/sbin/pcs resource restart --wait=PCMKTIMEOUT rabbitmq-bundle; echo "rabbitmq-bundle restart invoked"; fi'
params:
PCMKTIMEOUT: {get_param: PcmkConfigRestartTimeout}
command: /pacemaker_restart_bundle.sh rabbitmq-bundle oslo_messaging_rpc
image: {get_param: ContainerRabbitmqImage}
volumes:
list_concat:
- {get_attr: [ContainersCommon, volumes]}
-
- /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
- if:
- docker_enabled
- - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
- null
- {get_attr: [ContainersCommon, pacemaker_restart_volumes]}
- - /var/lib/config-data/puppet-generated/rabbitmq/:/var/lib/kolla/config_files/src:ro
rabbitmq_init_bundle:
start_order: 1
detach: false