tripleo-heat-templates/deployment/containers-common.yaml
Damien Ciabrini 3230f005c1 HA: reorder init_bundle and restart_bundle for improved updates
A pacemaker bundle can be restarted either because:
  . a tripleo config has been updated (from /var/lib/config-data)
  . the bundle config has been updated (container image, bundle
    parameter,...)

In HA services, the special container "*_restart_bundle" is in charge
of restarting the HA service on tripleo config change. The special
container "*_init_bundle" handles restart on bundle config change.

When both types of change occur at the same time, the bundle must
be restarted first, so that the container has a chance to be
recreated with all bind-mounts updated before it tries to reload
the updated config.

Implement the improvement with two changes:

1. Make the "*_restart_bundle" start after the "*_init_bundle", and
make sure "*_restart_bundle" is only enabled after the initial
deployment.

2. During minor update, make sure that the "*_restart_bundle" not
only restarts the container, but also waits until the service
is operational (e.g. galera fully promoted to Master). This forces
the rolling restart to happen sequentially, and avoids service
disruption in quorum-based clustered services like galera and
rabbitmq.

Tested the following update use cases:

* minor update: ensure that *_restart_bundle restarts all types of
  resources (OCF, bundles, A/P, A/P Master/Slave).

* minor update: ensure *_restart_bundle is not executed when no
  config or image update happened for a service.

* restart_bundle: when resource (OCF or container) fails to
  restart, bail out early instead of waiting for nothing until
  timeout is reached.

* restart_bundle: make sure a resource is restarted even when it
  is in a failed state when *_restart_bundle is called.

* restart_bundle: A/P can be restarted on any node, so watch
  restart globally. When the resource restarts as Slave, continue
  watching for a Master elsewhere in the cluster.

* restart_bundle: if an A/P is not running locally, make sure it
  doesn't get restarted anywhere else in the cluster.

* restart_bundle: do not try to restart a stopped (disabled) or
  unmanaged resource. Bail out early instead, so as not to wait
  until the timeout is reached.

* stack update: make sure that running a stack update with no
  change does not trigger any *_restart_bundle, and does not
  restart any HA container either.

* stack update: when both bundle and config change, ensure the bundle
  is updated before HA containers are restarted (e.g. HAProxy
  migration to TLS everywhere).

Change-Id: Ic41d4597e9033f9d7847bb6c10c25f443fbd5b0e
Closes-Bug: #1839858
2020-01-23 16:09:36 +01:00

188 lines
6.1 KiB
YAML

# HOT format version this template is written against.
heat_template_version: rocky

# Folded scalar: the text below is the template's human-readable summary.
description: >
  Contains a static list of common things necessary for containers
parameters:

  # Required parameters
  EndpointMap:
    type: json
    default: {}
    description: >-
      Mapping of service endpoint -> protocol. Typically set
      via parameter_defaults in the resource registry.

  ServiceData:
    type: json
    default: {}
    description: Dictionary packing service data

  ServiceNetMap:
    type: json
    default: {}
    description: >-
      Mapping of service_name -> network name. Typically set
      via parameter_defaults in the resource registry. This
      mapping overrides those in ServiceNetMapDefaults.

  DefaultPasswords:
    type: json
    default: {}

  RoleName:
    type: string
    default: ''
    description: Role name on which the service is applied

  RoleParameters:
    type: json
    default: {}
    description: Parameters specific to the role

  # Compared against true by the internal_tls_enabled condition.
  EnableInternalTLS:
    type: boolean
    default: false

  # Bind-mounted read-only (same path on both sides) by the volumes_base
  # output when internal_tls_enabled holds.
  InternalTLSCAFile:
    type: string
    default: '/etc/ipa/ca.crt'
    description: >-
      Specifies the default CA cert to use if TLS is used for
      services in the internal network.

  # Substituted for RPCPORT in the healthcheck_rpc_port output.
  RpcPort:
    type: number
    default: 5672
    description: The network port for messaging backend

  # Substituted for __PCMKTIMEOUT__ in pacemaker_restart_bundle.sh and
  # pacemaker_wait_bundle.sh.
  PcmkConfigRestartTimeout:
    type: number
    default: 600
    description: >-
      Time in seconds to wait for a pacemaker resource to restart when
      a config change is detected and the resource is being restarted

  # Compared against 'docker' by the docker_enabled condition.
  ContainerCli:
    type: string
    default: podman
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values:
          - docker
          - podman
conditions:

  # Holds when the EnableInternalTLS parameter equals true; used by the
  # volumes_base output to decide whether the CA file is mounted.
  internal_tls_enabled:
    equals:
      - get_param: EnableInternalTLS
      - true

  # Holds when the ContainerCli parameter equals 'docker'; used by the
  # pacemaker_restart_volumes output to decide whether corosync.conf is
  # mounted.
  docker_enabled:
    equals:
      - get_param: ContainerCli
      - docker
outputs:

  container_config_scripts:
    description: Shared container config scripts
    value:
      # Wrapper around "puppet apply". Positional arguments:
      #   $1 STEP        written as {"step": N} to
      #                  /etc/puppet/hieradata/docker_puppet.json
      #   $2 TAGS        passed to --tags
      #   $3 CONFIG      puppet code run after noop_resource('package')
      #   $4 EXTRA_ARGS  optional, expanded unquoted before the other flags
      # The script exits 0 when puppet returns 0 or 2 and otherwise
      # propagates puppet's exit code.
      container_puppet_apply.sh:
        mode: '0700'
        content: |
          #!/bin/bash
          set -eux
          STEP=$1
          TAGS=$2
          CONFIG=$3
          EXTRA_ARGS=${4:-''}
          if [ -d /tmp/puppet-etc ]; then
            # ignore copy failures as these may be the same file depending on docker mounts
            cp -a /tmp/puppet-etc/* /etc/puppet || true
          fi
          echo "{\"step\": ${STEP}}" > /etc/puppet/hieradata/docker_puppet.json
          # $::deployment_type in puppet-tripleo
          export FACTER_deployment_type=containers
          set +e
          puppet apply $EXTRA_ARGS \
                --verbose \
                --detailed-exitcodes \
                --summarize \
                --color=false \
                --modulepath /etc/puppet/modules:/opt/stack/puppet-modules:/usr/share/openstack-puppet/modules \
                --tags $TAGS \
                -e "noop_resource('package'); ${CONFIG}"
          rc=$?
          set -e
          set +ux
          if [ $rc -eq 2 -o $rc -eq 0 ]; then
              exit 0
          fi
          exit $rc

      # Script body is read verbatim from the referenced file.
      pyshim.sh:
        mode: '0755'
        content:
          get_file: ../container_config_scripts/pyshim.sh

      # Script read from file, with __PCMKTIMEOUT__ replaced by the
      # PcmkConfigRestartTimeout parameter.
      pacemaker_restart_bundle.sh:
        mode: '0755'
        content:
          str_replace:
            template:
              get_file: ../container_config_scripts/pacemaker_restart_bundle.sh
            params:
              __PCMKTIMEOUT__:
                get_param: PcmkConfigRestartTimeout

      # Same __PCMKTIMEOUT__ substitution as pacemaker_restart_bundle.sh.
      pacemaker_wait_bundle.sh:
        mode: '0755'
        content:
          str_replace:
            template:
              get_file: ../container_config_scripts/pacemaker_wait_bundle.sh
            params:
              __PCMKTIMEOUT__:
                get_param: PcmkConfigRestartTimeout

  volumes_base:
    description: Base volume list
    # The anchor captures the whole list_concat mapping so the other
    # volume outputs below can embed it through the *volumes_base alias.
    value: &volumes_base
      list_concat:
        - - /etc/hosts:/etc/hosts:ro
          - /etc/localtime:/etc/localtime:ro
          # OpenSSL trusted CAs
          - /etc/pki/ca-trust/extracted:/etc/pki/ca-trust/extracted:ro
          - /etc/pki/ca-trust/source/anchors:/etc/pki/ca-trust/source/anchors:ro
          - /etc/pki/tls/certs/ca-bundle.crt:/etc/pki/tls/certs/ca-bundle.crt:ro
          - /etc/pki/tls/certs/ca-bundle.trust.crt:/etc/pki/tls/certs/ca-bundle.trust.crt:ro
          - /etc/pki/tls/cert.pem:/etc/pki/tls/cert.pem:ro
          # Syslog socket
          - /dev/log:/dev/log
        # With internal TLS enabled, add a single "<ca>:<ca>:ro" entry built
        # from InternalTLSCAFile; otherwise contribute null.
        - if:
            - internal_tls_enabled
            - - list_join:
                  - ':'
                  - - get_param: InternalTLSCAFile
                    - get_param: InternalTLSCAFile
                    - ro
            - null

  volumes:
    description: Common volumes for the containers.
    value:
      list_concat:
        - *volumes_base
        - - /etc/ssh/ssh_known_hosts:/etc/ssh/ssh_known_hosts:ro
          # required for bootstrap_host_exec
          - /etc/puppet:/etc/puppet:ro

  pacemaker_restart_volumes:
    description: Common volumes for the pacemaker restart containers.
    value:
      list_concat:
        - *volumes_base
        - - /var/lib/container-config-scripts/pacemaker_restart_bundle.sh:/pacemaker_restart_bundle.sh:ro
          - /var/lib/container-config-scripts/pacemaker_wait_bundle.sh:/pacemaker_wait_bundle.sh:ro
          - /dev/shm:/dev/shm:rw
          # required for bootstrap_host_exec, facter
          - /etc/puppet:/etc/puppet:ro
        # corosync.conf is only mounted when ContainerCli is 'docker'.
        - if:
            - docker_enabled
            - - /etc/corosync/corosync.conf:/etc/corosync/corosync.conf:ro
            - null

  container_puppet_apply_volumes:
    description: Common volumes needed to run the container_puppet_apply.sh from container_config_scripts
    value:
      list_concat:
        - *volumes_base
        - - /var/lib/container-config-scripts/container_puppet_apply.sh:/container_puppet_apply.sh:ro
          # container_puppet_apply.sh will copy this to /etc/puppet in the container
          - /etc/puppet:/tmp/puppet-etc:ro
          - /usr/share/openstack-puppet/modules:/usr/share/openstack-puppet/modules:ro

  healthcheck_rpc_port:
    description: healthcheck command that probes the RpcPort
    value:
      # RPCPORT in the template string is replaced by the RpcPort parameter.
      test:
        str_replace:
          template: '/openstack/healthcheck RPCPORT'
          params:
            RPCPORT:
              get_param: RpcPort