heat_template_version: wallaby

description: >
  Pacemaker service configured with Puppet
parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. Use
                 parameter_merge_strategies to merge it with the defaults.
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 to a smaller value when you need to replace a controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
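  # A minimal sketch (illustrative values only) of how FencingConfig might be
  # supplied through an environment file's parameter_defaults:
  #
  #   parameter_defaults:
  #     EnableFencing: true
  #     FencingConfig:
  #       devices:
  #         - agent: fence_xvm
  #           host_mac: "52:54:00:aa:bb:cc"
  #           params:
  #             multicast_address: 225.0.0.12
  #             port: baremetal_0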
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/pacemaker/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['podman']
  EnableInstanceHA:
    default: false
    description: Whether to enable an Instance HA configuration or not.
                 This setup requires the Compute role to have the
                 PacemakerRemote service added to it.
    type: boolean
  PacemakerBundleOperationTimeout:
    type: string
    default: '120s'
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, expressed in
                 seconds. Pacemaker's default operation timeout is 20s,
                 which can be reached under high disk IO, so it is set
                 to a safer value of 120s here.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)"
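    # The pattern above accepts a positive number of seconds with a trailing
    # 's', e.g. '30s' or '300s'; bare numbers and values starting with 0 are
    # rejected.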
  PacemakerProperties:
    type: json
    description: |
      Pacemaker properties that are set at cluster creation
      step. The JSON should have the following structure:
        {
          'stonith-timeout': {
            property: 'stonith-timeout'
            value: '120s'
          }
        }
    default: {}
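  # Similarly, PacemakerProperties could be overridden from an environment
  # file (illustrative values only):
  #
  #   parameter_defaults:
  #     PacemakerProperties:
  #       stonith-timeout:
  #         property: stonith-timeout
  #         value: 120s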
conditions:
  pcmk_tls_priorities_set:
    not: {equals: [{get_param: PacemakerTLSPriorities}, '']}
outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
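      # For reference, the ports opened below commonly map to: 2224/tcp (pcsd),
      # 3121/tcp (pacemaker_remote), 21064/tcp (dlm) and 5405/udp (corosync).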
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        pacemaker::corosync::cluster_name: 'tripleo_cluster'
        pacemaker::corosync::manage_fw: false
        pacemaker::resource_defaults::defaults:
          resource-stickiness: { value: INFINITY }
        corosync_token_timeout: 10000
        pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
        pacemaker::resource::bundle::deep_compare: true
        pacemaker::resource::ip::deep_compare: true
        pacemaker::resource::ocf::deep_compare: true
        tripleo::fencing::config: {get_param: FencingConfig}
        tripleo::fencing::deep_compare: true
        enable_fencing: {get_param: EnableFencing}
        hacluster_pwd: {get_param: PcsdPassword}
        tripleo::profile::base::pacemaker::cluster_properties: {get_param: PacemakerProperties}
        tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
        tripleo::profile::base::pacemaker::pcsd_bind_addr:
          str_replace:
            template:
              "%{lookup('$NETWORK')}"
            params:
              $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
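        # For example, if ServiceNetMap resolves PacemakerNetwork to
        # 'internal_api' (a typical default), the template above becomes
        # "%{lookup('internal_api')}", so pcsd binds to this node's IP on that
        # network as looked up from hiera at Puppet run time.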
        tripleo::pacemaker::tls_priorities:
          if:
            - pcmk_tls_priorities_set
            - {get_param: PacemakerTLSPriorities}
        tripleo::profile::base::pacemaker::resource_op_defaults:
          bundle:
            name: timeout
            value: {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include tripleo::profile::base::pacemaker
      host_prep_tasks:
        # Need this until https://bugzilla.redhat.com/show_bug.cgi?id=1857247 is fixed
        - name: Make sure python3-novaclient is installed when IHA is enabled
          package:
            name: python3-novaclient
            state: present
          when: {get_param: EnableInstanceHA}
      upgrade_tasks:
        # Since Wallaby, Redis is not deployed by default anymore
        - name: pre-check to ensure redis is removed if the service is disabled
          when:
            - step|int == 0
            - '"redis" not in enabled_services|list'
          tags:
            - never
            - system_upgrade
            - system_upgrade_prepare
          become: true
          # prior to OS upgrade, don't assume the cluster is running,
          # probe the CIB file directly instead
          shell: |
            export CIB_file=/var/lib/pacemaker/cib/cib.xml
            if crm_resource -r redis-bundle -q &>/dev/null; then
              echo "Redis resource present in pacemaker but disabled by default in TripleO" >&2
              echo "Delete the resource in pacemaker or enable the redis service before upgrading" >&2
              exit 1
            fi
        - name: upgrade step 0
          when: step|int == 0
          block:
            # If performing an upgrade which requires upgrading the operating
            # system, a transfer data step needs to be run. During this step,
            # the whole pacemaker cluster is stopped so we can't check the
            # cluster status. Once the transfer_data step is executed, a flag
            # file is stored. This code checks the existence of this file to
            # know whether we should avoid doing a normal pacemaker upgrade.
            # As part of the operating system upgrade, a new cluster will be
            # created to which the other nodes will be added.
            - name: check flag file existence in destination host
              stat:
                path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
              register: tripleo_transfer_flag_stat
              become: true
              delegate_to: "{{ mysql_short_bootstrap_node_name }}"
            - name: Set fact cluster_recreate
              set_fact:
                cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              # NOTE: We are intentionally not using the community version of
              # pacemaker_cluster here due to variances between the two:
              # https://bugs.launchpad.net/tripleo/+bug/1938967
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not cluster_recreate|bool
        - name: Create hiera data to upgrade pacemaker in a stepwise manner.
          when:
            - step|int == 1
            - cluster_recreate|bool
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: false
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: false
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(':')
              loop: "{{ pacemaker_short_node_names | default([]) }}"
            - fail:
                msg: >
                  You can't upgrade pacemaker without a staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                #  pacemaker_node_ips: [ "1", "2", "3" ]
                #  pacemaker_short_node_names: [ "a", "b", "c" ]
                #  pacemaker_short_node_names_override: [ "b" ]
                # it will set:
                #  pacemaker_node_ips_override: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: false
            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not cluster_recreate|bool
        - name: upgrade step 4
          when: step|int == 4
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not cluster_recreate|bool
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0 # TODO(marios) disabling validations?
          # NOTE: We are intentionally not using the community version of
          # pacemaker_cluster here due to variances between the two:
          # https://bugs.launchpad.net/tripleo/+bug/1938967
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
        - name: Move virtual IPs to another node before stopping pacemaker
          when:
            - step|int == 1
            - hostvars[inventory_hostname]["haproxy_node_names"]|default([])|length > 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
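            # The XPath below selects the IDs of all started, managed
            # ocf:heartbeat:IPaddr2 resources (i.e. the VIPs) currently hosted
            # on this cluster node.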
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v to another node"
              pcs resource ban $v ${CLUSTER_NODE} --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" >&2
              fi
            done
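        # The shutdown lock below (pacemaker_mutex_shutdown.sh, shipped with the
        # container config scripts) is presumably used to serialize cluster
        # shutdowns across controllers, so only one node stops its cluster
        # services at a time during the minor update.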
        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
          when: step|int == 1
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online
        - name: Release the cluster shutdown lock
          when: step|int == 4
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release