# tripleo-heat-templates/deployment/pacemaker/pacemaker-baremetal-puppet....
heat_template_version: wallaby

description: >
  Pacemaker service configured with Puppet

parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. Use
                 parameter_merge_strategies to merge it with the defaults.
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 it with a smaller value when you need to replace a
                 controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/pacemaker/pacemaker.log
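      # startmsg.regex marks where a new (possibly multi-line) log record
      # starts: it matches the leading syslog-style timestamp on the next
      # line's pattern, e.g. "Aug 25 14:03:21".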
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  EnableInstanceHA:
    default: false
    description: Whether to enable an Instance HA configuration or not.
                 This setup requires the Compute role to have the
                 PacemakerRemote service added to it.
    type: boolean
  PacemakerBundleOperationTimeout:
    type: string
    default: '120s'
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, in seconds.
                 Pacemaker's default operation timeout is 20s, which
                 can be reached under high disk IO, so it is set here
                 to a safer value of 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)"
  PacemakerProperties:
    type: json
    description: |
      Pacemaker properties that are set at the cluster creation
      step. The JSON should have the following structure:
        {
          'stonith-timeout': {
            property: 'stonith-timeout',
            value: '120s'
          }
        }
    default: {}

conditions:
  pcmk_tls_priorities_set:
    not: {equals: [{get_param: PacemakerTLSPriorities}, '']}
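
# A minimal environment-file sketch (illustrative values, not defaults)
# showing how the parameters above are typically overridden:
#
#   parameter_defaults:
#     EnableFencing: true
#     CorosyncSettleTries: 10
#     PacemakerTLSPriorities: 'NORMAL:-VERS-TLS1.0:-VERS-TLS1.1'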

outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
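      # Standard pacemaker/corosync cluster ports: 2224/tcp (pcsd),
      # 3121/tcp (pacemaker_remoted), 21064/tcp (dlm), 5405/udp (corosync).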
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        pacemaker::corosync::cluster_name: 'tripleo_cluster'
        pacemaker::corosync::manage_fw: false
        pacemaker::resource_defaults::defaults:
          resource-stickiness: { value: INFINITY }
        corosync_token_timeout: 10000
        pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
        pacemaker::resource::bundle::deep_compare: true
        pacemaker::resource::ip::deep_compare: true
        pacemaker::resource::ocf::deep_compare: true
        tripleo::fencing::config: {get_param: FencingConfig}
        tripleo::fencing::deep_compare: true
        enable_fencing: {get_param: EnableFencing}
        hacluster_pwd: {get_param: PcsdPassword}
        tripleo::profile::base::pacemaker::cluster_properties: {get_param: PacemakerProperties}
        tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
        tripleo::profile::base::pacemaker::pcsd_bind_addr:
          str_replace:
            template:
              "%{hiera('$NETWORK')}"
            params:
              $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
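        # The str_replace above produces a literal hiera interpolation such
        # as "%{hiera('internal_api')}", so pcsd ends up bound to this node's
        # IP on whichever network PacemakerNetwork maps to at Puppet run time.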
        tripleo::pacemaker::tls_priorities:
          if:
            - pcmk_tls_priorities_set
            - {get_param: PacemakerTLSPriorities}
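        # With the two-argument form of "if" (available with the wallaby
        # template version), no value is produced when the condition is
        # false, so the hiera key is left unset rather than set to null.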
        tripleo::profile::base::pacemaker::resource_op_defaults:
          bundle:
            name: timeout
            value: {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include tripleo::profile::base::pacemaker
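      # step_config is applied as a Puppet manifest on the host during the
      # deployment steps; the included profile consumes the hiera data
      # generated from config_settings above.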
      host_prep_tasks:
        # Need this until https://bugzilla.redhat.com/show_bug.cgi?id=1857247 is fixed
        - name: Make sure python3-novaclient is installed when IHA is enabled
          package:
            name: python3-novaclient
            state: present
          when: {get_param: EnableInstanceHA}
      upgrade_tasks:
        # Since Wallaby, Redis is not deployed by default anymore.
        - name: pre-check to ensure redis is removed if the service is disabled
          when:
            - step|int == 0
            - '"redis" not in enabled_services|list'
          tags:
            - never
            - system_upgrade
            - system_upgrade_prepare
          become: true
          # Prior to the OS upgrade, don't assume the cluster is running;
          # probe the CIB file directly instead.
          shell: |
            export CIB_file=/var/lib/pacemaker/cib/cib.xml
            if crm_resource -r redis-bundle -q &>/dev/null; then
              echo "Redis resource present in pacemaker but disabled by default in TripleO" >&2
              echo "Delete the resource in pacemaker or enable the redis service before upgrading" >&2
              exit 1
            fi
        - name: upgrade step 0
          when: step|int == 0
          block:
            # If performing an upgrade that requires upgrading the operating
            # system, a data-transfer step needs to be run. During this step
            # the whole pacemaker cluster is stopped, so we can't check the
            # cluster status. Once the transfer_data step has been executed,
            # a flag file is stored. This code checks for that file to decide
            # whether the normal pacemaker upgrade should be skipped: with an
            # operating system upgrade, a new cluster is created and the
            # other nodes are then added to it.
            - name: check flag file existence in destination host
              stat:
                path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
              register: tripleo_transfer_flag_stat
              become: true
              delegate_to: "{{ mysql_short_bootstrap_node_name }}"
            - name: Set fact cluster_recreate
              set_fact:
                cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              # NOTE: We are intentionally not using the community version of
              # pacemaker_cluster here due to variances between the two:
              # https://bugs.launchpad.net/tripleo/+bug/1938967
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not cluster_recreate|bool
        - name: Create hiera data to upgrade pacemaker in a stepwise manner.
          when:
            - step|int == 1
            - cluster_recreate|bool
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: false
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: false
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(':')
              loop: "{{ pacemaker_short_node_names | default([]) }}"
            - fail:
                msg: >
                  You can't upgrade pacemaker without a staged
                  upgrade. You need to use the --limit option in
                  order to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate the IPs matching the upgraded names, e.g. for
                # these variable values:
                #   pacemaker_node_ips: [ "1", "2", "3" ]
                #   pacemaker_short_node_names: [ "a", "b", "c" ]
                #   pacemaker_short_node_names_upgraded: [ "b" ]
                # it will set:
                #   pacemaker_node_ips_upgraded: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: false
            - name: add the pacemaker short names to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{ pacemaker_short_node_names_upgraded }}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{ pacemaker_node_ips_upgraded }}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{ item }}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not cluster_recreate|bool
        - name: upgrade step 4
          when: step|int == 4
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not cluster_recreate|bool
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0  # TODO(marios) disabling validations?
          # NOTE: We are intentionally not using the community version of
          # pacemaker_cluster here due to variances between the two:
          # https://bugs.launchpad.net/tripleo/+bug/1938967
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
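        # Before this node's cluster services are stopped, every VIP hosted
        # here is banned so that it fails over to another node and client
        # traffic keeps flowing; the temporary ban constraints are removed
        # right afterwards.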
        - name: Move virtual IPs to another node before stopping pacemaker
          when:
            - step|int == 1
            - hostvars[inventory_hostname]["haproxy_node_names"]|default([])|length > 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v to another node"
              pcs resource ban $v ${CLUSTER_NODE} --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" >&2
              fi
            done
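        # The shutdown lock below (pacemaker_mutex_shutdown.sh) serializes
        # the cluster stop/start across nodes, so only one node takes its
        # cluster services offline at a time during the rolling update.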
        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
          when: step|int == 1
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online
        - name: Release the cluster shutdown lock
          when: step|int == 4
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release