# tripleo-heat-templates/deployment/pacemaker/pacemaker-baremetal-puppet....

heat_template_version: rocky

description: >
  Pacemaker service configured with Puppet

parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. This
                 mapping overrides those in ServiceNetMapDefaults.
    type: json
  DefaultPasswords:
    default: {}
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  CorosyncIPv6:
    default: false
    description: Enable IPv6 in Corosync
    type: boolean
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 with a smaller value when you need to replace a controller node.
    default: 360
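  # For example, a smaller settle value could be supplied from an environment
  # file when replacing a controller node (hypothetical value):
  #   parameter_defaults:
  #     CorosyncSettleTries: 5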
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/host/pacemaker/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  PacemakerBundleOperationTimeout:
    type: string
    default: ''
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, in seconds.
                 When left at the default '', the timeout comes from
                 pacemaker's default operation timeouts (20s). When
                 left at the default and podman is used, the timeout
                 is forced to 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)?"
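  # For example, a longer bundle operation timeout could be set from an
  # environment file (hypothetical value; it must match the allowed_pattern):
  #   parameter_defaults:
  #     PacemakerBundleOperationTimeout: '300s'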
conditions:
  pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
  pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
  podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
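# The conditions above gate the optional settings merged into config_settings
# below: TLS priorities are only set when PacemakerTLSPriorities is non-empty,
# and a bundle operation timeout default is only injected when an explicit
# timeout is given or podman is in use.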
outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      config_settings:
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: { value: INFINITY }
            corosync_token_timeout: 10000
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
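            # Cluster firewall ports (assumed standard assignments): 2224/tcp
            # for pcsd, 3121/tcp for pacemaker_remote, 21064/tcp for DLM and
            # 5405/udp for corosync.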
            tripleo.pacemaker.firewall_rules:
              '130 pacemaker tcp':
                proto: 'tcp'
                dport:
                  - 2224
                  - 3121
                  - 21064
              '131 pacemaker udp':
                proto: 'udp'
                dport: 5405
            corosync_ipv6: {get_param: CorosyncIPv6}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
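            # The yaql expression below picks the first non-empty value among
            # PcsdPassword and the DefaultPasswords entry, so an explicitly set
            # PcsdPassword always wins.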
            hacluster_pwd:
              yaql:
                expression: $.data.passwords.where($ != '').first()
                data:
                  passwords:
                    - {get_param: PcsdPassword}
                    - {get_param: [DefaultPasswords, pcsd_password]}
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
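            # Bind pcsd to this node's IP on the network mapped to
            # PacemakerNetwork; the hiera interpolation resolves the network
            # name to an address at Puppet run time.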
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template:
                  "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
          -
            if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
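          # When no explicit timeout is given, a 120s operation timeout is only
          # forced for podman; with docker and an empty value nothing is set
          # here and pacemaker's built-in 20s default applies.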
          -
            if:
              - and:
                  - pcmk_bundle_op_timeout_empty
                  - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
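      # Forward pacemaker.log to rsyslog using the source definition from
      # PacemakerLoggingSource.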
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include ::tripleo::profile::base::pacemaker
      upgrade_tasks:
        - name: upgrade step 0
          when: step|int == 0
          block:
            # If performing an upgrade which requires upgrading the operating
            # system, a transfer data step needs to be run. During this step,
            # the whole pacemaker cluster is stopped, so we can't check the
            # cluster status. Once the transfer_data step is executed, a flag
            # file is stored. This code checks the existence of this file to
            # know whether we should skip the normal pacemaker upgrade, because
            # with the operating system upgrade a new cluster will be created,
            # to which the other nodes will be added.
            - name: check flag file existence in destination host
              stat:
                path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
              register: tripleo_transfer_flag_stat
              become: true
              delegate_to: "{{ mysql_short_bootstrap_node_name }}"
            - name: Set fact cluster_recreate
              set_fact:
                cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not cluster_recreate|bool
        - name: Create hiera data to upgrade pacemaker in a stepwise manner.
          when:
            - step|int == 1
            - cluster_recreate|bool
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: no
              when: groups['pacemaker'] | length <= 1
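            # On multi-node clusters the upgrade is staged: only the nodes
            # selected with ansible's --limit option are recorded as upgraded
            # in this pass.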
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: no
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(':')
              loop: "{{ pacemaker_short_node_names }}"
            - fail:
                msg: >
                  You can't upgrade pacemaker without a staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                #   pacemaker_node_ips: [ "1", "2", "3" ]
                #   pacemaker_short_node_names: [ "a", "b", "c" ]
                #   pacemaker_short_node_names_upgraded: [ "b" ]
                # it will set:
                #   pacemaker_node_ips_upgraded: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: no
            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not cluster_recreate|bool
        - name: upgrade step 4
          when: step|int == 4
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not cluster_recreate|bool
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0 # TODO(marios) disabling validations?
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
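        # Before stopping pacemaker on this node, move any VIPs it hosts to
        # another node. 'pcs resource move' works by adding a -INFINITY
        # location ban, which the script removes again once the VIPs have
        # moved.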
        - name: Move virtual IPs to another node before stopping pacemaker
          when:
            - step|int == 1
            - hostvars[inventory_hostname]["haproxy_node_names"]|default([])|length > 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v on another node"
              pcs resource move $v --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" 2>&1
              fi
            done
        - name: Set the cluster bundle timeout
          when: step|int == 1
          vars:
            pacemaker_bundle_operation_timeout: {get_param: PacemakerBundleOperationTimeout}
            container_cli: {get_param: ContainerCli}
          block:
            - name: Change the bundle operation timeout
              command: "pcs resource op defaults timeout={{ pacemaker_bundle_operation_timeout|ternary(pacemaker_bundle_operation_timeout,'120s') }}"
              when:
                - "container_cli == 'podman'"
        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
          when: step|int == 1
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online
        - name: Release the cluster shutdown lock
          when: step|int == 4
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release