# tripleo-heat-templates/deployment/pacemaker/pacemaker-baremetal-puppet....

heat_template_version: rocky

description: >
  Pacemaker service configured with Puppet

parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. This
                 mapping overrides those in ServiceNetMapDefaults.
    type: json
  DefaultPasswords:
    default: {}
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  CorosyncIPv6:
    default: false
    description: Enable IPv6 in Corosync
    type: boolean
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 to a smaller value when you need to replace a controller node.
    default: 360
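  # A minimal sketch of how this parameter is typically overridden from an
  # environment file; the value 5 is only an illustration, not a recommendation:
  #
  # parameter_defaults:
  #   CorosyncSettleTries: 5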
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
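  # The fence_xvm example from the description above, rewritten as a
  # parameter_defaults snippet for an environment file. This is a sketch only;
  # the values are the illustrative ones from the description, not real settings:
  #
  # parameter_defaults:
  #   EnableFencing: true
  #   FencingConfig:
  #     devices:
  #       - agent: fence_xvm
  #         host_mac: "52:54:00:aa:bb:cc"
  #         params:
  #           multicast_address: "225.0.0.12"
  #           port: baremetal_0
  #           manage_fw: true
  #           manage_key_file: true
  #           key_file: /etc/fence_xvm.key
  #           key_file_password: abcdef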
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/host/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  UpgradeLeappEnabled:
    description: Use Leapp for operating system upgrade
    type: boolean
    default: true
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  PacemakerBundleOperationTimeout:
    type: string
    default: ''
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, in seconds.
                 When left at the default '', the timeout comes from
                 Pacemaker's default operation timeout (20s), except
                 when podman is used, in which case the timeout is
                 forced to 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)?"
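  # A minimal override sketch; '300s' is only an illustration and must match
  # the allowed_pattern above (a positive integer followed by 's'):
  #
  # parameter_defaults:
  #   PacemakerBundleOperationTimeout: '300s'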
conditions:
  pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
  pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
  podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
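# These conditions gate the optional entries merged into config_settings below:
# tls_priorities is only emitted when PacemakerTLSPriorities is non-empty, and
# the bundle operation timeout is only emitted when a value is given or podman
# is the container CLI.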
outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
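      # For reference (standard cluster ports, noted here as a hint rather than
      # something defined in this file): 2224/tcp is pcsd, 3121/tcp is
      # pacemaker_remote, 21064/tcp is DLM and 5405/udp is corosync.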
      config_settings:
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: { value: INFINITY }
            corosync_token_timeout: 10000
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
            corosync_ipv6: {get_param: CorosyncIPv6}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
            hacluster_pwd:
              yaql:
                expression: $.data.passwords.where($ != '').first()
                data:
                  passwords:
                    - {get_param: PcsdPassword}
                    - {get_param: [DefaultPasswords, pcsd_password]}
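            # The yaql expression above picks the first non-empty value, so an
            # explicitly set PcsdPassword wins over the generated
            # DefaultPasswords entry.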
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template:
                  "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
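            # The str_replace above renders a hiera interpolation such as
            # "%{hiera('internal_api')}" (assuming PacemakerNetwork maps to the
            # internal_api network), so pcsd binds to the node's IP on that
            # network at deploy time.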
          -
            if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
          -
            if:
              - and:
                  - pcmk_bundle_op_timeout_empty
                  - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
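      # Net effect of the optional entry above: an explicit
      # PacemakerBundleOperationTimeout is always applied; with podman and no
      # explicit value the bundle timeout defaults to '120s'; with docker and no
      # explicit value nothing is set and Pacemaker's own 20s default applies.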
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include ::tripleo::profile::base::pacemaker
      upgrade_tasks:
        - name: upgrade step 0
          when: step|int == 0
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not upgrade_leapp_enabled|bool
        - name: upgrade step 1
          when:
            - step|int == 1
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: no
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: no
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(',')
              loop: "{{ pacemaker_short_node_names }}"
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - fail:
                msg: >
                  You can't upgrade pacemaker without a staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
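            # For context: a staged upgrade is normally driven with Ansible's
            # limit, e.g. "openstack overcloud upgrade run --limit controller-0"
            # (the node name is illustrative), which is what populates the
            # ansible_limit variable checked above.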
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                #   pacemaker_node_ips: [ "1", "2", "3" ]
                #   pacemaker_short_node_names: [ "a", "b", "c" ]
                #   pacemaker_short_node_names_upgraded: [ "b" ]
                # it will set:
                #   pacemaker_node_ips_upgraded: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: no
            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{ pacemaker_short_node_names_upgraded }}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{ pacemaker_node_ips_upgraded }}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{ item }}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not upgrade_leapp_enabled
        - name: upgrade step 4
          when: step|int == 4
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not upgrade_leapp_enabled
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0  # TODO(marios) disabling validations?
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
        - name: Move virtual IPs to another node before stopping pacemaker
          when: step|int == 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v to another node"
              pcs resource move $v --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" >&2
              fi
            done
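        # Note: "pcs resource move" works by adding a -INFINITY location
        # constraint (a ban) on the current node, which is why the second loop
        # above removes those constraints; otherwise the VIPs could never fail
        # back to this node after the update.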
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online