26e00764f7
The upgrade workflow to Stein has a guard task that
checks that the --limit option is being used when
running the overcloud upgrade run command, as the
upgrade needs to be performed node by node due to
the operating system upgrade. However, if the --limit
option is not passed, the upgrade task fails in the
task right before the guard, as that task already
references the undefined variable. So, it is needed
to invert the order so we fail at will in the guard
task.
Change-Id: I9ffddcaa52314c615362969757c94ebdf01a3b6d
Closes-Bug: #1861663
(cherry picked from commit bca35e611b)
342 lines
13 KiB
YAML
heat_template_version: rocky

description: >
  Pacemaker service configured with Puppet
parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. This
                 mapping overrides those in ServiceNetMapDefaults.
    type: json
  DefaultPasswords:
    default: {}
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  CorosyncIPv6:
    default: false
    description: Enable IPv6 in Corosync
    type: boolean
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 to a smaller value when in need to replace a controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/host/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  UpgradeLeappEnabled:
    description: Use Leapp for operating system upgrade
    type: boolean
    default: true
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  PacemakerBundleOperationTimeout:
    type: string
    default: ''
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, in seconds.
                 When set to default '', the timeout comes from
                 pacemaker's default operation timeouts (20s). When
                 set to default and podman is used, force the timeout
                 to 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)?"
conditions:
  pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
  pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
  podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      config_settings:
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: { value: INFINITY }
            corosync_token_timeout: 10000
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
            tripleo.pacemaker.firewall_rules:
              '130 pacemaker tcp':
                proto: 'tcp'
                dport:
                  - 2224
                  - 3121
                  - 21064
              '131 pacemaker udp':
                proto: 'udp'
                dport: 5405
            corosync_ipv6: {get_param: CorosyncIPv6}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
            hacluster_pwd:
              yaql:
                expression: $.data.passwords.where($ != '').first()
                data:
                  passwords:
                    - {get_param: PcsdPassword}
                    - {get_param: [DefaultPasswords, pcsd_password]}
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template:
                  "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
          -
            if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
          -
            if:
              - and:
                - pcmk_bundle_op_timeout_empty
                - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include ::tripleo::profile::base::pacemaker
      upgrade_tasks:
        - name: upgrade step 0
          when: step|int == 0
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not upgrade_leapp_enabled|bool
        - name: upgrade step 1
          when:
            - step|int == 1
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: no
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: no
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(',')
              loop: "{{ pacemaker_short_node_names }}"
            # Guard: the Stein upgrade must run node by node (OS upgrade),
            # so fail explicitly when --limit did not select any node.
            - fail:
                msg: >
                  You can't upgrade pacemaker without staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                # pacemaker_node_ips: [ "1", "2", "3" ]
                # pacemaker_short_node_names: [ "a", "b", "c" ]
                # pacemaker_short_node_names_override: [ "b" ]
                # it will set:
                # pacemaker_node_ips_override: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                    dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                    | dict2items
                    | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                    | map(attribute='value')
                    | list }}"
                cacheable: no
            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo-upgrade-hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not upgrade_leapp_enabled
        - name: upgrade step 4
          when: step|int == 4
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not upgrade_leapp_enabled
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0  # TODO(marios) disabling validations?
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
        - name: Move virtual IPs to another node before stopping pacemaker
          when: step|int == 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
                echo "Moving VIP $v on another node"
                pcs resource move $v --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
                echo "Removing location ban for VIP $v"
                ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
                if [ -n "$ban_id" ]; then
                    pcs constraint remove ${ban_id}
                else
                    echo "Could not retrieve and clear location constraint for VIP $v" 2>&1
                fi
            done
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online