bca35e611b
The upgrade workflow to Stein has a guard task that checks that the --limit option is being used when running the overcloud upgrade run command, as the upgrade needs to be performed node by node due to the operating system upgrade. However, if the --limit option is not passed, the upgrade fails in the task right before the guard, because that task already references the undefined variable. Invert the order of the two tasks so that we fail deliberately in the guard task.

Change-Id: I9ffddcaa52314c615362969757c94ebdf01a3b6d
Closes-Bug: #1861663
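For context, the staged, node-by-node invocation that the guard enforces looks roughly like this (the node name is illustrative):

    openstack overcloud upgrade run --limit overcloud-controller-0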
heat_template_version: rocky

description: >
  Pacemaker service configured with Puppet

parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. This
                 mapping overrides those in ServiceNetMapDefaults.
    type: json
  DefaultPasswords:
    default: {}
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  CorosyncIPv6:
    default: false
    description: Enable IPv6 in Corosync
    type: boolean
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 to a smaller value when in need to replace a controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/host/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
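      # The startmsg.regex above matches the leading timestamp of a
      # pacemaker.log record (e.g. "Jan 28 12:34:56"), letting the log
      # collector group continuation lines with the record they belong to.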
  UpgradeLeappEnabled:
    description: Use Leapp for operating system upgrade
    type: boolean
    default: true
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  PacemakerBundleOperationTimeout:
    type: string
    default: ''
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, in seconds.
                 When set to default '', the timeout comes from
                 pacemaker's default operation timeouts (20s). When
                 set to default and podman is used, force the timeout
                 to 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)?"
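      # The pattern above accepts either an empty string (keep Pacemaker's own
      # operation timeouts) or a whole number of seconds with an 's' suffix,
      # e.g. '90s'.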

conditions:
  pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
  pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
  podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
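
# These conditions drive the map_merge branches in the role_data below: the
# TLS priorities and bundle operation timeout hiera keys are only emitted when
# the corresponding parameter is non-empty, and podman_enabled selects the
# forced 120s operation timeout for podman deployments.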

outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: { value: INFINITY }
            corosync_token_timeout: 10000
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
            corosync_ipv6: {get_param: CorosyncIPv6}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
            hacluster_pwd:
              yaql:
                expression: $.data.passwords.where($ != '').first()
                data:
                  passwords:
                    - {get_param: PcsdPassword}
                    - {get_param: [DefaultPasswords, pcsd_password]}
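            # The yaql expression above returns the first non-empty entry, so
            # an operator-provided PcsdPassword takes precedence over the
            # generated default password.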
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template:
                  "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
          -
            if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
          -
            if:
              - and:
                - pcmk_bundle_op_timeout_empty
                - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
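          # Net effect of the if-block above: an explicit
          # PacemakerBundleOperationTimeout is always honoured; with the
          # default '' the timeout is only forced (to 120s) when podman is the
          # container CLI, while docker keeps Pacemaker's built-in 20s default.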
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include ::tripleo::profile::base::pacemaker
      upgrade_tasks:
        - name: upgrade step 0
          when: step|int == 0
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not upgrade_leapp_enabled|bool
        - name: upgrade step 1
          when:
            - step|int == 1
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: no
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: no
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(',')
              loop: "{{ pacemaker_short_node_names }}"
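            # Guard task (see the commit message above): abort with a clear
            # error when no pacemaker nodes were selected for this run, i.e.
            # when the overcloud upgrade run command was invoked without
            # --limit.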
            - fail:
                msg: >
                  You can't upgrade pacemaker without staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                # pacemaker_node_ips: [ "1", "2", "3" ]
                # pacemaker_short_node_names: [ "a", "b", "c" ]
                # pacemaker_short_node_names_upgraded: [ "b" ]
                # it will set:
                # pacemaker_node_ips_upgraded: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: no

            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not upgrade_leapp_enabled
        - name: upgrade step 4
          when: step|int == 4
          vars:
            upgrade_leapp_enabled: {get_param: UpgradeLeappEnabled}
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not upgrade_leapp_enabled
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
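            # The test -f guard above makes the cluster stop a no-op on hosts
            # where pcs is not installed, so the task can safely iterate over
            # every member of the pacemaker group.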
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0 # TODO(marios) disabling validations?
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
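        # pcs resource move relocates a VIP by placing a -INFINITY location
        # constraint (a ban) on its current node; the script below moves every
        # locally hosted IPaddr2 resource and then removes the bans it just
        # created, so the VIPs are not permanently pinned away from this node.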
        - name: Move virtual IPs to another node before stopping pacemaker
          when: step|int == 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v to another node"
              pcs resource move $v --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" >&2
              fi
            done
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online