b7fbb7b835
When an operating system upgrade is performed, a transfer data step must be executed (to allow the db import once the operating system has been upgraded). During this step the pacemaker cluster is stopped, so that a new cluster can be created with the newly OS-upgraded node. Therefore, as the pacemaker cluster is down, we need to skip some of the upgrade_tasks which would be executed during a normal upgrade (check pcs status, stop pcs and start pcs). This patch removes the use of the UpgradeLeappEnabled heat parameter to identify this case and instead uses the existence of the flag file created during transfer_data: a new fact, cluster_recreate, is set if this file exists, and the normal pacemaker upgrade tasks are skipped when it is true. Change-Id: Iba85e99f59258ce6ef4e05ccae737b9eeb6cfc57
364 lines
14 KiB
YAML
364 lines
14 KiB
YAML
# Heat template header: template format version and a one-line summary of
# what this service template provides.
heat_template_version: rocky

description: >
  Pacemaker service configured with Puppet
# Input parameters of the template. Each entry lists its type first, then
# its description, then its default (when one exists).
parameters:
  # --- Service data, network/endpoint maps and role metadata ---------------
  ServiceData:
    type: json
    description: Dictionary packing service data
    default: {}
  ServiceNetMap:
    type: json
    description: >-
      Mapping of service_name -> network name. Typically set
      via parameter_defaults in the resource registry. This
      mapping overrides those in ServiceNetMapDefaults.
    default: {}
  DefaultPasswords:
    type: json
    default: {}
  RoleName:
    type: string
    description: Role name on which the service is applied
    default: ''
  RoleParameters:
    type: json
    description: Parameters specific to the role
    default: {}
  EndpointMap:
    type: json
    description: >-
      Mapping of service endpoint -> protocol. Typically set
      via parameter_defaults in the resource registry.
    default: {}

  # --- Pacemaker / Corosync settings ---------------------------------------
  MonitoringSubscriptionPacemaker:
    type: string
    default: 'overcloud-pacemaker'
  # Listed in the "deprecated" parameter group of this template.
  CorosyncIPv6:
    type: boolean
    description: Enable IPv6 in Corosync
    default: false
  EnableFencing:
    type: boolean
    description: Whether to enable fencing in Pacemaker or not.
    default: false
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  # No default: the two secrets below must be supplied by the caller.
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: >-
      Number of tries for cluster settling. This has the
      same default as the pacemaker puppet module. Override
      to a smaller value when in need to replace a controller node.
    default: 360
  FencingConfig:
    type: json
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    default: {}

  # --- Logging and container runtime ---------------------------------------
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/host/pacemaker.log
      startmsg.regex: '^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}'
  ContainerCli:
    type: string
    description: CLI tool used to manage containers.
    default: 'podman'
    constraints:
      - allowed_values:
          - 'docker'
          - 'podman'
  PacemakerBundleOperationTimeout:
    type: string
    description: >-
      The timeout for start, monitor and stop operations
      run by the container resource agent, in seconds.
      When set to default '', the timeout comes from
      pacemaker's default operation timeouts (20s). When
      set to default and podman is used, force the timeout
      to 120s.
    default: ''
    constraints:
      # Either empty or a positive integer followed by "s", e.g. "120s".
      - allowed_pattern: '([1-9][0-9]*s)?'
# Parameter grouping: marks the parameters that are deprecated.
parameter_groups:
  - label: deprecated
    description: |
      The following parameters are deprecated and will be removed. They should not
      be relied on for new deployments. If you have concerns regarding deprecated
      parameters, please contact the TripleO development team on IRC or the
      OpenStack mailing list.
    parameters:
      - CorosyncIPv6
# Named conditions referenced by the `if` expressions in outputs.
conditions:
  # True when no TLS priorities string was supplied.
  pcmk_tls_priorities_empty:
    equals:
      - {get_param: PacemakerTLSPriorities}
      - ''
  # True when no explicit bundle operation timeout was supplied.
  pcmk_bundle_op_timeout_empty:
    equals:
      - {get_param: PacemakerBundleOperationTimeout}
      - ''
  # True when containers are managed with podman.
  podman_enabled:
    equals:
      - {get_param: ContainerCli}
      - 'podman'
  # True when the network mapped to PacemakerNetwork has IP version 6
  # according to ServiceData.net_ip_version_map.
  is_ipv6:
    equals:
      - {get_param: [ServiceData, net_ip_version_map, {get_param: [ServiceNetMap, PacemakerNetwork]}]}
      - 6
# Template outputs. `role_data` carries everything this service contributes:
# firewall rules, hiera settings, the puppet manifest snippet and the Ansible
# task lists used for major upgrades, external upgrades and minor updates.
outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        # Three maps are merged: the unconditional settings, the TLS
        # priorities (only when set) and the bundle operation timeout (only
        # when a timeout was given or podman is in use).
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: { value: INFINITY }
            corosync_token_timeout: 10000
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
            corosync_ipv6: {if: [is_ipv6, true, false]}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
            # First non-empty value of PcsdPassword and
            # DefaultPasswords.pcsd_password.
            hacluster_pwd:
              yaql:
                expression: $.data.passwords.where($ != '').first()
                data:
                  passwords:
                    - {get_param: PcsdPassword}
                    - {get_param: [DefaultPasswords, pcsd_password]}
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
            # Rendered as a hiera lookup on the network name mapped to
            # PacemakerNetwork.
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template:
                  "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
          - if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
          - if:
              - and:
                  - pcmk_bundle_op_timeout_empty
                  - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      # This branch is only reached with an empty timeout
                      # when podman is enabled: use 120s in that case.
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include tripleo::profile::base::pacemaker
      upgrade_tasks:
        - name: upgrade step 0
          when: step|int == 0
          block:
            # If performing an upgrade which requires operating system upgrading
            # a transfer data step needs to be run. During this step, the whole
            # pacemaker cluster is stopped so we can't check the cluster status.
            # Once the transfer_data step is executed, a flag file is stored.
            # This code checks the existence of this file to know if we should
            # avoid doing a normal pacemaker upgrade or not. As with the
            # operating system upgrade a new cluster will be created in which
            # the other nodes will be added.
            - name: check flag file existence in destination host
              stat:
                path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
              register: tripleo_transfer_flag_stat
              become: true
              delegate_to: "{{ mysql_short_bootstrap_node_name }}"
            - name: Set fact cluster_recreate
              set_fact:
                cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              pacemaker_cluster:
                state: online
                check_and_fail: true
              async: 30
              poll: 4
              when: not cluster_recreate|bool
        - name: Create hiera data to upgrade pacemaker in a stepwise manner.
          when:
            - step|int == 1
            - cluster_recreate|bool
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: false
              when: groups['pacemaker'] | length <= 1
            # Accumulates the short names that appear in the --limit option.
            # ansible_limit is defaulted to '' so that a run without --limit
            # reaches the explicit failure below instead of aborting on an
            # undefined variable; ',' is accepted as a separator as well as ':'.
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: false
              when:
                - groups['pacemaker'] | length > 1
                - "item.split('.')[0] in (ansible_limit | default('') | replace(',', ':')).split(':')"
              loop: "{{ pacemaker_short_node_names | default([]) }}"
            - name: fail when no pacemaker node was selected for the upgrade
              fail:
                msg: >
                  You can't upgrade pacemaker without staged
                  upgrade.  You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - name: show the pacemaker nodes selected for the upgrade
              debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                #   pacemaker_node_ips: [ "1", "2", "3" ]
                #   pacemaker_short_node_names: [ "a", "b", "c" ]
                #   pacemaker_short_node_names_upgraded: [ "b" ]
                # it will set:
                #   pacemaker_node_ips_upgraded: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: false

            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            # Runs only once the selected list is as long as the full node
            # list: both override keys are removed again.
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster:
                state: offline
              when: not cluster_recreate|bool
        - name: upgrade step 4
          when: step|int == 4
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster:
                state: online
              when: not cluster_recreate|bool
      external_upgrade_tasks:
        # Tagged `never`, so this block only runs when one of the
        # system_upgrade_* tags is requested explicitly.
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              # pcs is only invoked when the binary exists on the target.
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                    /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0  # TODO(marios) disabling validations?
          pacemaker_cluster:
            state: online
            check_and_fail: true
          async: 30
          poll: 4
        # Moves every started, managed IPaddr2 resource hosted on this node
        # with `pcs resource move`, then removes the -INFINITY location
        # constraints found for this node. A constraint that cannot be found
        # is reported on stderr.
        - name: Move virtual IPs to another node before stopping pacemaker
          when: step|int == 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
                echo "Moving VIP $v on another node"
                pcs resource move "$v" --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
                echo "Removing location ban for VIP $v"
                ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
                if [ -n "$ban_id" ]; then
                    pcs constraint remove "${ban_id}"
                else
                    echo "Could not retrieve and clear location constraint for VIP $v" >&2
                fi
            done
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster:
            state: offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster:
            state: online