tripleo-heat-templates/deployment/pacemaker/pacemaker-baremetal-puppet.yaml
Jose Luis Franco Arza b7fbb7b835 Check transfer data flag to skip pacemaker normal upgrade.
When the operating system upgrade is performed a
transfer data step must be executed (to allow the
db import once the operating system upgrades). During
this step the pacemaker cluster is stopped, so we can
create a new cluster with the newly OS upgraded node.
Therefore, as the pacemaker cluster is down we need
to skip some of the tasks which would be executed
during a normal upgrade (check pcs status, stop pcs and
start pcs) from the upgrade_tasks.

This patch removes the use of UpgradeLeappEnabled heat
parameter to identify this and uses the existence of
the flag file created during the transfer_data to skip
the normal pacemaker upgrade tasks by setting a new
fact, cluster_recreate, if this file exists.

Change-Id: Iba85e99f59258ce6ef4e05ccae737b9eeb6cfc57
2020-06-05 13:10:44 +02:00

364 lines
14 KiB
YAML

heat_template_version: rocky
description: >
Pacemaker service configured with Puppet
parameters:
ServiceData:
default: {}
description: Dictionary packing service data
type: json
ServiceNetMap:
default: {}
description: Mapping of service_name -> network name. Typically set
via parameter_defaults in the resource registry. This
mapping overrides those in ServiceNetMapDefaults.
type: json
DefaultPasswords:
default: {}
type: json
RoleName:
default: ''
description: Role name on which the service is applied
type: string
RoleParameters:
default: {}
description: Parameters specific to the role
type: json
EndpointMap:
default: {}
description: Mapping of service endpoint -> protocol. Typically set
via parameter_defaults in the resource registry.
type: json
MonitoringSubscriptionPacemaker:
default: 'overcloud-pacemaker'
type: string
CorosyncIPv6:
default: false
description: Enable IPv6 in Corosync
type: boolean
EnableFencing:
default: false
description: Whether to enable fencing in Pacemaker or not.
type: boolean
PacemakerTLSPriorities:
type: string
description: Pacemaker TLS Priorities
default: ''
PacemakerRemoteAuthkey:
type: string
description: The authkey for the pacemaker remote service.
hidden: true
PcsdPassword:
type: string
description: The password for the 'pcsd' user for pacemaker.
hidden: true
CorosyncSettleTries:
type: number
description: Number of tries for cluster settling. This has the
same default as the pacemaker puppet module. Override
to a smaller value when in need to replace a controller node.
default: 360
FencingConfig:
default: {}
description: |
Pacemaker fencing configuration. The JSON should have
the following structure:
{
"devices": [
{
"agent": "AGENT_NAME",
"host_mac": "HOST_MAC_ADDRESS",
"params": {"PARAM_NAME": "PARAM_VALUE"}
}
]
}
For instance:
{
"devices": [
{
"agent": "fence_xvm",
"host_mac": "52:54:00:aa:bb:cc",
"params": {
"multicast_address": "225.0.0.12",
"port": "baremetal_0",
"manage_fw": true,
"manage_key_file": true,
"key_file": "/etc/fence_xvm.key",
"key_file_password": "abcdef"
}
}
]
}
type: json
PacemakerLoggingSource:
type: json
default:
tag: system.pacemaker
file: /var/log/host/pacemaker.log
startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
ContainerCli:
type: string
default: 'podman'
description: CLI tool used to manage containers.
constraints:
- allowed_values: ['docker', 'podman']
PacemakerBundleOperationTimeout:
type: string
default: ''
description: The timeout for start, monitor and stop operations
run by the container resource agent, in seconds.
When set to default '', the timeout comes from
pacemaker's default operation timeouts (20s). When
set to default and podman is used, force the timeout
to 120s.
constraints:
- allowed_pattern: "([1-9][0-9]*s)?"
parameter_groups:
- label: deprecated
description: |
The following parameters are deprecated and will be removed. They should not
be relied on for new deployments. If you have concerns regarding deprecated
parameters, please contact the TripleO development team on IRC or the
OpenStack mailing list.
parameters:
- CorosyncIPv6
conditions:
pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
is_ipv6:
equals:
- {get_param: [ServiceData, net_ip_version_map, {get_param: [ServiceNetMap, PacemakerNetwork]}]}
- 6
outputs:
role_data:
description: Role data for the Pacemaker role.
value:
service_name: pacemaker
monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
firewall_rules:
'130 pacemaker tcp':
proto: 'tcp'
dport:
- 2224
- 3121
- 21064
'131 pacemaker udp':
proto: 'udp'
dport: 5405
config_settings:
map_merge:
- pacemaker::corosync::cluster_name: 'tripleo_cluster'
pacemaker::corosync::manage_fw: false
pacemaker::resource_defaults::defaults:
resource-stickiness: { value: INFINITY }
corosync_token_timeout: 10000
pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
pacemaker::resource::bundle::deep_compare: true
pacemaker::resource::ip::deep_compare: true
pacemaker::resource::ocf::deep_compare: true
corosync_ipv6: {if: [is_ipv6, true, false]}
tripleo::fencing::config: {get_param: FencingConfig}
tripleo::fencing::deep_compare: true
enable_fencing: {get_param: EnableFencing}
hacluster_pwd:
yaql:
expression: $.data.passwords.where($ != '').first()
data:
passwords:
- {get_param: PcsdPassword}
- {get_param: [DefaultPasswords, pcsd_password]}
tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
tripleo::profile::base::pacemaker::pcsd_bind_addr:
str_replace:
template:
"%{hiera('$NETWORK')}"
params:
$NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
-
if:
- pcmk_tls_priorities_empty
- {}
- tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
-
if:
- and:
- pcmk_bundle_op_timeout_empty
- not: podman_enabled
- {}
- tripleo::profile::base::pacemaker::resource_op_defaults:
bundle:
name: timeout
value:
if:
- pcmk_bundle_op_timeout_empty
- '120s'
- {get_param: PacemakerBundleOperationTimeout}
service_config_settings:
rsyslog:
tripleo_logging_sources_pacemaker:
- {get_param: PacemakerLoggingSource}
step_config: |
include tripleo::profile::base::pacemaker
upgrade_tasks:
- name: upgrade step 0
when: step|int == 0
block:
# If performing an upgrade which requires operating system upgrading
# a transfer data step needs to be run. During this step, the whole
# pacemaker cluster is stopped so we can't check the cluster status.
# Once the transfer_data step is executed, a flag file is stored.
# This code checks the existence of this file to know if we should
# avoid doing a normal pacemaker upgrade or not. As with the
# operating system upgrade a new cluster will be created in which
# the other nodes will be added.
- name: check flag file existence in destination host
stat:
path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
register: tripleo_transfer_flag_stat
become: true
delegate_to: "{{ mysql_short_bootstrap_node_name }}"
- name: Set fact cluster_recreate
set_fact:
cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
- name: Check pacemaker cluster running before upgrade
tags: validation
pacemaker_cluster: state=online check_and_fail=true
async: 30
poll: 4
when: not cluster_recreate|bool
- name: Create hiera data to upgrade pacemaker in a stepwise manner.
when:
- step|int == 1
- cluster_recreate|bool
block:
- name: set pacemaker upgrade node facts in a single-node environment
set_fact:
pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
cacheable: no
when: groups['pacemaker'] | length <= 1
- name: set pacemaker upgrade node facts from the limit option
set_fact:
pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
cacheable: no
when:
- groups['pacemaker'] | length > 1
- item.split('.')[0] in ansible_limit.split(':')
loop: "{{ pacemaker_short_node_names | default([]) }}"
- fail:
msg: >
You can't upgrade pacemaker without staged
upgrade. You need to use the limit option in order
to do so.
when: >-
pacemaker_short_node_names_upgraded is not defined or
pacemaker_short_node_names_upgraded | length == 0
- debug:
msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
- name: set pacemaker node ips fact from the names fact
set_fact:
# Generate matching IPs for the names, e.g. for these varaible values:
# pacemaker_node_ips: [ "1", "2", "3" ]
# pacemaker_short_node_names: [ "a", "b", "c" ]
# pacemaker_short_node_names_override: [ "b" ]
# it will set:
# pacemaker_node_ips_override: [ "2" ]
pacemaker_node_ips_upgraded: "{{
dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
| dict2items
| selectattr('key', 'in', pacemaker_short_node_names_upgraded)
| map(attribute='value')
| list }}"
cacheable: no
- name: add the pacemaker short name to hiera data for the upgrade.
include_role:
name: tripleo_upgrade_hiera
tasks_from: set.yml
vars:
tripleo_upgrade_key: pacemaker_short_node_names_override
tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
- name: add the pacemaker ips to hiera data for the upgrade.
include_role:
name: tripleo_upgrade_hiera
tasks_from: set.yml
vars:
tripleo_upgrade_key: pacemaker_node_ips_override
tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
- name: remove the extra hiera data needed for the upgrade.
include_role:
name: tripleo_upgrade_hiera
tasks_from: remove.yml
vars:
tripleo_upgrade_key: "{{item}}"
loop:
- pacemaker_short_node_names_override
- pacemaker_node_ips_override
when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
- name: upgrade step 2
when: step|int == 2
block:
- name: Stop pacemaker cluster
pacemaker_cluster: state=offline
when: not cluster_recreate|bool
- name: upgrade step 4
when: step|int == 4
block:
- name: Start pacemaker cluster
pacemaker_cluster: state=online
when: not cluster_recreate|bool
external_upgrade_tasks:
- when:
- step|int == 1
tags:
- never
- system_upgrade_stop_services
- system_upgrade_transfer_data
block:
- name: Stop cluster
become: true
shell: |
set -eu
FILE=/usr/sbin/pcs
if test -f "$FILE"; then
/usr/sbin/pcs cluster stop --force
fi
delegate_to: "{{ item }}"
with_items: "{{ groups['pacemaker'] | default([]) }}"
update_tasks:
- name: Check pacemaker cluster running before the minor update
when: step|int == 0 # TODO(marios) disabling validations?
pacemaker_cluster: state=online check_and_fail=true
async: 30
poll: 4
- name: Move virtual IPs to another node before stopping pacemaker
when: step|int == 1
shell: |
CLUSTER_NODE=$(crm_node -n)
echo "Retrieving all the VIPs which are hosted on this node"
VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
for v in ${VIPS_TO_MOVE}; do
echo "Moving VIP $v on another node"
pcs resource move $v --wait=300
done
echo "Removing the location constraints that were created to move the VIPs"
for v in ${VIPS_TO_MOVE}; do
echo "Removing location ban for VIP $v"
ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
if [ -n "$ban_id" ]; then
pcs constraint remove ${ban_id}
else
echo "Could not retrieve and clear location constraint for VIP $v" 2>&1
fi
done
- name: Stop pacemaker cluster
when: step|int == 1
pacemaker_cluster: state=offline
- name: Start pacemaker cluster
when: step|int == 4
pacemaker_cluster: state=online