heat_template_version: rocky

description: >
  Pacemaker service configured with Puppet

parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. This
                 mapping overrides those in ServiceNetMapDefaults.
    type: json
  DefaultPasswords:
    default: {}
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  CorosyncIPv6:
    default: false
    description: Enable IPv6 in Corosync
    type: boolean
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the same
                 default as the pacemaker puppet module. Override to a
                 smaller value when you need to replace a controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/pacemaker/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['docker', 'podman']
  EnableInstanceHA:
    default: false
    description: Whether to enable an Instance HA configuration or not.
                 This setup requires the Compute role to have the
                 PacemakerRemote service added to it.
    type: boolean
  PacemakerBundleOperationTimeout:
    type: string
    default: ''
    description: The timeout for start, monitor and stop operations run by
                 the container resource agent, in seconds. When set to the
                 default '', the timeout comes from pacemaker's default
                 operation timeouts (20s). When set to the default and
                 podman is used, the timeout is forced to 120s.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)?"

parameter_groups:
- label: deprecated
  description: |
    The following parameters are deprecated and will be removed. They
    should not be relied on for new deployments. If you have concerns
    regarding deprecated parameters, please contact the TripleO development
    team on IRC or the OpenStack mailing list.
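  # Note: whether corosync uses IPv6 is now derived from the IP version of
  # the Pacemaker network (see the is_ipv6 condition and the corosync_ipv6
  # setting below); the deprecated CorosyncIPv6 parameter is no longer
  # consumed by this template.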
  parameters:
  - CorosyncIPv6

conditions:
  pcmk_tls_priorities_empty: {equals: [{get_param: PacemakerTLSPriorities}, '']}
  pcmk_bundle_op_timeout_empty: {equals: [{get_param: PacemakerBundleOperationTimeout}, '']}
  podman_enabled: {equals: [{get_param: ContainerCli}, 'podman']}
  is_ipv6:
    equals:
      - {get_param: [ServiceData, net_ip_version_map, {get_param: [ServiceNetMap, PacemakerNetwork]}]}
      - 6

outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        map_merge:
          - pacemaker::corosync::cluster_name: 'tripleo_cluster'
            pacemaker::corosync::manage_fw: false
            pacemaker::resource_defaults::defaults:
              resource-stickiness: { value: INFINITY }
            corosync_token_timeout: 10000
            pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
            pacemaker::resource::bundle::deep_compare: true
            pacemaker::resource::ip::deep_compare: true
            pacemaker::resource::ocf::deep_compare: true
            corosync_ipv6: {if: [is_ipv6, true, false]}
            tripleo::fencing::config: {get_param: FencingConfig}
            tripleo::fencing::deep_compare: true
            enable_fencing: {get_param: EnableFencing}
            hacluster_pwd:
              yaql:
                expression: $.data.passwords.where($ != '').first()
                data:
                  passwords:
                    - {get_param: PcsdPassword}
                    - {get_param: [DefaultPasswords, pcsd_password]}
            tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
            tripleo::profile::base::pacemaker::pcsd_bind_addr:
              str_replace:
                template: "%{hiera('$NETWORK')}"
                params:
                  $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
          - if:
              - pcmk_tls_priorities_empty
              - {}
              - tripleo::pacemaker::tls_priorities: {get_param: PacemakerTLSPriorities}
          - if:
              - and:
                  - pcmk_bundle_op_timeout_empty
                  - not: podman_enabled
              - {}
              - tripleo::profile::base::pacemaker::resource_op_defaults:
                  bundle:
                    name: timeout
                    value:
                      if:
                        - pcmk_bundle_op_timeout_empty
                        - '120s'
                        - {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include tripleo::profile::base::pacemaker
      host_prep_tasks:
        # Need this until https://bugzilla.redhat.com/show_bug.cgi?id=1857247 is fixed
        - name: Make sure python3-novaclient is installed when IHA is enabled
          package:
            name: python3-novaclient
            state: present
          when: {get_param: EnableInstanceHA}
      upgrade_tasks:
        - name: upgrade step 0
          when: step|int == 0
          block:
            # If performing an upgrade that requires upgrading the operating
            # system, a transfer data step needs to be run. During this step
            # the whole pacemaker cluster is stopped, so we can't check the
            # cluster status. Once the transfer_data step is executed, a flag
            # file is stored. This code checks for the existence of that file
            # to know whether to skip the normal pacemaker upgrade. With the
            # operating system upgrade, a new cluster is created, to which the
            # other nodes are then added.
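            # Illustrative pseudo-flow of the tasks below (not executed as-is):
            #   cluster_recreate = the transfer flag file exists on the mysql
            #                      bootstrap node
            #   if not cluster_recreate: assert the pacemaker cluster is online
            #   else: skip the status check, since the cluster is down and will
            #         be recreated after the operating system upgrade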
            - name: check flag file existence in destination host
              stat:
                path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
              register: tripleo_transfer_flag_stat
              become: true
              delegate_to: "{{ mysql_short_bootstrap_node_name }}"
            - name: Set fact cluster_recreate
              set_fact:
                cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not cluster_recreate|bool
        - name: Create hiera data to upgrade pacemaker in a stepwise manner.
          when:
            - step|int == 1
            - cluster_recreate|bool
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: no
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: no
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(':')
              loop: "{{ pacemaker_short_node_names | default([]) }}"
            - fail:
                msg: >
                  You can't upgrade pacemaker without a staged upgrade.
                  You need to use the limit option in order to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                #   pacemaker_node_ips: [ "1", "2", "3" ]
                #   pacemaker_short_node_names: [ "a", "b", "c" ]
                #   pacemaker_short_node_names_upgraded: [ "b" ]
                # it will set:
                #   pacemaker_node_ips_upgraded: [ "2" ]
                pacemaker_node_ips_upgraded: "{{ dict(pacemaker_short_node_names|zip(pacemaker_node_ips)) | dict2items | selectattr('key', 'in', pacemaker_short_node_names_upgraded) | map(attribute='value') | list }}"
                cacheable: no
            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
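              # The override keys set above are removed only once every
              # pacemaker node appears in the upgraded list (see the "when"
              # condition below), so a partially upgraded cluster keeps the
              # restricted node list in hiera.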
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not cluster_recreate|bool
        - name: upgrade step 4
          when: step|int == 4
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not cluster_recreate|bool
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0  # TODO(marios) disabling validations?
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
        - name: Move virtual IPs to another node before stopping pacemaker
          when:
            - step|int == 1
            - hostvars[inventory_hostname]["haproxy_node_names"]|default([])|length > 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v to another node"
              pcs resource ban $v ${CLUSTER_NODE} --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" >&2
              fi
            done
        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
          when: step|int == 1
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online
        - name: Release the cluster shutdown lock
          when: step|int == 4
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release
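# Illustrative only: the FencingConfig example from the parameter description
# above, expressed as a parameter_defaults snippet for a deployment
# environment file (agent, MAC address and key values are the placeholder
# values from that example, not defaults shipped by this template):
#
#   parameter_defaults:
#     EnableFencing: true
#     FencingConfig:
#       devices:
#         - agent: fence_xvm
#           host_mac: "52:54:00:aa:bb:cc"
#           params:
#             multicast_address: "225.0.0.12"
#             port: "baremetal_0"
#             manage_fw: true
#             manage_key_file: true
#             key_file: "/etc/fence_xvm.key"
#             key_file_password: "abcdef"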