heat_template_version: wallaby

description: >
  Pacemaker service configured with Puppet
parameters:
  ServiceData:
    default: {}
    description: Dictionary packing service data
    type: json
  ServiceNetMap:
    default: {}
    description: Mapping of service_name -> network name. Typically set
                 via parameter_defaults in the resource registry. Use
                 parameter_merge_strategies to merge it with the defaults.
    type: json
  RoleName:
    default: ''
    description: Role name on which the service is applied
    type: string
  RoleParameters:
    default: {}
    description: Parameters specific to the role
    type: json
  EndpointMap:
    default: {}
    description: Mapping of service endpoint -> protocol. Typically set
                 via parameter_defaults in the resource registry.
    type: json
  MonitoringSubscriptionPacemaker:
    default: 'overcloud-pacemaker'
    type: string
  EnableFencing:
    default: false
    description: Whether to enable fencing in Pacemaker or not.
    type: boolean
  PacemakerTLSPriorities:
    type: string
    description: Pacemaker TLS Priorities
    default: ''
  PacemakerRemoteAuthkey:
    type: string
    description: The authkey for the pacemaker remote service.
    hidden: true
  PcsdPassword:
    type: string
    description: The password for the 'pcsd' user for pacemaker.
    hidden: true
  CorosyncSettleTries:
    type: number
    description: Number of tries for cluster settling. This has the
                 same default as the pacemaker puppet module. Override
                 to a smaller value when you need to replace a controller node.
    default: 360
  FencingConfig:
    default: {}
    description: |
      Pacemaker fencing configuration. The JSON should have
      the following structure:
        {
          "devices": [
            {
              "agent": "AGENT_NAME",
              "host_mac": "HOST_MAC_ADDRESS",
              "params": {"PARAM_NAME": "PARAM_VALUE"}
            }
          ]
        }
      For instance:
        {
          "devices": [
            {
              "agent": "fence_xvm",
              "host_mac": "52:54:00:aa:bb:cc",
              "params": {
                "multicast_address": "225.0.0.12",
                "port": "baremetal_0",
                "manage_fw": true,
                "manage_key_file": true,
                "key_file": "/etc/fence_xvm.key",
                "key_file_password": "abcdef"
              }
            }
          ]
        }
    type: json
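  # A minimal sketch (illustrative values only) of how FencingConfig might be
  # supplied through an environment file's parameter_defaults:
  #
  #   parameter_defaults:
  #     EnableFencing: true
  #     FencingConfig:
  #       devices:
  #         - agent: fence_xvm
  #           host_mac: "52:54:00:aa:bb:cc"
  #           params:
  #             multicast_address: 225.0.0.12
  #             port: baremetal_0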
  PacemakerLoggingSource:
    type: json
    default:
      tag: system.pacemaker
      file: /var/log/pacemaker/pacemaker.log
      startmsg.regex: "^[a-zA-Z]{3} [0-9]{2} [:0-9]{8}"
  ContainerCli:
    type: string
    default: 'podman'
    description: CLI tool used to manage containers.
    constraints:
      - allowed_values: ['podman']
  EnableInstanceHA:
    default: false
    description: Whether to enable an Instance HA configuration or not.
                 This setup requires the Compute role to have the
                 PacemakerRemote service added to it.
    type: boolean
  PacemakerBundleOperationTimeout:
    type: string
    default: '120s'
    description: The timeout for start, monitor and stop operations
                 run by the container resource agent, expressed in
                 seconds. Pacemaker's default operation timeout is 20s,
                 which can be reached under high disk IO, so it is set
                 to a safer value of 120s here.
    constraints:
      - allowed_pattern: "([1-9][0-9]*s)"
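    # The pattern above accepts a positive number of seconds with a trailing
    # 's', e.g. '30s' or '300s'; bare numbers and values starting with 0 are
    # rejected.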
  PacemakerProperties:
    type: json
    description: |
      Pacemaker properties that are set at cluster creation
      step. The JSON should have the following structure:
        {
          'stonith-timeout': {
            property: 'stonith-timeout'
            value: '120s'
          }
        }
    default: {}
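  # Similarly, PacemakerProperties could be overridden from an environment
  # file (illustrative values only):
  #
  #   parameter_defaults:
  #     PacemakerProperties:
  #       stonith-timeout:
  #         property: stonith-timeout
  #         value: 120s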
conditions:
  pcmk_tls_priorities_set:
    not: {equals: [{get_param: PacemakerTLSPriorities}, '']}
outputs:
  role_data:
    description: Role data for the Pacemaker role.
    value:
      service_name: pacemaker
      monitoring_subscription: {get_param: MonitoringSubscriptionPacemaker}
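      # For reference, the ports opened below commonly map to: 2224/tcp (pcsd),
      # 3121/tcp (pacemaker_remote), 21064/tcp (dlm) and 5405/udp (corosync).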
      firewall_rules:
        '130 pacemaker tcp':
          proto: 'tcp'
          dport:
            - 2224
            - 3121
            - 21064
        '131 pacemaker udp':
          proto: 'udp'
          dport: 5405
      config_settings:
        pacemaker::corosync::cluster_name: 'tripleo_cluster'
        pacemaker::corosync::manage_fw: false
        pacemaker::resource_defaults::defaults:
          resource-stickiness: { value: INFINITY }
        corosync_token_timeout: 10000
        pacemaker::corosync::settle_tries: {get_param: CorosyncSettleTries}
        pacemaker::resource::bundle::deep_compare: true
        pacemaker::resource::ip::deep_compare: true
        pacemaker::resource::ocf::deep_compare: true
        tripleo::fencing::config: {get_param: FencingConfig}
        tripleo::fencing::deep_compare: true
        enable_fencing: {get_param: EnableFencing}
        hacluster_pwd: {get_param: PcsdPassword}
        tripleo::profile::base::pacemaker::cluster_properties: {get_param: PacemakerProperties}
        tripleo::profile::base::pacemaker::remote_authkey: {get_param: PacemakerRemoteAuthkey}
        tripleo::profile::base::pacemaker::pcsd_bind_addr:
          str_replace:
            template:
              "%{lookup('$NETWORK')}"
            params:
              $NETWORK: {get_param: [ServiceNetMap, PacemakerNetwork]}
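        # For example, if ServiceNetMap resolves PacemakerNetwork to
        # 'internal_api' (a typical default), the template above becomes
        # "%{lookup('internal_api')}", so pcsd binds to this node's IP on that
        # network as looked up from hiera at Puppet run time.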
        tripleo::pacemaker::tls_priorities:
          if:
            - pcmk_tls_priorities_set
            - {get_param: PacemakerTLSPriorities}
        tripleo::profile::base::pacemaker::resource_op_defaults:
          bundle:
            name: timeout
            value: {get_param: PacemakerBundleOperationTimeout}
      service_config_settings:
        rsyslog:
          tripleo_logging_sources_pacemaker:
            - {get_param: PacemakerLoggingSource}
      step_config: |
        include tripleo::profile::base::pacemaker
      host_prep_tasks:
        # Need this until https://bugzilla.redhat.com/show_bug.cgi?id=1857247 is fixed
        - name: Make sure python3-novaclient is installed when IHA is enabled
          package:
            name: python3-novaclient
            state: present
          when: {get_param: EnableInstanceHA}
      upgrade_tasks:
        # Since Wallaby, Redis is not deployed by default anymore
        - name: pre-check to ensure redis is removed if the service is disabled
          when:
            - step|int == 0
            - '"redis" not in enabled_services|list'
          tags:
            - never
            - system_upgrade
            - system_upgrade_prepare
          become: true
          # prior to OS upgrade, don't assume the cluster is running,
          # probe the CIB file directly instead
          shell: |
            export CIB_file=/var/lib/pacemaker/cib/cib.xml
            if crm_resource -r redis-bundle -q &>/dev/null; then
              echo "Redis resource present in pacemaker but disabled by default in TripleO" >&2
              echo "Delete the resource in pacemaker or enable the redis service before upgrading" >&2
              exit 1
            fi
        - name: upgrade step 0
          when: step|int == 0
          block:
            # If performing an upgrade which requires upgrading the operating
            # system, a transfer data step needs to be run. During this step,
            # the whole pacemaker cluster is stopped so we can't check the
            # cluster status. Once the transfer_data step is executed, a flag
            # file is stored. This code checks the existence of this file to
            # know whether we should avoid doing a normal pacemaker upgrade.
            # As part of the operating system upgrade, a new cluster will be
            # created to which the other nodes will be added.
            - name: check flag file existence in destination host
              stat:
                path: "/var/lib/tripleo/transfer-flags/var-lib-mysql"
              register: tripleo_transfer_flag_stat
              become: true
              delegate_to: "{{ mysql_short_bootstrap_node_name }}"
            - name: Set fact cluster_recreate
              set_fact:
                cluster_recreate: "{{ tripleo_transfer_flag_stat.stat.exists|bool }}"
            - name: Check pacemaker cluster running before upgrade
              tags: validation
              # NOTE: We are intentionally not using the community version of
              # pacemaker_cluster here due to variances between the two:
              # https://bugs.launchpad.net/tripleo/+bug/1938967
              pacemaker_cluster: state=online check_and_fail=true
              async: 30
              poll: 4
              when: not cluster_recreate|bool
        - name: Create hiera data to upgrade pacemaker in a stepwise manner.
          when:
            - step|int == 1
            - cluster_recreate|bool
          block:
            - name: set pacemaker upgrade node facts in a single-node environment
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names }}"
                cacheable: false
              when: groups['pacemaker'] | length <= 1
            - name: set pacemaker upgrade node facts from the limit option
              set_fact:
                pacemaker_short_node_names_upgraded: "{{ pacemaker_short_node_names_upgraded|default([]) + [item.split('.')[0]] }}"
                cacheable: false
              when:
                - groups['pacemaker'] | length > 1
                - item.split('.')[0] in ansible_limit.split(':')
              loop: "{{ pacemaker_short_node_names | default([]) }}"
            - fail:
                msg: >
                  You can't upgrade pacemaker without a staged
                  upgrade. You need to use the limit option in order
                  to do so.
              when: >-
                pacemaker_short_node_names_upgraded is not defined or
                pacemaker_short_node_names_upgraded | length == 0
            - debug:
                msg: "Prepare pacemaker upgrade for {{ pacemaker_short_node_names_upgraded }}"
            - name: set pacemaker node ips fact from the names fact
              set_fact:
                # Generate matching IPs for the names, e.g. for these variable values:
                #  pacemaker_node_ips: [ "1", "2", "3" ]
                #  pacemaker_short_node_names: [ "a", "b", "c" ]
                #  pacemaker_short_node_names_override: [ "b" ]
                # it will set:
                #  pacemaker_node_ips_override: [ "2" ]
                pacemaker_node_ips_upgraded: "{{
                  dict(pacemaker_short_node_names|zip(pacemaker_node_ips))
                  | dict2items
                  | selectattr('key', 'in', pacemaker_short_node_names_upgraded)
                  | map(attribute='value')
                  | list }}"
                cacheable: false
            - name: add the pacemaker short name to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_short_node_names_override
                tripleo_upgrade_value: "{{pacemaker_short_node_names_upgraded}}"
            - name: add the pacemaker ips to hiera data for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: set.yml
              vars:
                tripleo_upgrade_key: pacemaker_node_ips_override
                tripleo_upgrade_value: "{{pacemaker_node_ips_upgraded}}"
            - name: remove the extra hiera data needed for the upgrade.
              include_role:
                name: tripleo_upgrade_hiera
                tasks_from: remove.yml
              vars:
                tripleo_upgrade_key: "{{item}}"
              loop:
                - pacemaker_short_node_names_override
                - pacemaker_node_ips_override
              when: pacemaker_short_node_names_upgraded | length == pacemaker_short_node_names | length
        - name: upgrade step 2
          when: step|int == 2
          block:
            - name: Stop pacemaker cluster
              pacemaker_cluster: state=offline
              when: not cluster_recreate|bool
        - name: upgrade step 4
          when: step|int == 4
          block:
            - name: Start pacemaker cluster
              pacemaker_cluster: state=online
              when: not cluster_recreate|bool
      external_upgrade_tasks:
        - when:
            - step|int == 1
          tags:
            - never
            - system_upgrade_stop_services
            - system_upgrade_transfer_data
          block:
            - name: Stop cluster
              become: true
              shell: |
                set -eu
                FILE=/usr/sbin/pcs
                if test -f "$FILE"; then
                  /usr/sbin/pcs cluster stop --force
                fi
              delegate_to: "{{ item }}"
              with_items: "{{ groups['pacemaker'] | default([]) }}"
      update_tasks:
        - name: Check pacemaker cluster running before the minor update
          when: step|int == 0 # TODO(marios) disabling validations?
          # NOTE: We are intentionally not using the community version of
          # pacemaker_cluster here due to variances between the two:
          # https://bugs.launchpad.net/tripleo/+bug/1938967
          pacemaker_cluster: state=online check_and_fail=true
          async: 30
          poll: 4
        - name: Move virtual IPs to another node before stopping pacemaker
          when:
            - step|int == 1
            - hostvars[inventory_hostname]["haproxy_node_names"]|default([])|length > 1
          shell: |
            CLUSTER_NODE=$(crm_node -n)
            echo "Retrieving all the VIPs which are hosted on this node"
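            # The XPath below selects the IDs of all started, managed
            # ocf:heartbeat:IPaddr2 resources (i.e. the VIPs) currently hosted
            # on this cluster node.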
            VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
            for v in ${VIPS_TO_MOVE}; do
              echo "Moving VIP $v to another node"
              pcs resource ban $v ${CLUSTER_NODE} --wait=300
            done
            echo "Removing the location constraints that were created to move the VIPs"
            for v in ${VIPS_TO_MOVE}; do
              echo "Removing location ban for VIP $v"
              ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
              if [ -n "$ban_id" ]; then
                pcs constraint remove ${ban_id}
              else
                echo "Could not retrieve and clear location constraint for VIP $v" >&2
              fi
            done
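        # The shutdown lock below (pacemaker_mutex_shutdown.sh, shipped with the
        # container config scripts) is presumably used to serialize cluster
        # shutdowns across controllers, so only one node stops its cluster
        # services at a time during the minor update.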
        - name: Acquire the cluster shutdown lock to stop pacemaker cluster
          when: step|int == 1
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --acquire
        - name: Stop pacemaker cluster
          when: step|int == 1
          pacemaker_cluster: state=offline
        - name: Start pacemaker cluster
          when: step|int == 4
          pacemaker_cluster: state=online
        - name: Release the cluster shutdown lock
          when: step|int == 4
          command: systemd-cat -t ha-shutdown /var/lib/container-config-scripts/pacemaker_mutex_shutdown.sh --release