From 5968aeb3207b4c42cc65eb7a7d3a831c4c28d456 Mon Sep 17 00:00:00 2001 From: Michele Baldessari Date: Sat, 19 May 2018 16:55:35 +0200 Subject: [PATCH] Make sure remotes are fully up before proceeding We currently rely on 'verify_on_create => true' to make sure that pacemaker remotes are up before proceeding to Step2 (during which a remote node is entitled to run pcs commands). So if the remote is still not fully up, pcs commands can potentially fail on the remote nodes with errors like: Error: /Stage[main]/Tripleo::Profile::Pacemaker::Compute_instanceha /Pacemaker::Property[compute-instanceha-role-node-property] /Pcmk_property[property-overcloud-novacomputeiha-0-compute-instanceha-role]: Could not evaluate: backup_cib: Running: /usr/sbin/pcs cluster cib /var/lib/pacemaker/cib/puppet-cib-backup20180519-20162-ekt31x failed with code: 1 -> 'verify_on_create => true' currently has incorrect semantics, as it does not really wait for a single resource to be fully up. Since implementing that properly will take quite a bit of work (given that pcs does not currently support single-resource state polling), for now we avoid using verify_on_create and we simply make sure the resource is started via an exec. Ran 25 successful deployments with this (and the depends-on) patch. 
Closes-Bug: #1773754 Depends-On: I74994a7e52a7470ead7862dd9083074f807f7675 Change-Id: I9e5d5bb48fc7393df71d8b9eae200ad4ebaa6aa6 --- manifests/profile/base/pacemaker.pp | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/manifests/profile/base/pacemaker.pp b/manifests/profile/base/pacemaker.pp index ea48540dd..cc4695af5 100644 --- a/manifests/profile/base/pacemaker.pp +++ b/manifests/profile/base/pacemaker.pp @@ -174,13 +174,26 @@ class tripleo::profile::base::pacemaker ( if $pacemaker_master and count($remote_short_node_names) > 0 { # Creates a { "node" => "ip_address", ...} hash $remotes_hash = hash(zip($remote_short_node_names, $remote_node_ips)) - pacemaker::resource::remote { $remote_short_node_names: - remote_address => $remotes_hash[$title], - reconnect_interval => $remote_reconnect_interval, - op_params => "monitor interval=${remote_monitor_interval}", - verify_on_create => true, - tries => $remote_tries, - try_sleep => $remote_try_sleep, + $remote_short_node_names.each |String $remote_short_node| { + pacemaker::resource::remote { $remote_short_node: + remote_address => $remotes_hash[$remote_short_node], + reconnect_interval => $remote_reconnect_interval, + op_params => "monitor interval=${remote_monitor_interval}", + tries => $remote_tries, + try_sleep => $remote_try_sleep, + before => Exec["exec-wait-for-${remote_short_node}"], + notify => Exec["exec-wait-for-${remote_short_node}"], + } + $check_command = "pcs status | grep -q -e \"${remote_short_node}.*Started\"" + exec { "exec-wait-for-${remote_short_node}": + path => '/usr/sbin:/usr/bin:/sbin:/bin', + command => $check_command, + unless => $check_command, + timeout => 30, + tries => 180, + try_sleep => 10, + tag => 'remote_ready', + } } } }