Make sure remotes are fully up before proceeding
We currently rely on 'verify_on_create => true' to make sure that pacemaker remotes are up before proceeding to Step2 (during which a remote node is entitled to run pcs commands). So if the remote is still not fully up, pcs commands can potentially fail on the remote nodes with errors like: Error: /Stage[main]/Tripleo::Profile::Pacemaker::Compute_instanceha /Pacemaker::Property[compute-instanceha-role-node-property] /Pcmk_property[property-overcloud-novacomputeiha-0-compute-instanceha-role]: Could not evaluate: backup_cib: Running: /usr/sbin/pcs cluster cib /var/lib/pacemaker/cib/puppet-cib-backup20180519-20162-ekt31x failed with code: 1 -> The 'verify_on_create => true' option currently has incorrect semantics, as it does not really wait for a single resource to be fully up. Since implementing that properly will take quite a bit of work (given that pcs does not currently support single-resource state polling), for now we avoid using verify_on_create and we simply make sure the resource is started via an exec. Ran 25 successful deployments with this (and the depends-on) patch. Closes-Bug: #1773754 Depends-On: I74994a7e52a7470ead7862dd9083074f807f7675 Change-Id: I9e5d5bb48fc7393df71d8b9eae200ad4ebaa6aa6
This commit is contained in:
parent
de4a61d7d8
commit
5968aeb320
@ -174,13 +174,26 @@ class tripleo::profile::base::pacemaker (
|
||||
if $pacemaker_master and count($remote_short_node_names) > 0 {
|
||||
# Creates a { "node" => "ip_address", ...} hash
|
||||
$remotes_hash = hash(zip($remote_short_node_names, $remote_node_ips))
|
||||
pacemaker::resource::remote { $remote_short_node_names:
|
||||
remote_address => $remotes_hash[$title],
|
||||
reconnect_interval => $remote_reconnect_interval,
|
||||
op_params => "monitor interval=${remote_monitor_interval}",
|
||||
verify_on_create => true,
|
||||
tries => $remote_tries,
|
||||
try_sleep => $remote_try_sleep,
|
||||
$remote_short_node_names.each |String $remote_short_node| {
|
||||
pacemaker::resource::remote { $remote_short_node:
|
||||
remote_address => $remotes_hash[$remote_short_node],
|
||||
reconnect_interval => $remote_reconnect_interval,
|
||||
op_params => "monitor interval=${remote_monitor_interval}",
|
||||
tries => $remote_tries,
|
||||
try_sleep => $remote_try_sleep,
|
||||
before => Exec["exec-wait-for-${remote_short_node}"],
|
||||
notify => Exec["exec-wait-for-${remote_short_node}"],
|
||||
}
|
||||
$check_command = "pcs status | grep -q -e \"${remote_short_node}.*Started\""
|
||||
exec { "exec-wait-for-${remote_short_node}":
|
||||
path => '/usr/sbin:/usr/bin:/sbin:/bin',
|
||||
command => $check_command,
|
||||
unless => $check_command,
|
||||
timeout => 30,
|
||||
tries => 180,
|
||||
try_sleep => 10,
|
||||
tag => 'remote_ready',
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user