From e75842e53eb4563734b55055384018b77d22072d Mon Sep 17 00:00:00 2001
From: Damien Ciabrini
Date: Thu, 20 Aug 2020 14:06:13 +0200
Subject: [PATCH] HA: ensure scaling up galera does not cause promotion errors

During scale up, two galera resources are being updated in the
pacemaker cluster. Force a specific ordering in puppet to make sure
the galera resource agent always picks up the up-to-date config when
it starts new replicas.

Closes-Bug: #1892530
Change-Id: Id40ac8c10fd0348ce4fd99ce319dab933312acfa
(cherry picked from commit 16a6ba465d420b23da77bab2f64286037d1ced37)
---
 .../functions/pacemaker_bundle_replicas.rb    | 27 +++++++++++++++++++++++++++
 .../pacemaker/database/mysql_bundle.pp        | 19 +++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 lib/puppet/functions/pacemaker_bundle_replicas.rb

diff --git a/lib/puppet/functions/pacemaker_bundle_replicas.rb b/lib/puppet/functions/pacemaker_bundle_replicas.rb
new file mode 100644
index 000000000..f7d529e58
--- /dev/null
+++ b/lib/puppet/functions/pacemaker_bundle_replicas.rb
@@ -0,0 +1,27 @@
+# Custom function to extract the current number of replicas for a pacemaker
+# resource, as defined in the pacemaker cluster.
+# Input is the name of a pacemaker bundle resource
+# Output is the number of replicas for that resource or 0 if not found
+Puppet::Functions.create_function(:'pacemaker_bundle_replicas') do
+  dispatch :pacemaker_bundle_replicas do
+    param 'String', :bundle
+    return_type 'Integer'
+  end
+
+  def pacemaker_bundle_replicas(bundle)
+    # the bundle name is interpolated into a shell command and into an XPath
+    # string literal below, so only accept characters that are safe in both
+    return 0 unless bundle =~ /\A[\w.:-]+\z/
+
+    # the name of the node holding the replicas attribute varies based on the
+    # container engine used (podman, docker...), so match via attributes instead
+    replicas = `cibadmin -Q | xmllint --xpath "string(//bundle[@id='#{bundle}']/*[boolean(@image) and boolean(@run-command)]/@replicas)" -`.strip
+
+    # post-condition: 0 in case the bundle does not exist or an error occurred.
+    # Only convert output made solely of digits, so that unexpected output
+    # (e.g. a lone newline when nothing matched) cannot raise an exception
+    return 0 unless $?.success? && replicas =~ /\A\d+\z/
+
+    replicas.to_i
+  end
+end
diff --git a/manifests/profile/pacemaker/database/mysql_bundle.pp b/manifests/profile/pacemaker/database/mysql_bundle.pp
index fbd02c4a9..f2e9d955a 100644
--- a/manifests/profile/pacemaker/database/mysql_bundle.pp
+++ b/manifests/profile/pacemaker/database/mysql_bundle.pp
@@ -484,6 +484,7 @@ MYSQL_HOST=localhost\n",
         storage_maps      => merge($storage_maps, $storage_maps_tls),
         container_backend => $container_backend,
         tries             => $pcs_tries,
+        before            => Exec['galera-ready'],
       }
 
       pacemaker::resource::ocf { 'galera':
@@ -499,12 +500,26 @@ MYSQL_HOST=localhost\n",
           expression         => ['galera-role eq true'],
         },
         bundle          => 'galera-bundle',
-        require         => [Class['::mysql::server'],
-                            Pacemaker::Resource::Bundle['galera-bundle']],
+        require         => [Class['::mysql::server']],
         before          => Exec['galera-ready'],
         force           => $force_ocf,
       }
 
+      # Resource relation: we normally want the bundle resource to
+      # be run before the OCF one, as the latter depends on the former
+      # at creation time.
+      # However during scale up, both resources change, and the bundle
+      # one shouldn't be updated prior to the OCF one, otherwise
+      # pacemaker could spawn additional replicas before the necessary
+      # info is updated in the OCF resource, which would confuse the
+      # galera resource agent and cause spurious errors.
+      $replicas=pacemaker_bundle_replicas('galera-bundle')
+      if ($replicas > 0) and ($galera_nodes_count > $replicas) {
+        Pacemaker::Resource::Ocf['galera'] -> Pacemaker::Resource::Bundle['galera-bundle']
+      } else {
+        Pacemaker::Resource::Bundle['galera-bundle'] -> Pacemaker::Resource::Ocf['galera']
+      }
+
       exec { 'galera-ready' :
         command     => '/usr/bin/clustercheck >/dev/null',
         timeout     => 30,