From fd508d1cab9574c2f5ea55dbed295dde08f8f8d8 Mon Sep 17 00:00:00 2001 From: Michele Baldessari Date: Fri, 10 Sep 2021 14:34:34 +0200 Subject: [PATCH] Guarantee that ovn-dbs-pcmk update_tasks are run when the cluster is up On particular role compositions, the code joining the update_tasks might order things differently than on a typical 3ctrl control plane and the ovn-dbs tasks at step1 (which require the cluster to be up) will happen after the pacemaker task at step1 which stops the cluster. So we can observe something like the following: 2021-09-10 10:05:13.370339 | 001c2891-506d-f833-ff5a-000000000954 | TASK | Change the bundle operation timeout 2021-09-10 10:05:14.136798 | 001c2891-506d-f833-ff5a-000000000954 | CHANGED | Change the bundle operation timeout | ovn-db-01 2021-09-10 10:05:14.137982 | 001c2891-506d-f833-ff5a-000000000954 | TIMING | Change the bundle operation timeout | ovn-db-01 | 0:00:54.808754 | 0.77s 2021-09-10 10:05:14.146853 | 001c2891-506d-f833-ff5a-000000000956 | TASK | Acquire the cluster shutdown lock to stop pacemaker cluster 2021-09-10 10:05:14.508085 | 001c2891-506d-f833-ff5a-000000000956 | CHANGED | Acquire the cluster shutdown lock to stop pacemaker cluster | ovn-db-01 2021-09-10 10:05:14.509257 | 001c2891-506d-f833-ff5a-000000000956 | TIMING | Acquire the cluster shutdown lock to stop pacemaker cluster | ovn-db-01 | 0:00:55.180032 | 0.36s 2021-09-10 10:05:14.518668 | 001c2891-506d-f833-ff5a-000000000957 | TASK | Stop pacemaker cluster 2021-09-10 10:05:18.559627 | 001c2891-506d-f833-ff5a-000000000957 | CHANGED | Stop pacemaker cluster | ovn-db-01 2021-09-10 10:05:18.560561 | 001c2891-506d-f833-ff5a-000000000957 | TIMING | Stop pacemaker cluster | ovn-db-01 | 0:00:59.231336 | 4.04s 2021-09-10 10:05:18.569161 | 001c2891-506d-f833-ff5a-000000000958 | TASK | Start pacemaker cluster 2021-09-10 10:05:18.627924 | 001c2891-506d-f833-ff5a-000000000958 | SKIPPED | Start pacemaker cluster | ovn-db-01 2021-09-10 10:05:18.628678 | 
001c2891-506d-f833-ff5a-000000000958 | TIMING | Start pacemaker cluster | ovn-db-01 | 0:00:59.299453 | 0.06s 2021-09-10 10:05:18.637292 | 001c2891-506d-f833-ff5a-000000000959 | TASK | Release the cluster shutdown lock 2021-09-10 10:05:18.694945 | 001c2891-506d-f833-ff5a-000000000959 | SKIPPED | Release the cluster shutdown lock | ovn-db-01 2021-09-10 10:05:18.695717 | 001c2891-506d-f833-ff5a-000000000959 | TIMING | Release the cluster shutdown lock | ovn-db-01 | 0:00:59.366493 | 0.06s 2021-09-10 10:05:18.704368 | 001c2891-506d-f833-ff5a-00000000095a | TASK | Clear ovndb cluster pacemaker error 2021-09-10 10:05:19.368816 | 001c2891-506d-f833-ff5a-00000000095a | FATAL | Clear ovndb cluster pacemaker error | ovn-db-01 | error={"changed": true, "cmd": "pcs resource cleanup ovn-dbs-bundle", "delta": "0:00:00.399084", "end": "2021-09-10 10:05:20.044985", "msg": "non-zero return code", "rc": 1, "start": "2021-09-10 10:05:19.645901", "stderr": "Error: Unable to forget failed operations of resource: ovn-dbs-bundle\nError connecting to the CIB manager: Transport endpoint is not connected\nError performing operation: Transport endpoint is not connected", "stderr_lines": ["Error: Unable to forget failed operations of resource: ovn-dbs-bundle", "Error connecting to the CIB manager: Transport endpoint is not connected", "Error performing operation: Transport endpoint is not connected"], "stdout": "", "stdout_lines": []} We cannot call pcs resource cleanup at step1; we must call it at step0 so that we're guaranteed the cluster is up, no matter how heat/ansible decide to order the update_tasks. Note: This is the short-term, less-invasive fix. The mid-to-long-term fix should be to verify whether we can now remove the workarounds that were implemented for OVN bugs. 
Closes-Bug: #1943254 Change-Id: Idd827f72c0033978db7b9a8ea6acec2086cda961 (cherry picked from commit ff3173bdcf304dc726f5b03d813fa2194e2c63cc) (cherry picked from commit 9cab0d721a572d23243c83c8b6eba9e946f376dd) --- deployment/ovn/ovn-dbs-pacemaker-puppet.yaml | 34 ++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/deployment/ovn/ovn-dbs-pacemaker-puppet.yaml b/deployment/ovn/ovn-dbs-pacemaker-puppet.yaml index a38087863b..60e1215ef2 100644 --- a/deployment/ovn/ovn-dbs-pacemaker-puppet.yaml +++ b/deployment/ovn/ovn-dbs-pacemaker-puppet.yaml @@ -317,6 +317,23 @@ outputs: tripleo_ha_wrapper_puppet_debug: {get_param: ConfigDebug} update_tasks: + # When a schema change happens, the newer slaves don't connect + # back to the older master and end up timing out. So we clean + # up the error here until we get a fix for + # https://bugzilla.redhat.com/show_bug.cgi?id=1759974 + - name: Clear ovndb cluster pacemaker error + shell: "pcs resource cleanup ovn-dbs-bundle" + when: + - step|int == 0 + # Then we ban the resource for this node. It has no effect on + # the first two controllers, but when we reach the last one, + # it avoids a cut in the control plane as master get chosen in + # one of the updated Stopped ovn. They are in error, that why + # we need the cleanup just before. + - name: Ban ovndb resource on the current node. + shell: "pcs resource ban ovn-dbs-bundle $(hostname | cut -d. -f1)" + when: + - step|int == 0 - name: Tear-down non-HA ovn-dbs containers when: - step|int == 1 @@ -330,23 +347,6 @@ outputs: - ovn_north_db_server - ovn_south_db_server - ovn_northd - # When a schema change happens, the newer slaves don't connect - # back to the older master and end up timing out. 
So we clean - # up the error here until we get a fix for - # https://bugzilla.redhat.com/show_bug.cgi?id=1759974 - - name: Clear ovndb cluster pacemaker error - shell: "pcs resource cleanup ovn-dbs-bundle" - when: - - step|int == 1 - # Then we ban the resource for this node. It has no effect on - # the first two controllers, but when we reach the last one, - # it avoids a cut in the control plane as master get chosen in - # one of the updated Stopped ovn. They are in error, that why - # we need the cleanup just before. - - name: Ban ovndb resource on the current node. - shell: "pcs resource ban ovn-dbs-bundle $(hostname | cut -d. -f1)" - when: - - step|int == 1 - name: ovn-dbs fetch and retag container image for pacemaker when: - step|int == 3