From 72b53f1d5b20bf936a939b9a46d0f5aa289413e1 Mon Sep 17 00:00:00 2001 From: Juan Larriba Date: Fri, 14 May 2021 14:31:46 +0200 Subject: [PATCH] Make execution of the backup idempotent Currently, the playbook stops the services and the pacemaker to perform the backup. If the backup fails for whatever reason, the playbook cannot run again and fails due to the services being down. This patch fixes the issue in two directions. First, by using the block-always ansible feature, we ensure that whenever an error happens, the playbook will restart the pacemaker and the services before exiting. The second direction is on the execution itself. By controlling certain actions, we ensure that the backup will be successful no matter what state the services and the pacemaker are in when the playbook is run. This ensures success even in the case an operator stops the ansible execution manually (which will not trigger the 'always' condition). BZ: #1954818 Change-Id: Id2aff61f219b0c4992f6f0045f1aba2c7d129758 (cherry picked from commit 9a865d176987f82ecafe121f21ba8db9641379d2) --- .../tasks/ceph_authentication.yml | 6 +++ .../backup_and_restore/tasks/db_backup.yml | 41 ++++++++++++++++++- .../roles/backup_and_restore/tasks/main.yml | 24 +++++++---- .../tasks/pacemaker_unstandby.yml | 30 ++++++++++++++ .../backup_and_restore/tasks/run_backup.yml | 20 --------- .../tasks/service_manager_pause.yml | 1 + 6 files changed, 92 insertions(+), 30 deletions(-) create mode 100644 tripleo_ansible/roles/backup_and_restore/tasks/pacemaker_unstandby.yml diff --git a/tripleo_ansible/roles/backup_and_restore/tasks/ceph_authentication.yml b/tripleo_ansible/roles/backup_and_restore/tasks/ceph_authentication.yml index 252017208..2fadd55cc 100644 --- a/tripleo_ansible/roles/backup_and_restore/tasks/ceph_authentication.yml +++ b/tripleo_ansible/roles/backup_and_restore/tasks/ceph_authentication.yml @@ -32,6 +32,12 @@ tags: - always +- name: Unpause ceph mon container if paused + command: "{{ 
tripleo_container_cli }} unpause ceph-mon-{{ ansible_facts['hostname'] }}" + failed_when: false + tags: + - bar_create_recover_image + - name: Export ceph authentication shell: | set -o pipefail diff --git a/tripleo_ansible/roles/backup_and_restore/tasks/db_backup.yml b/tripleo_ansible/roles/backup_and_restore/tasks/db_backup.yml index b9d36e662..2963e780d 100644 --- a/tripleo_ansible/roles/backup_and_restore/tasks/db_backup.yml +++ b/tripleo_ansible/roles/backup_and_restore/tasks/db_backup.yml @@ -48,6 +48,31 @@ tags: - bar_create_recover_image +- name: Enable pacemaker if it is stopped + command: pcs cluster start --all + when: + - enabled_galera + - tripleo_backup_and_restore_service_manager|bool + - not tripleo_backup_and_restore_enable_snapshots|bool + run_once: true + tags: + - bar_create_recover_image + +- name: Wait until pacemaker has Galera up&running + shell: | + set -o pipefail + ss -tunlp | grep ":3306 " | sed -e 's/.*\///' + register: mysql_result + retries: 300 + until: mysql_result is search('mysqld') + delay: 5 + when: + - enabled_galera + - tripleo_backup_and_restore_service_manager|bool + - not tripleo_backup_and_restore_enable_snapshots|bool + tags: + - bar_create_recover_image + - name: Get the mysql container id when galera is enabled shell: | set -o pipefail @@ -65,6 +90,17 @@ tags: - bar_create_recover_image +- name: Unpause mysql for backup if it is paused + command: "{{ tripleo_container_cli }} unpause {{ tripleo_backup_and_restore_mysql_container }}" + when: + - mysql_password.stderr is defined + - tripleo_backup_and_restore_mysql_container == "mysql" + - not enabled_galera + - tripleo_backup_and_restore_service_manager|bool + failed_when: false + tags: + - bar_create_recover_image + - name: MySQL Grants backup shell: | set -o pipefail @@ -98,7 +134,8 @@ - tripleo_backup_and_restore_mysql_container == "mysql" - not enabled_galera - tripleo_backup_and_restore_service_manager|bool - - not tripleo_backup_and_restore_enable_snapshots + - not 
tripleo_backup_and_restore_enable_snapshots|bool + failed_when: false tags: - bar_create_recover_image @@ -107,7 +144,7 @@ when: - enabled_galera - tripleo_backup_and_restore_service_manager|bool - - not tripleo_backup_and_restore_enable_snapshots + - not tripleo_backup_and_restore_enable_snapshots|bool run_once: true tags: - bar_create_recover_image diff --git a/tripleo_ansible/roles/backup_and_restore/tasks/main.yml b/tripleo_ansible/roles/backup_and_restore/tasks/main.yml index 534875ca3..c305eea87 100644 --- a/tripleo_ansible/roles/backup_and_restore/tasks/main.yml +++ b/tripleo_ansible/roles/backup_and_restore/tasks/main.yml @@ -42,7 +42,7 @@ import_tasks: service_manager_pause.yml when: - tripleo_backup_and_restore_service_manager|bool - - not tripleo_backup_and_restore_enable_snapshots + - not tripleo_backup_and_restore_enable_snapshots|bool - name: Backup the database import_tasks: db_backup.yml @@ -50,11 +50,19 @@ - name: Backup pacemaker configuration import_tasks: pacemaker_backup.yml -- name: Create recovery images with ReaR - import_tasks: run_backup.yml +- name: Perform backup + block: + - name: Create recovery images with ReaR + import_tasks: run_backup.yml + always: + - name: Service management + import_tasks: service_manager_unpause.yml + when: + - tripleo_backup_and_restore_service_manager|bool + - not tripleo_backup_and_restore_enable_snapshots|bool -- name: Service management - import_tasks: service_manager_unpause.yml - when: - - tripleo_backup_and_restore_service_manager|bool - - not tripleo_backup_and_restore_enable_snapshots + - name: Pacemaker management + import_tasks: pacemaker_unstandby.yml + when: + - pacemaker_enabled + - tripleo_backup_and_restore_enable_snapshots|bool diff --git a/tripleo_ansible/roles/backup_and_restore/tasks/pacemaker_unstandby.yml b/tripleo_ansible/roles/backup_and_restore/tasks/pacemaker_unstandby.yml new file mode 100644 index 000000000..84a5e01de --- /dev/null +++ 
b/tripleo_ansible/roles/backup_and_restore/tasks/pacemaker_unstandby.yml @@ -0,0 +1,30 @@ +--- +# Copyright 2019 Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# Start again pacemaker +- name: Add the node to the pacemaker cluster + command: pcs node unstandby + tags: + - bar_create_recover_image + +- name: Wait until pacemaker has Galera up&running + shell: ss -tunlp | grep ":3306 " | sed -e 's/.*\///' + register: mysql_result + retries: 300 + until: mysql_result is search('mysqld') + delay: 5 + tags: + - bar_create_recover_image diff --git a/tripleo_ansible/roles/backup_and_restore/tasks/run_backup.yml b/tripleo_ansible/roles/backup_and_restore/tasks/run_backup.yml index ee07830cb..2244b682a 100644 --- a/tripleo_ansible/roles/backup_and_restore/tasks/run_backup.yml +++ b/tripleo_ansible/roles/backup_and_restore/tasks/run_backup.yml @@ -54,26 +54,6 @@ tags: - bar_create_recover_image -- name: Add the node to the pacemaker cluster - command: pcs node unstandby - when: - - pacemaker_enabled - - tripleo_backup_and_restore_enable_snapshots|bool - tags: - - bar_create_recover_image - -- name: Wait until pacemaker has Galera up&running - shell: netstat -tunlp | grep ":3306 " | sed -e 's/.*\///' - register: mysql_result - retries: 10 - until: mysql_result is search('mysqld') - delay: 5 - when: - - pacemaker_enabled - - tripleo_backup_and_restore_enable_snapshots|bool - tags: - - bar_create_recover_image - - name: Clean old backups 
shell: | set -o pipefail diff --git a/tripleo_ansible/roles/backup_and_restore/tasks/service_manager_pause.yml b/tripleo_ansible/roles/backup_and_restore/tasks/service_manager_pause.yml index 0f27ad476..9383e6b8c 100644 --- a/tripleo_ansible/roles/backup_and_restore/tasks/service_manager_pause.yml +++ b/tripleo_ansible/roles/backup_and_restore/tasks/service_manager_pause.yml @@ -39,6 +39,7 @@ /usr/bin/{{ tripleo_container_cli }} ps --format '{{ '{{' }}.Names {{ '}}' }} ' | /usr/bin/egrep -v 'galera|mysql|bundle' register: container_services changed_when: container_services.stdout is undefined + failed_when: false tags: - bar_create_recover_image