From 2c612c5216d161061880a758ad59bb2f08cd76e3 Mon Sep 17 00:00:00 2001 From: Juan Larriba Date: Fri, 14 May 2021 14:31:46 +0200 Subject: [PATCH] Make execution of the backup idempotent Currently, the playbook stops the services and the pacemaker to perform the backup. If the backup fails for whatever reason, the playbook cannot run again and fails due to the services being down. This patch fixes the issue in two directions. First, by using the block-always ansible feature, we ensure that whenever an error happens, the playbook will restart the pacemaker and the services before exiting. The second direction is on the execution itself. By controlling certain actions, we ensure that the backup will be successful no matter what state the services and the pacemaker are in when the playbook is run. This ensures success even in the case an operator stops the ansible execution manually (which will not trigger the 'always' condition). This is not a clean cherry pick due to conflicts during merge, mostly related to the location of the role being backup-and-restore instead of backup_and_restore. 
BZ: #1954818 Change-Id: Id2aff61f219b0c4992f6f0045f1aba2c7d129758 (cherry picked from commit 9a865d176987f82ecafe121f21ba8db9641379d2) --- .../backup/tasks/db_backup.yml | 34 +++++++++++++++++++ .../backup/tasks/service_manager_pause.yml | 1 + .../tasks/ceph_authentication.yml | 6 ++++ .../roles/backup-and-restore/tasks/main.yml | 16 +++++---- 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/tripleo_ansible/roles/backup-and-restore/backup/tasks/db_backup.yml b/tripleo_ansible/roles/backup-and-restore/backup/tasks/db_backup.yml index a20b9d970..3fc5259c1 100644 --- a/tripleo_ansible/roles/backup-and-restore/backup/tasks/db_backup.yml +++ b/tripleo_ansible/roles/backup-and-restore/backup/tasks/db_backup.yml @@ -48,6 +48,29 @@ tags: - bar_create_recover_image +- name: Enable pacemaker if it is stopped + command: pcs cluster start --all + when: + - enabled_galera + - tripleo_backup_and_restore_service_manager|bool + run_once: true + tags: + - bar_create_recover_image + +- name: Wait until pacemaker has Galera up&running + shell: | + set -o pipefail + netstat -tunlp | grep ":3306 " | sed -e 's/.*\///' + register: mysql_result + retries: 300 + until: mysql_result is search('mysqld') + delay: 5 + when: + - enabled_galera + - tripleo_backup_and_restore_service_manager|bool + tags: + - bar_create_recover_image + - name: Get the mysql container id when galera is enabled shell: | set -o pipefail @@ -65,6 +88,17 @@ tags: - bar_create_recover_image +- name: Unpause mysql for backup if it is paused + command: "{{ tripleo_container_cli }} unpause {{ tripleo_backup_and_restore_mysql_container }}" + when: + - mysql_password.stderr is defined + - tripleo_backup_and_restore_mysql_container == "mysql" + - not enabled_galera + - tripleo_backup_and_restore_service_manager|bool + failed_when: false + tags: + - bar_create_recover_image + - name: MySQL Grants backup shell: | set -o pipefail diff --git 
a/tripleo_ansible/roles/backup-and-restore/backup/tasks/service_manager_pause.yml b/tripleo_ansible/roles/backup-and-restore/backup/tasks/service_manager_pause.yml index c39084f90..bb56ccc8c 100644 --- a/tripleo_ansible/roles/backup-and-restore/backup/tasks/service_manager_pause.yml +++ b/tripleo_ansible/roles/backup-and-restore/backup/tasks/service_manager_pause.yml @@ -39,6 +39,7 @@ /usr/bin/{{ tripleo_container_cli }} ps --format '{{ '{{' }}.Names {{ '}}' }} ' | /usr/bin/egrep -v 'galera|mysql|bundle' register: container_services changed_when: container_services.stdout is undefined + failed_when: false tags: - bar_create_recover_image diff --git a/tripleo_ansible/roles/backup-and-restore/tasks/ceph_authentication.yml b/tripleo_ansible/roles/backup-and-restore/tasks/ceph_authentication.yml index 05f11e997..5e082abb1 100644 --- a/tripleo_ansible/roles/backup-and-restore/tasks/ceph_authentication.yml +++ b/tripleo_ansible/roles/backup-and-restore/tasks/ceph_authentication.yml @@ -32,6 +32,12 @@ tags: - always +- name: Unpause ceph mon container if paused + command: "{{ tripleo_container_cli }} unpause ceph-mon-{{ ansible_facts['hostname'] }}" + failed_when: false + tags: + - bar_create_recover_image + - name: Export ceph authentication shell: | set -o pipefail diff --git a/tripleo_ansible/roles/backup-and-restore/tasks/main.yml b/tripleo_ansible/roles/backup-and-restore/tasks/main.yml index 269487489..837be0a86 100644 --- a/tripleo_ansible/roles/backup-and-restore/tasks/main.yml +++ b/tripleo_ansible/roles/backup-and-restore/tasks/main.yml @@ -49,10 +49,12 @@ - name: Backup pacemaker configuration import_tasks: pacemaker_backup.yml -- name: Create recovery images with ReaR - import_tasks: ../backup/tasks/main.yml - -- name: Service management - import_tasks: ../backup/tasks/service_manager_unpause.yml - when: - - tripleo_backup_and_restore_service_manager +- name: Perform backup + block: + - name: Create recovery images with ReaR + import_tasks: 
../backup/tasks/main.yml + always: + - name: Service management + import_tasks: ../backup/tasks/service_manager_unpause.yml + when: + - tripleo_backup_and_restore_service_manager