Backup&Restore Snapshots RFE

The backup&restore process on the controller needs to be done with
the controller stopped. If that condition is not met, we risk
introducing race conditions that could create inconsistent backups
that cannot be restored. Currently, the B&R process stops all three
controllers to perform a backup, leaving the cloud unmanageable
for the duration of the backup, which can take several minutes.

This change introduces the capability of doing "serial" backups.
By managing Pacemaker, we can take one node out of the controller
Pacemaker cluster, back it up and then add it back to the
cluster. While that backup is running, the other two nodes of
the controller cluster remain available to manage the cloud.

Bugzilla: #1877798
Change-Id: I77c4e6accc205cbc962c663babe1018b51d8d266
This commit is contained in:
Juan Larriba 2021-04-15 11:23:25 +02:00
parent 4cf8112b26
commit ebcfee5f20
5 changed files with 55 additions and 136 deletions

View File

@ -90,22 +90,3 @@
tags:
- bar_create_recover_image
no_log: "{{ tripleo_backup_and_restore_hide_sensitive_logs | bool }}"
# Pause the database container through the container CLI. Runs only when
# the mysql password lookup produced a result, the container is named
# "mysql", galera is not enabled and service management is switched on.
- name: Pause mysql.
  tags:
    - bar_create_recover_image
  when:
    - mysql_password.stderr is defined
    - tripleo_backup_and_restore_mysql_container == "mysql"
    - not enabled_galera
    - tripleo_backup_and_restore_service_manager | bool
  command: "{{ tripleo_container_cli }} pause {{ tripleo_backup_and_restore_mysql_container }}"
# Stop the pacemaker cluster on every node (`--all`). The command is issued
# once (run_once) and only when galera is enabled and service management is
# switched on.
- name: Stop pacemaker
  run_once: true
  tags:
    - bar_create_recover_image
  when:
    - enabled_galera
    - tripleo_backup_and_restore_service_manager | bool
  command: pcs cluster stop --all

View File

@ -38,21 +38,15 @@
# NOTE(review): this hunk has lost its diff markers and indentation, so the
# removed and the added versions of the tasks appear interleaved; that is
# presumably why the database, pacemaker and ReaR imports are listed twice.
# Confirm against the real task file before relying on the order below.
- name: Setup ReaR
import_tasks: setup_rear.yml
# Imports service_manager_pause.yml when service management is enabled.
- name: Service management
import_tasks: service_manager_pause.yml
when:
- tripleo_backup_and_restore_service_manager
- name: Do Backup
block:
- name: Backup the database
import_tasks: db_backup.yml
- name: Backup the database
import_tasks: db_backup.yml
- name: Backup pacemaker configuration
import_tasks: pacemaker_backup.yml
- name: Backup pacemaker configuration
import_tasks: pacemaker_backup.yml
- name: Create recovery images with ReaR
import_tasks: run_backup.yml
# Imports service_manager_unpause.yml when service management is enabled.
- name: Service management
import_tasks: service_manager_unpause.yml
when:
- tripleo_backup_and_restore_service_manager
- name: Create recovery images with ReaR
import_tasks: run_backup.yml
tags:
- bar_create_recover_image

View File

@ -32,6 +32,35 @@
tags:
- always
# Relocate the virtual IPs hosted on this node before it leaves the cluster.
#   1. Ask pacemaker for this node's name (crm_node -n).
#   2. Collect the ids of every managed, started IPaddr2 resource that runs
#      on this node.
#   3. Ban each of those resources from this node, waiting up to 300s for
#      the move to complete.
#   4. Remove the -INFINITY location constraints created by the bans.
# The "could not clear" diagnostic is written to stderr (>&2); the former
# `2>&1` redirect was a no-op that left the message on stdout.
- name: Move virtual IPs to another node before stopping pacemaker
  when: pacemaker_enabled
  shell: |
    CLUSTER_NODE=$(crm_node -n)
    echo "Retrieving all the VIPs which are hosted on this node"
    VIPS_TO_MOVE=$(crm_mon --as-xml | xmllint --xpath '//resource[@resource_agent = "ocf::heartbeat:IPaddr2" and @role = "Started" and @managed = "true" and ./node[@name = "'${CLUSTER_NODE}'"]]/@id' - | sed -e 's/id=//g' -e 's/"//g')
    for v in ${VIPS_TO_MOVE}; do
      echo "Moving VIP $v on another node"
      pcs resource ban $v ${CLUSTER_NODE} --wait=300
    done
    echo "Removing the location constraints that were created to move the VIPs"
    for v in ${VIPS_TO_MOVE}; do
      echo "Removing location ban for VIP $v"
      ban_id=$(cibadmin --query | xmllint --xpath 'string(//rsc_location[@rsc="'${v}'" and @node="'${CLUSTER_NODE}'" and @score="-INFINITY"]/@id)' -)
      if [ -n "$ban_id" ]; then
        pcs constraint remove ${ban_id}
      else
        echo "Could not retrieve and clear location constraint for VIP $v" >&2
      fi
    done
  tags:
    - bar_create_recover_image
# Put the local node into pacemaker standby mode when pacemaker is enabled.
- name: Take this node out of pacemaker
  tags:
    - bar_create_recover_image
  when: pacemaker_enabled
  command: pcs node standby
- name: Create the node backup
become: true
command: rear {{ '-s ' if tripleo_backup_and_restore_rear_simulate else '' }}-d -v mkbackup
@ -45,3 +74,19 @@
var: tripleo_backup_and_restore_rear_output
tags:
- bar_create_recover_image
# Take the local node out of pacemaker standby mode when pacemaker is
# enabled.
- name: Add the node to the pacemaker cluster
  tags:
    - bar_create_recover_image
  when: pacemaker_enabled
  command: pcs node unstandby
# Poll the listening sockets until a program named mysqld owns port 3306.
# The sed expression strips everything up to the last "/" of the netstat
# line, leaving only the program name. Up to 10 attempts, 5 seconds apart.
- name: Wait until pacemaker has Galera up&running
  shell: netstat -tunlp | grep ":3306 " | sed -e 's/.*\///'
  register: mysql_result
  # Read-only probe: it never modifies the host, so never report "changed".
  changed_when: false
  # Match against the command output only, not the whole registered result.
  until: mysql_result.stdout is search('mysqld')
  retries: 10
  delay: 5
  when: pacemaker_enabled
  tags:
    - bar_create_recover_image

View File

@ -1,50 +0,0 @@
---
# Copyright 2019 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Call to podman to list running containers then commit all state to
# disk. Once services state has been flushed dump the database then allow
# the backup to start.
# Look up the `container_cli` hiera key and keep the command result in
# tripleo_backup_and_restore_container_cli.
- name: Get Container cli
  tags:
    - bar_create_recover_image
  register: tripleo_backup_and_restore_container_cli
  changed_when: tripleo_backup_and_restore_container_cli.stdout is undefined
  command: hiera -c /etc/puppet/hiera.yaml container_cli
# Expose the hiera lookup output as tripleo_container_cli, unless hiera
# answered with the literal string 'nil'.
- name: set tripleo_container_cli
  tags:
    - bar_create_recover_image
  when:
    - tripleo_backup_and_restore_container_cli.stdout != 'nil'
  set_fact:
    tripleo_container_cli: "{{ tripleo_backup_and_restore_container_cli.stdout }}"
# List the names of the running containers, dropping every name that
# contains galera, mysql or bundle, and register the result as
# container_services.
- name: Gather Container Service Name
  tags:
    - bar_create_recover_image
  register: container_services
  changed_when: container_services.stdout is undefined
  shell: |
    set -o pipefail
    /usr/bin/{{ tripleo_container_cli }} ps --format '{{ '{{' }}.Names {{ '}}' }} ' | /usr/bin/egrep -v 'galera|mysql|bundle'
# Pause, one by one, every container name found in
# container_services.stdout_lines.
- name: Pause containers for database backup.
  tags:
    - bar_create_recover_image
  when: container_services is defined
  with_items: "{{ container_services.stdout_lines }}"
  command: "{{ tripleo_container_cli }} pause {{ item }}"

View File

@ -1,51 +0,0 @@
---
# Copyright 2019 Red Hat, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Restart the pacemaker cluster, or unpause the database container when
# galera is not enabled, then list the containers that are still paused
# and unpause each of them.
# Start the pacemaker cluster on every node (`--all`); issued once
# (run_once) and only when galera is enabled.
- name: Enable pacemaker
  run_once: true
  tags:
    - bar_create_recover_image
  when: enabled_galera
  command: pcs cluster start --all
# Unpause the database container through the container CLI. Runs only when
# the CLI and the container name are both defined and galera is not
# enabled.
- name: unPause database container
  tags:
    - bar_create_recover_image
  when:
    - tripleo_container_cli is defined
    - not enabled_galera
    - tripleo_backup_and_restore_mysql_container is defined
  command: "{{ tripleo_container_cli }} unpause {{ tripleo_backup_and_restore_mysql_container }}"
# List the names of all containers whose status is "paused" and register
# the result as container_services.
- name: Gather Container Service Name
  shell: |
    set -o pipefail
    /usr/bin/{{ tripleo_container_cli }} ps -a --filter='status=paused' --format '{{ '{{' }}.Names {{ '}}' }} '
  register: container_services
  # Listing containers is read-only, so never report "changed". The former
  # condition (`container_services.stdout is defined`) was always true.
  changed_when: false
  tags:
    - bar_create_recover_image
# Unpause, one by one, every container name found in
# container_services.stdout_lines.
- name: unPause containers
  tags:
    - bar_create_recover_image
  when: tripleo_container_cli is defined
  with_items: "{{ container_services.stdout_lines }}"
  command: "{{ tripleo_container_cli }} unpause {{ item }}"