Migration revert plan

As a failsafe, the migration code can create a backup of
the controllers to use in case the migration fails
and leaves the environment in an unusable state.

The revert plan has two stages:
1- Backup stage: included on the current ovn-migration.yml.
   Can be configured using the env variable CREATE_BACKUP
   (True by default). This stage will run the new ansible
   role, recovery-backup.
   It will store the backup on `/ctl_plane_backup` on the host
   that BACKUP_MIGRATION_IP belongs to (can be changed by
   modifying the env var).
   In order to restore the controllers, boot them using the iso created
   by ReaR (stored in /ctl_plane_backup) and perform `automatic recover`
2- Revert stage: this stage has its own ansible playbook (revert.yml)
   This playbook will clean the environment from all the OVN resources
   that could have been created (breaking the data plane connectivity)
   to leave the environment in a state where an overcloud deploy with
   the OVS templates can be run.

Note: If the user creates new resources after running the backup stage
and then performs the recovery of the controllers, those resources will
be lost.

Conflicts:
    tools/ovn_migration/tripleo_environment/ovn_migration.sh

Change-Id: I7093f6a5f282b06fb2267cf2c88c533c1eae685d
(cherry picked from commit 7003817b69)
This commit is contained in:
Arnau Verdaguer 2022-02-23 09:59:13 +01:00
parent ac27695540
commit c35aed1b30
7 changed files with 168 additions and 5 deletions

View File

@ -147,6 +147,14 @@ Perform the following steps in the undercloud
during migration to ensure a synchronized MTU switch across the networks.
Default: 30
* CREATE_BACKUP - Flag to create a backup of the controllers that can be
used as a revert mechanism.
Default: True
* BACKUP_MIGRATION_IP - Only used if CREATE_BACKUP is enabled, IP of the
server that will be used as a NFS server to store the backup.
Default: 192.168.24.1
.. warning::
Please note that VALIDATE_MIGRATION requires enough quota (2

View File

@ -40,6 +40,8 @@ LANG=C
: ${SERVER_USER_NAME:=cirros}
: ${VALIDATE_MIGRATION:=True}
: ${DHCP_RENEWAL_TIME:=30}
# Flag to create a backup of the controllers that can be used as a revert
# mechanism.
: ${CREATE_BACKUP:=True}
# Only used if CREATE_BACKUP is enabled: IP of the server that will be used
# as an NFS server to store the backup.
: ${BACKUP_MIGRATION_IP:=192.168.24.1}
check_for_necessary_files() {
@ -50,11 +52,14 @@ check_for_necessary_files() {
fi
# Check if the user has generated overcloud-deploy-ovn.sh file
# With correct permissions
# If it is not generated. Exit
if [ ! -e $OVERCLOUD_OVN_DEPLOY_SCRIPT ]; then
if [ ! -x $OVERCLOUD_OVN_DEPLOY_SCRIPT ]; then
echo "overcloud deploy migration script :" \
"$OVERCLOUD_OVN_DEPLOY_SCRIPT is not present. Please" \
"make sure you create that file before running this script."
"$OVERCLOUD_OVN_DEPLOY_SCRIPT is not present" \
"or execution permission is missing. Please" \
"make sure you create that file with correct" \
"permissions before running this script."
exit 1
fi
@ -95,6 +100,17 @@ check_for_necessary_files() {
fi
exit 1
fi
# Check if backup is enabled
if [[ $CREATE_BACKUP = True ]]; then
    # Check if backup server is reachable.
    # Test ping's exit status directly instead of comparing it with 1:
    # ping exits with 1 when no reply is received and with 2 on other
    # errors (e.g. an unresolvable or unreachable address), and both cases
    # mean the backup server cannot be used. The expansion is quoted so an
    # empty or malformed value is reported as well.
    # This only prints a warning; the script keeps going afterwards.
    if ! ping -c4 "$BACKUP_MIGRATION_IP"; then
        echo -e "It is not possible to reach the backup migration server IP" \
                "($BACKUP_MIGRATION_IP). Make sure this IP is accessible before" \
                "starting the migration." \
                "Change this value by doing: export BACKUP_MIGRATION_IP=x.x.x.x"
    fi
fi
}
get_host_ip() {
@ -295,13 +311,22 @@ reduce_network_mtu () {
start_migration() {
source $STACKRC_FILE
echo "Starting the Migration"
local inventory_file="$OOO_WORKDIR/$STACK_NAME/config-download/$STACK_NAME/tripleo-ansible-inventory.yaml"
if ! test -f $inventory_file; then
inventory_file=''
fi
ansible-playbook -vv $OPT_WORKDIR/playbooks/ovn-migration.yml \
-i hosts_for_migration -e working_dir=$OPT_WORKDIR \
-e public_network_name=$PUBLIC_NETWORK_NAME \
-e image_name=$IMAGE_NAME \
-e undercloud_node_user=$UNDERCLOUD_NODE_USER \
-e overcloud_ovn_deploy_script=$OVERCLOUD_OVN_DEPLOY_SCRIPT \
-e server_user_name=$SERVER_USER_NAME \
-e overcloudrc=$OVERCLOUDRC_FILE \
-e server_user_name=$SERVER_USER_NAME \
-e overcloudrc=$OVERCLOUDRC_FILE \
-e stackrc=$STACKRC_FILE \
-e backup_migration_ip=$BACKUP_MIGRATION_IP \
-e create_backup=$CREATE_BACKUP \
-e ansible_inventory=$inventory_file \
-e validate_migration=$VALIDATE_MIGRATION $*
rc=$?

View File

@ -1,11 +1,25 @@
# This is the playbook used by ovn-migration.sh.
#
# Back up the controllers so that they can be restored in case the
# migration fails, leaving the testbed in a broken state.
#
- name: Backup controllers pre-migration
hosts: localhost
roles:
- recovery-backup
tags:
- recovery-backup
#
# Pre migration and validation tasks will make sure that the initial cloud
# is functional, and will create resources which will be checked after
# migration.
#
- name: Pre migration and validation tasks
hosts: localhost
roles:
@ -48,6 +62,7 @@
- setup
become: false
#
# Once everything is migrated prepare everything by syncing the neutron DB
# into the OVN NB database, and then switching the dataplane to br-int
@ -65,6 +80,7 @@
tags:
- migration
#
# Verify that the initial resources are still reachable, remove them,
# and afterwards create new resources and repeat the connectivity tests.
@ -78,6 +94,7 @@
tags:
- post-migration
#
# Final validation after tripleo update to br-int
#

View File

@ -0,0 +1,4 @@
# Revert playbook: applies the `revert` role to every host in the
# `ovn-controllers` inventory group.
- name: Clean computes
hosts: ovn-controllers
roles:
- revert

View File

@ -0,0 +1,12 @@
---
# Name of the host group where the NFS installation will take place.
# If the NFS server is the undercloud (and there is only one) this will
# not be a problem, but if multiple servers exist in the server_name group
# it is possible that NFS will be installed on every server, even though the
# backup will only be stored on the backup_ip.
#
# This can be solved if a new tripleo-inventory is manually created specifying
# a [BackupNode] section, with the nfs server info
revert_preparation_server_name: "Undercloud"
backup_and_recover_temp_folder: /tmp/backup-recover-temp

View File

@ -0,0 +1,68 @@
---
- name: Create controller's backup
block:
- name: Create temp folder related to backup
file:
state: directory
path: "{{ backup_and_recover_temp_folder }}"
# Using this task on OSP17
- name: Copy tripleo-inventory
copy:
src: "{{ ansible_inventory }}"
dest: "{{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml"
when:
- create_backup|bool
- ansible_inventory is defined
- ansible_inventory != ""
# Using this task in OSP16.x
- name: Generate tripleo inventory
shell: |
source {{ stackrc }} &&
tripleo-ansible-inventory \
--ansible_ssh_user {{ undercloud_node_user }} \
--static-yaml-inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml
when:
- create_backup|bool
- ansible_inventory is not defined or ansible_inventory == ""
- name: Setup NFS on the backup node using IP {{ backup_migration_ip }}
shell: |
source {{ stackrc }} &&
openstack overcloud backup \
--inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml \
--setup-nfs \
--extra-vars '{
"tripleo_backup_and_restore_server": {{ backup_migration_ip }},
"nfs_server_group_name": {{ revert_preparation_server_name }}
}'
- name: Setup REAR on the controllers
shell: |
source {{ stackrc }} &&
openstack overcloud backup \
--inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml \
--setup-rear \
--extra-vars '{
"tripleo_backup_and_restore_server": {{ backup_migration_ip }}
}'
- name: Backup the controllers
shell: |
source {{ stackrc }} &&
openstack overcloud backup \
--inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml
# Ensure that after the controller backups the API responds.
# The registered `rc` is an integer, so it is compared with the integer 0.
# Comparing it with the string "0" never evaluates to true, which makes the
# task use up every retry (20 x 5s) even when the API answers immediately.
# Failures are still ignored (ignore_errors) so the play continues either way.
- name: Ensure that the OSP api is working
  shell: >
    source {{ overcloudrc }} && openstack flavor list
  retries: 20
  register: api_rc
  delay: 5
  ignore_errors: yes
  until: api_rc.rc == 0
when: create_backup|bool

View File

@ -0,0 +1,29 @@
---
# Force-remove (podman rm -f) every container, running or stopped (ps -a),
# whose name matches `ovn_.*` or `ovnmeta`. The raw/endraw markers keep
# Jinja2 from interpreting podman's own `{{.ID}}` format template.
- name: Stop ovn containers
become: yes
shell: |
for agent in $(podman ps -a --format {% raw %}"{{.ID}}"{% endraw %} --filter "name=(ovn_.*|ovnmeta)"); do
echo "Cleaning up agent $agent"
podman rm -f $agent
done
# Delete every network namespace listed by `ip netns ls` whose line contains
# `ovnmeta`; the first space-separated field is used as the namespace name.
- name: Clean OVN netns
become: yes
shell: |
for netns in $(ip netns ls | grep ovnmeta | cut -d' ' -f1); do
echo "delete netns $netns"
ip netns del $netns
done
# Remove the OVS ports whose interface name contains `ovn-`, `patch-provnet`
# or `patch-br-int-to`.
# NOTE(review): the name is taken from the text after the first ':' of the
# `ovs-vsctl list interface` output; if ovs-vsctl prints the name wrapped in
# double quotes they are passed on to `del-port` as-is -- confirm the value
# is a bare port name.
- name: Delete OVN ports
become: yes
shell: |
for port in $(ovs-vsctl list interface | grep ^name | grep 'ovn-\|patch-provnet\|patch-br-int-to' | cut -d':' -f2); do
echo "Removing port $port"
ovs-vsctl del-port $port
done
- name: Revert cleanup completed.
debug:
msg: Revert cleanup done, please run overcloud deploy with the OVS configuration.