From c35aed1b30055b4b0583f135eb1aa464b487dc51 Mon Sep 17 00:00:00 2001 From: Arnau Verdaguer Date: Wed, 23 Feb 2022 09:59:13 +0100 Subject: [PATCH] Migration revert plan As a failsafe, the migration code can create a backup of the controllers to use in case the migration fails and leaves the environment in an unusable state. The revert plan has two stages: 1- Backup stage: included in the current ovn-migration.yml. Can be configured using the env variable CREATE_BACKUP (True by default). This stage will run the new ansible role, recovery-backup. It will store the backup in `/ctl_plane_backup` on the host that BACKUP_MIGRATION_IP belongs to (can be changed by modifying the env var). In order to restore the controllers, boot them using the iso created by ReaR (stored in /ctl_plane_backup) and perform `automatic recover` 2- Revert stage: this stage has its own ansible playbook (revert.yml) This playbook will clean the environment of all the OVN resources that could have been created (breaking the data plane connectivity) to leave the environment in a state where an overcloud deploy with the OVS templates can be run. Note: If the user creates new resources after running the backup stage and then performs the recovery of the controllers, those resources will be lost. 
Conflicts: tools/ovn_migration/tripleo_environment/ovn_migration.sh Change-Id: I7093f6a5f282b06fb2267cf2c88c533c1eae685d (cherry picked from commit 7003817b697beae2f5effd96a3f71d2c2787e64f) --- doc/source/ovn/migration.rst | 8 +++ .../tripleo_environment/ovn_migration.sh | 35 ++++++++-- .../playbooks/ovn-migration.yml | 17 +++++ .../tripleo_environment/playbooks/revert.yml | 4 ++ .../roles/recovery-backup/defaults/main.yml | 12 ++++ .../roles/recovery-backup/tasks/main.yml | 68 +++++++++++++++++++ .../playbooks/roles/revert/tasks/main.yml | 29 ++++++++ 7 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 tools/ovn_migration/tripleo_environment/playbooks/revert.yml create mode 100644 tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/defaults/main.yml create mode 100644 tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/tasks/main.yml create mode 100644 tools/ovn_migration/tripleo_environment/playbooks/roles/revert/tasks/main.yml diff --git a/doc/source/ovn/migration.rst b/doc/source/ovn/migration.rst index 8aa526d9981..14ad24e89f7 100644 --- a/doc/source/ovn/migration.rst +++ b/doc/source/ovn/migration.rst @@ -147,6 +147,14 @@ Perform the following steps in the undercloud during migration to ensure a synchronized MTU switch across the networks. Default: 30 + * CREATE_BACKUP - Flag to create a backup of the controllers that can be + used as a revert mechanism. + Default: True + + * BACKUP_MIGRATION_IP - Only used if CREATE_BACKUP is enabled. IP of the + server that will be used as an NFS server to store the backup. + Default: 192.168.24.1 + .. 
warning:: Please note that VALIDATE_MIGRATION requires enough quota (2 diff --git a/tools/ovn_migration/tripleo_environment/ovn_migration.sh b/tools/ovn_migration/tripleo_environment/ovn_migration.sh index bdc426eeebd..0a71b1a7619 100644 --- a/tools/ovn_migration/tripleo_environment/ovn_migration.sh +++ b/tools/ovn_migration/tripleo_environment/ovn_migration.sh @@ -40,6 +40,8 @@ LANG=C : ${SERVER_USER_NAME:=cirros} : ${VALIDATE_MIGRATION:=True} : ${DHCP_RENEWAL_TIME:=30} +: ${CREATE_BACKUP:=True} +: ${BACKUP_MIGRATION_IP:=192.168.24.1} # TODO: Document this new var check_for_necessary_files() { @@ -50,11 +52,14 @@ check_for_necessary_files() { fi # Check if the user has generated overcloud-deploy-ovn.sh file + # and that it is executable (quoted so an empty path fails the test). # If it is not generated. Exit - if [ ! -e $OVERCLOUD_OVN_DEPLOY_SCRIPT ]; then + if [ ! -x "$OVERCLOUD_OVN_DEPLOY_SCRIPT" ]; then echo "overcloud deploy migration script :" \ - "$OVERCLOUD_OVN_DEPLOY_SCRIPT is not present. Please" \ - "make sure you create that file before running this script." + "$OVERCLOUD_OVN_DEPLOY_SCRIPT is not present" \ + "or execution permission is missing. Please" \ + "make sure you create that file with correct" \ + "permissions before running this script." exit 1 fi @@ -95,6 +100,17 @@ check_for_necessary_files() { fi exit 1 fi + # Backup pre-check: only relevant when CREATE_BACKUP is True + if [[ $CREATE_BACKUP = True ]]; then + # Advisory reachability check; ping exits 1 (no reply) or 2 (other error) + ping -c4 "$BACKUP_MIGRATION_IP" + if [[ $? -ne 0 ]]; then + echo -e "It is not possible to reach the backup migration server IP" \ + "($BACKUP_MIGRATION_IP). Make sure this IP is accessible before" \ + "starting the migration." \ + "Change this value by doing: export BACKUP_MIGRATION_IP=x.x.x.x" + fi + fi } get_host_ip() { @@ -295,13 +311,22 @@ reduce_network_mtu () { start_migration() { source $STACKRC_FILE echo "Starting the Migration" + local inventory_file="$OOO_WORKDIR/$STACK_NAME/config-download/$STACK_NAME/tripleo-ansible-inventory.yaml" + if ! 
test -f $inventory_file; then + inventory_file='' + fi ansible-playbook -vv $OPT_WORKDIR/playbooks/ovn-migration.yml \ -i hosts_for_migration -e working_dir=$OPT_WORKDIR \ -e public_network_name=$PUBLIC_NETWORK_NAME \ -e image_name=$IMAGE_NAME \ + -e undercloud_node_user=$UNDERCLOUD_NODE_USER \ -e overcloud_ovn_deploy_script=$OVERCLOUD_OVN_DEPLOY_SCRIPT \ - -e server_user_name=$SERVER_USER_NAME \ - -e overcloudrc=$OVERCLOUDRC_FILE \ + -e server_user_name=$SERVER_USER_NAME \ + -e overcloudrc=$OVERCLOUDRC_FILE \ + -e stackrc=$STACKRC_FILE \ + -e backup_migration_ip=$BACKUP_MIGRATION_IP \ + -e create_backup=$CREATE_BACKUP \ + -e ansible_inventory=$inventory_file \ -e validate_migration=$VALIDATE_MIGRATION $* rc=$? diff --git a/tools/ovn_migration/tripleo_environment/playbooks/ovn-migration.yml b/tools/ovn_migration/tripleo_environment/playbooks/ovn-migration.yml index 9dc90a88e2d..68b7b242139 100644 --- a/tools/ovn_migration/tripleo_environment/playbooks/ovn-migration.yml +++ b/tools/ovn_migration/tripleo_environment/playbooks/ovn-migration.yml @@ -1,11 +1,25 @@ # This is the playbook used by ovn-migration.sh. +# +# Back up the controllers so they can be restored in case the +# migration fails and leaves the testbed in a broken state. +# + +- name: Backup controllers pre-migration + hosts: localhost + roles: + - recovery-backup + tags: + - recovery-backup + + # # Pre migration and validation tasks will make sure that the initial cloud # is functional, and will create resources which will be checked after # migration. # + - name: Pre migration and validation tasks hosts: localhost roles: @@ -48,6 +62,7 @@ - setup become: false + # # Once everything is migrated prepare everything by syncing the neutron DB # into the OVN NB database, and then switching the dataplane to br-int @@ -65,6 +80,7 @@ tags: - migration + # # Verify that the initial resources are still reachable, remove them, # and afterwards create new resources and repeat the connectivity tests. 
@@ -78,6 +94,7 @@ tags: - post-migration + # # Final validation after tripleo update to br-int # diff --git a/tools/ovn_migration/tripleo_environment/playbooks/revert.yml b/tools/ovn_migration/tripleo_environment/playbooks/revert.yml new file mode 100644 index 00000000000..77f4ca6994f --- /dev/null +++ b/tools/ovn_migration/tripleo_environment/playbooks/revert.yml @@ -0,0 +1,4 @@ +- name: Clean computes + hosts: ovn-controllers + roles: + - revert \ No newline at end of file diff --git a/tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/defaults/main.yml b/tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/defaults/main.yml new file mode 100644 index 00000000000..9d9b43ce2b6 --- /dev/null +++ b/tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/defaults/main.yml @@ -0,0 +1,12 @@ +--- + +# Name of the host group where the NFS installation will take place. +# If the NFS server is the undercloud (and there is only one) this will +# not be a problem, but if multiple servers exist in the server_name group +# it is possible that NFS will be installed on every server, even though the +# storage of the backup will only be done on the backup_ip. 
+# +# This can be solved if a new tripleo-inventory is manually created specifying +# a [BackupNode] section, with the NFS server info +revert_preparation_server_name: "Undercloud" +backup_and_recover_temp_folder: /tmp/backup-recover-temp diff --git a/tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/tasks/main.yml b/tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/tasks/main.yml new file mode 100644 index 00000000000..6ed510316ae --- /dev/null +++ b/tools/ovn_migration/tripleo_environment/playbooks/roles/recovery-backup/tasks/main.yml @@ -0,0 +1,68 @@ +--- + +- name: Create controller's backup + block: + - name: Create temp folder related to backup + file: + state: directory + path: "{{ backup_and_recover_temp_folder }}" + + # Used on OSP17: reuse the inventory file passed in as ansible_inventory + - name: Copy tripleo-inventory + copy: + src: "{{ ansible_inventory }}" + dest: "{{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml" + when: + - create_backup|bool + - ansible_inventory is defined + - ansible_inventory != "" + + # Used on OSP16.x: no inventory file was passed in, so generate one + - name: Generate tripleo inventory + shell: | + source {{ stackrc }} && + tripleo-ansible-inventory \ + --ansible_ssh_user {{ undercloud_node_user }} \ + --static-yaml-inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml + when: + - create_backup|bool + - ansible_inventory is not defined or ansible_inventory == "" + + - name: Setup NFS on the backup node using IP {{ backup_migration_ip }} + shell: | + source {{ stackrc }} && + openstack overcloud backup \ + --inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml \ + --setup-nfs \ + --extra-vars '{ + "tripleo_backup_and_restore_server": {{ backup_migration_ip }}, + "nfs_server_group_name": {{ revert_preparation_server_name }} + }' + + - name: Setup REAR on the controllers + shell: | + source {{ stackrc }} && + openstack overcloud backup \ + --inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml \ + 
--setup-rear \ + --extra-vars '{ + "tripleo_backup_and_restore_server": {{ backup_migration_ip }} + }' + + - name: Backup the controllers + shell: | + source {{ stackrc }} && + openstack overcloud backup \ + --inventory {{ backup_and_recover_temp_folder }}/tripleo-inventory.yaml + + # Poll the API after the backup; rc is an integer, so compare with 0 + - name: Ensure that the OSP api is working + shell: > + source {{ overcloudrc }} && openstack flavor list + retries: 20 + register: api_rc + delay: 5 + ignore_errors: yes + until: api_rc.rc == 0 + when: create_backup|bool + diff --git a/tools/ovn_migration/tripleo_environment/playbooks/roles/revert/tasks/main.yml b/tools/ovn_migration/tripleo_environment/playbooks/roles/revert/tasks/main.yml new file mode 100644 index 00000000000..d6b55f14ebf --- /dev/null +++ b/tools/ovn_migration/tripleo_environment/playbooks/roles/revert/tasks/main.yml @@ -0,0 +1,29 @@ +--- +- name: Stop ovn containers + become: yes + shell: | + for agent in $(podman ps -a --format {% raw %}"{{.ID}}"{% endraw %} --filter "name=(ovn_.*|ovnmeta)"); do + echo "Cleaning up agent $agent" + podman rm -f $agent + done + +- name: Clean OVN netns + become: yes + shell: | + for netns in $(ip netns ls | grep ovnmeta | cut -d' ' -f1); do + echo "delete netns $netns" + ip netns del $netns + done + +- name: Delete OVN ports + become: yes + shell: | + for port in $(ovs-vsctl list interface | grep ^name | grep 'ovn-\|patch-provnet\|patch-br-int-to' | cut -d':' -f2); do + echo "Removing port $port" + ovs-vsctl del-port $port + done + +- name: Revert cleanup completed. + debug: + msg: Revert cleanup done, please run overcloud deploy with the OVS configuration. +