From 31ccbbb4c0ef43e723c1da65130b1e1c07f6b61d Mon Sep 17 00:00:00 2001
From: Amit Uniyal
Date: Wed, 10 Apr 2024 04:57:37 +0000
Subject: [PATCH] evacuation tests

- Adds evacuation tests
- Adds exception MigrationException
- Adds function to run virsh commands on host

Depends-On: https://review.opendev.org/c/openstack/tempest/+/919920
Change-Id: Idc0619afedc79945efe001051e86bd80c9136d1e
---
 whitebox_tempest_plugin/api/compute/base.py    |  23 +++
 .../api/compute/test_server_evacuation.py      | 159 ++++++++++++++++++
 .../api/compute/test_vdpa.py                   |  42 ++++-
 .../api/compute/test_vgpu.py                   |  78 +++++++++
 whitebox_tempest_plugin/common/waiters.py      |  20 +++
 whitebox_tempest_plugin/exceptions.py          |   4 +
 6 files changed, 325 insertions(+), 1 deletion(-)
 create mode 100644 whitebox_tempest_plugin/api/compute/test_server_evacuation.py

diff --git a/whitebox_tempest_plugin/api/compute/base.py b/whitebox_tempest_plugin/api/compute/base.py
index 87e93cf4..553956ef 100644
--- a/whitebox_tempest_plugin/api/compute/base.py
+++ b/whitebox_tempest_plugin/api/compute/base.py
@@ -22,6 +22,8 @@ from tempest.common import waiters
 from tempest import config
 from tempest.lib.common.utils import data_utils
 from tempest.lib.common.utils import test_utils
+from time import sleep
+from whitebox_tempest_plugin.common import waiters as wb_waiters

 from whitebox_tempest_plugin.services import clients

@@ -135,6 +137,21 @@ class BaseWhiteboxComputeTest(base.BaseV2ComputeAdminTest):
         xml = virshxml.dumpxml(server_instance_name)
         return ET.fromstring(xml)

+    def shutdown_server_on_host(self, server_id, host):
+        # Shut the server down directly on its host via virsh; this is
+        # needed when the compute service is stopped and the guest can no
+        # longer be stopped through the API
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        domain = server['OS-EXT-SRV-ATTR:instance_name']
+
+        ssh_cl = clients.SSHClient(host)
+        cmd = "virsh shutdown %s" % domain
+        ssh_cl.execute(cmd, sudo=True)
+        msg, wait_counter = domain, 0
+        cmd = "virsh list --name"
+        # Poll the list of running domains until ours disappears, waiting
+        # at most six 10-second intervals
+        while domain in msg and wait_counter < 6:
+            sleep(10)
+            msg = ssh_cl.execute(cmd, sudo=True)
+            wait_counter += 1
+
     def get_server_blockdevice_path(self, server_id, device_name):
         host = self.get_host_for_server(server_id)
         virshxml = clients.VirshXMLClient(host)
@@ -448,3 +465,9 @@ class BaseWhiteboxComputeTest(base.BaseV2ComputeAdminTest):
         root = self.get_server_xml(server_id)
         huge_pages = root.findall('.memoryBacking/hugepages/page')
         return huge_pages
+
+    def evacuate_server(self, server_id, **kwargs):
+        """Evacuate the server and wait for the migration to complete."""
+        self.admin_servers_client.evacuate_server(server_id, **kwargs)
+        wb_waiters.wait_for_server_migration_complete(self.os_admin, server_id)
diff --git a/whitebox_tempest_plugin/api/compute/test_server_evacuation.py b/whitebox_tempest_plugin/api/compute/test_server_evacuation.py
new file mode 100644
index 00000000..17aca0bc
--- /dev/null
+++ b/whitebox_tempest_plugin/api/compute/test_server_evacuation.py
@@ -0,0 +1,159 @@
+# Copyright 2024 Red Hat
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from tempest.common import waiters
+from tempest import config
+
+from whitebox_tempest_plugin.api.compute import base
+from whitebox_tempest_plugin.services import clients
+
+
+CONF = config.CONF
+
+
+class ServerEvacuation(base.BaseWhiteboxComputeTest):
+    '''Test server evacuation.
+    '''
+
+    min_microversion = '2.95'
+
+    @classmethod
+    def skip_checks(cls):
+        super(ServerEvacuation, cls).skip_checks()
+        if CONF.compute.min_compute_nodes < 2:
+            msg = "Need two or more compute nodes to execute evacuate"
+            raise cls.skipException(msg)
+
+    def test_evacuate_to_shutoff(self):
+        server_id = self.create_test_server(wait_until="ACTIVE")['id']
+        host_a = self.get_host_for_server(server_id)
+
+        # set the compute service down on host-A
+        host_a_svc = clients.NovaServiceManager(
+            host_a, 'nova-compute', self.os_admin.services_client)
+
+        with host_a_svc.stopped():
+            # as the compute service is down on the source host,
+            # shut down the server using virsh
+            self.shutdown_server_on_host(server_id, host_a)
+            self.evacuate_server(server_id)
+
+        # after evacuation, the server stays stopped at the destination
+        self.assertNotEqual(self.get_host_for_server(server_id), host_a)
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        self.assertEqual(server['status'], 'SHUTOFF')
+
+    def test_evacuate_with_target_host(self):
+        server_id = self.create_test_server(wait_until="ACTIVE")['id']
+        host_a = self.get_host_for_server(server_id)
+        host_b = self.get_host_other_than(server_id)
+
+        host_a_svc = clients.NovaServiceManager(
+            host_a, 'nova-compute', self.os_admin.services_client)
+
+        with host_a_svc.stopped():
+            self.shutdown_server_on_host(server_id, host_a)
+            # pass the target host explicitly
+            self.evacuate_server(server_id, host=host_b)
+
+        self.assertEqual(self.get_host_for_server(server_id), host_b)
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        self.assertEqual(server['status'], 'SHUTOFF')
+
+    def test_evacuate_attached_vol(self):
+        server = self.create_test_server(wait_until="ACTIVE")
+        server_id = server['id']
+        volume = self.create_volume()
+        vol_id = volume['id']
+
+        # Attach the volume
+        attachment = self.attach_volume(server, volume)
+        waiters.wait_for_volume_resource_status(
+            self.volumes_client, attachment['volumeId'], 'in-use')
+
+        host_a = self.get_host_for_server(server_id)
+
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        # verify the volume id before evacuation
+        self.assertEqual(
+            server['os-extended-volumes:volumes_attached'][0]['id'], vol_id)
+
+        host_a_svc = clients.NovaServiceManager(
+            host_a, 'nova-compute', self.os_admin.services_client)
+
+        with host_a_svc.stopped():
+            self.shutdown_server_on_host(server_id, host_a)
+            self.evacuate_server(server_id)
+
+        self.assertNotEqual(self.get_host_for_server(server_id), host_a)
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        self.assertEqual(server['status'], 'SHUTOFF')
+        # the evacuated server should still have the same volume attached
+        self.assertEqual(
+            server['os-extended-volumes:volumes_attached'][0]['id'], vol_id)
+
+    def test_evacuate_bfv_server(self):
+        server = self.create_test_server(
+            volume_backed=True, wait_until="ACTIVE", name="bfv-server")
+        server_id = server['id']
+        host_a = self.get_host_for_server(server_id)
+
+        host_a_svc = clients.NovaServiceManager(
+            host_a, 'nova-compute', self.os_admin.services_client)
+
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        vol_id = server['os-extended-volumes:volumes_attached'][0]['id']
+
+        with host_a_svc.stopped():
+            self.shutdown_server_on_host(server_id, host_a)
+            self.evacuate_server(server_id)
+
+        self.assertNotEqual(self.get_host_for_server(server_id), host_a)
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        self.assertEqual(server['status'], 'SHUTOFF')
+        self.assertEqual(
+            server['os-extended-volumes:volumes_attached'][0]['id'], vol_id)
+
+
+class ServerEvacuationV294(base.BaseWhiteboxComputeTest):
+    '''Test server evacuation for microversion 2.94.
+    '''
+
+    min_microversion = '2.94'
+
+    @classmethod
+    def skip_checks(cls):
+        super(ServerEvacuationV294, cls).skip_checks()
+        if CONF.compute.min_compute_nodes < 2:
+            msg = "Need two or more compute nodes to execute evacuate"
+            raise cls.skipException(msg)
+
+    def test_evacuate_to_active(self):
+        server_id = self.create_test_server(wait_until="ACTIVE")['id']
+        host_a = self.get_host_for_server(server_id)
+
+        # set the compute service down on host-A
+        host_a_svc = clients.NovaServiceManager(
+            host_a, 'nova-compute', self.os_admin.services_client)
+
+        with host_a_svc.stopped():
+            # as the compute service is down on the source host,
+            # shut down the server using virsh
+            self.shutdown_server_on_host(server_id, host_a)
+            self.evacuate_server(server_id)
+
+        # after evacuation, the server is started automatically at the
+        # destination
+        self.assertNotEqual(self.get_host_for_server(server_id), host_a)
+        server = self.os_admin.servers_client.show_server(server_id)['server']
+        self.assertEqual(server['status'], 'ACTIVE')
diff --git a/whitebox_tempest_plugin/api/compute/test_vdpa.py b/whitebox_tempest_plugin/api/compute/test_vdpa.py
index 700dc6e7..44fd0ac8 100644
--- a/whitebox_tempest_plugin/api/compute/test_vdpa.py
+++ b/whitebox_tempest_plugin/api/compute/test_vdpa.py
@@ -15,8 +15,8 @@

 from tempest.common import waiters
 from tempest import config
-
 from whitebox_tempest_plugin.api.compute import base
+from whitebox_tempest_plugin.services import clients

 from oslo_log import log as logging

@@ -208,3 +208,43 @@ class VDPAResizeInstance(VDPASmokeTests):
             server['id'],
             port['port']['id']
         )
+
+
+class VDPAEvacuateInstance(VDPASmokeTests):
+
+    min_microversion = '2.95'
+
+    @classmethod
+    def skip_checks(cls):
+        super(VDPAEvacuateInstance, cls).skip_checks()
+        if CONF.compute.min_compute_nodes < 2:
+            msg = "Need two or more compute nodes to execute evacuate."
+            raise cls.skipException(msg)
+
+    def test_evacuate_server_vdpa(self):
+        # Create an instance with a vDPA port and evacuate it
+        port = self._create_port_from_vnic_type(
+            net=self.network,
+            vnic_type='vdpa'
+        )
+        server = self.create_test_server(
+            networks=[{'port': port['port']['id']}],
+            wait_until='ACTIVE'
+        )
+        server_id = server['id']
+        host_a = self.get_host_for_server(server_id)
+
+        host_a_svc = clients.NovaServiceManager(
+            host_a, 'nova-compute', self.os_admin.services_client)
+
+        with host_a_svc.stopped():
+            self.shutdown_server_on_host(server_id, host_a)
+            self.evacuate_server(server_id)
+
+        self.assertNotEqual(self.get_host_for_server(server_id), host_a)
+        # Confirm the server's vDPA port binding is still correct after
+        # the evacuation
+        self._verify_neutron_port_binding(
+            server_id,
+            port['port']['id']
+        )
diff --git a/whitebox_tempest_plugin/api/compute/test_vgpu.py b/whitebox_tempest_plugin/api/compute/test_vgpu.py
index 25bcae0f..8835dd4e 100644
--- a/whitebox_tempest_plugin/api/compute/test_vgpu.py
+++ b/whitebox_tempest_plugin/api/compute/test_vgpu.py
@@ -17,6 +17,7 @@ from tempest.common.utils.linux import remote_client
 from tempest.common import waiters
 from tempest import config
 from tempest.lib.common.utils import data_utils
+from tempest.lib import decorators

 from whitebox_tempest_plugin.api.compute import base
 from whitebox_tempest_plugin.services import clients
@@ -609,3 +610,80 @@ class VGPUMultiTypes(VGPUTest):
             expected_mdev_type, found_mdev_type,
             "The found mdev_type %s does not match the expected mdev_type "
             "%s for %s" % (found_mdev_type, expected_mdev_type, trait))
+
+
+@decorators.skip_because(bug='1874664')
+class VGPUServerEvacuation(VGPUTest):
+
+    min_microversion = '2.95'
+
+    @classmethod
+    def skip_checks(cls):
+        super(VGPUServerEvacuation, cls).skip_checks()
+        if CONF.compute.min_compute_nodes < 2:
+            msg = "Need two or more compute nodes to execute evacuate."
+            raise cls.skipException(msg)
+
+    def test_evacuate_server_having_vgpu(self):
+        starting_rp_vgpu_inventory = \
+            self._get_vgpu_inventories_for_deployment()
+        validation_resources = self.get_test_validation_resources(
+            self.os_primary)
+        server = self.create_validateable_instance(
+            self.vgpu_flavor, validation_resources)
+        linux_client = self._create_ssh_client(server, validation_resources)
+
+        # Determine the host the vGPU enabled guest is currently on. Next,
+        # get another potential compute host to serve as the evacuation
+        # target
+        src_host = self.get_host_for_server(server['id'])
+        dest_host = self.get_host_other_than(server['id'])
+
+        # Get the current VGPU usage from the resource providers on the
+        # source and destination compute hosts
+        pre_src_usage = self._get_vgpu_util_for_host(src_host)
+        pre_dest_usage = self._get_vgpu_util_for_host(dest_host)
+
+        src_host_svc = clients.NovaServiceManager(
+            src_host, 'nova-compute', self.os_admin.services_client)
+
+        with src_host_svc.stopped():
+            self.shutdown_server_on_host(server['id'], src_host)
+            self.evacuate_server(server['id'])
+
+        self.assertEqual(self.get_host_for_server(server['id']), dest_host)
+
+        LOG.info('Guest %(server)s was just evacuated to %(dest_host)s, '
+                 'guest will now be validated after the operation',
+                 {'server': server['id'], 'dest_host': dest_host})
+        self._validate_vgpu_instance(
+            server,
+            linux_client=linux_client,
+            expected_device_count=self.vgpu_amount_per_instance)
+
+        # Regather the VGPU resource usage on both compute hosts involved.
+        # Confirm the original source host's VGPU usage no longer reports
+        # usage for the evacuated guest, and that the destination is now
+        # accounting for the resource
+        post_src_usage = self._get_vgpu_util_for_host(src_host)
+        post_dest_usage = self._get_vgpu_util_for_host(dest_host)
+        expected_src_usage = pre_src_usage - self.vgpu_amount_per_instance
+        self.assertEqual(
+            expected_src_usage, post_src_usage,
+            'After evacuation, host %s expected to have %d usage for '
+            'resource class VGPU but instead found %d' %
+            (src_host, expected_src_usage, post_src_usage))
+        expected_dest_usage = pre_dest_usage + self.vgpu_amount_per_instance
+        self.assertEqual(
+            expected_dest_usage, post_dest_usage,
+            'After evacuation, host %s expected to have resource class '
+            'VGPU usage totaling %d but instead found %d' %
+            (dest_host, expected_dest_usage, post_dest_usage))
+
+        # Delete the guest and confirm the inventory reverts to the
+        # original amount
+        self.os_admin.servers_client.delete_server(server['id'])
+        waiters.wait_for_server_termination(
+            self.os_admin.servers_client, server['id'])
+        end_rp_vgpu_inventory = self._get_vgpu_inventories_for_deployment()
+        self._validate_final_vgpu_rp_inventory(
+            starting_rp_vgpu_inventory, end_rp_vgpu_inventory)
diff --git a/whitebox_tempest_plugin/common/waiters.py b/whitebox_tempest_plugin/common/waiters.py
index ac216138..017ea787 100644
--- a/whitebox_tempest_plugin/common/waiters.py
+++ b/whitebox_tempest_plugin/common/waiters.py
@@ -16,6 +16,7 @@ import time

 from tempest.lib import exceptions as lib_exc
+from whitebox_tempest_plugin.exceptions import MigrationException


 def wait_for_nova_service_state(client, host, binary, state):
@@ -33,3 +34,22 @@ def wait_for_nova_service_state(client, host, binary, state):
             'Service %s on host %s failed to reach state %s within '
             'the required time (%s s)', binary, host, state, timeout)
     service = client.list_services(host=host, binary=binary)['services'][0]
+
+
+def wait_for_server_migration_complete(os_admin, server_id):
+    start_time = int(time.time())
+    timeout = os_admin.services_client.build_timeout
+    interval = os_admin.services_client.build_interval + 1
+    while int(time.time()) - start_time <= timeout:
+        # Only consider migration records belonging to the server being
+        # evacuated, rather than the unfiltered deployment-wide list
+        s_migs = os_admin.migrations_client.list_migrations(
+            instance_uuid=server_id)['migrations']
+        if s_migs and s_migs[-1]['status'] in ['done', 'completed']:
+            break
+        elif s_migs and s_migs[-1]['status'] in ['error', 'failed']:
+            raise MigrationException(
+                msg='evacuation failed because the server migration failed')
+        time.sleep(interval)
+    else:
+        # raise a Timeout exception if the migration never completed
+        raise lib_exc.TimeoutException(
+            'Evacuation failed because the server migration did not '
+            'complete within the required time (%s s)' % timeout)
diff --git a/whitebox_tempest_plugin/exceptions.py b/whitebox_tempest_plugin/exceptions.py
index c54d6b2a..1991796d 100644
--- a/whitebox_tempest_plugin/exceptions.py
+++ b/whitebox_tempest_plugin/exceptions.py
@@ -26,3 +26,7 @@ class MissingServiceSectionException(exceptions.TempestException):

 class InvalidCPUSpec(exceptions.TempestException):
     message = "CPU spec is invalid: %(spec)s."
+
+
+class MigrationException(exceptions.TempestException):
+    message = "Migration failed: %(msg)s."
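
Usage note: the pieces above are intended to compose as in the minimal
sketch below. The test class and test name here are hypothetical and shown
only for illustration; shutdown_server_on_host(), evacuate_server(),
NovaServiceManager.stopped(), and the SHUTOFF-after-evacuate behaviour at
microversion 2.95 all come from this patch and the tests it adds.

    from whitebox_tempest_plugin.api.compute import base
    from whitebox_tempest_plugin.services import clients


    class ExampleEvacuation(base.BaseWhiteboxComputeTest):

        min_microversion = '2.95'

        def test_example_evacuation(self):
            server_id = self.create_test_server(wait_until='ACTIVE')['id']
            source = self.get_host_for_server(server_id)

            # Simulate a dead compute host: stop nova-compute there, then
            # stop the guest directly via virsh, since the stopped service
            # can no longer do it through the API
            service = clients.NovaServiceManager(
                source, 'nova-compute', self.os_admin.services_client)
            with service.stopped():
                self.shutdown_server_on_host(server_id, source)
                # evacuate_server() blocks until the server's migration
                # record reaches 'done'/'completed'
                # (wait_for_server_migration_complete)
                self.evacuate_server(server_id)

            # With microversion >= 2.95 the server lands SHUTOFF on a
            # different host
            self.assertNotEqual(self.get_host_for_server(server_id), source)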