Wait for libvirt domain shutdown correctly

Previously, when shutting down a libvirt domain before evacuation, we
would wait until it disappeared from the list of libvirt domains, but
we just merrily continued on even if it was still present after 60
seconds. I suspect the domain remained present, and our next steps
were to evacuate the instance and start nova-compute back up again, at
which point it tried to delete evacuated instances, and failed with the
following:

File "/opt/stack/nova/nova/compute/manager.py", line 839, in _destroy_evacuated_instances
<snip>
libvirt.libvirtError: Failed to terminate process 55183 with SIGKILL: Device or resource busy

This patch makes us wait for the domain to disappear, and raise an
error if it doesn't disappear in time. It also refactors the shutdown
code slightly to avoid an extra API query to get the server.

Change-Id: I76885987842440f9b690c198e6b8149bdd85f899
This commit is contained in:
Artom Lifshitz 2024-10-25 14:28:58 -04:00 committed by jamepark4
parent bf5f2e573f
commit bee34dbb86
4 changed files with 50 additions and 38 deletions

View File

@ -14,6 +14,7 @@
# under the License.
import six
import time
import xml.etree.ElementTree as ET
from oslo_log import log as logging
@ -22,9 +23,9 @@ from tempest.common import waiters
from tempest import config
from tempest.lib.common.utils import data_utils
from tempest.lib.common.utils import test_utils
from time import sleep
from whitebox_tempest_plugin.common import waiters as wb_waiters
from tempest.lib import exceptions as lib_exc
from whitebox_tempest_plugin.common import waiters as wb_waiters
from whitebox_tempest_plugin.services import clients
if six.PY2:
@ -132,20 +133,28 @@ class BaseWhiteboxComputeTest(base.BaseV2ComputeAdminTest):
xml = virshxml.dumpxml(server_instance_name)
return ET.fromstring(xml)
def shutdown_server_on_host(self, server_id, host):
# This runs virsh shutdown commands on host
server = self.os_admin.servers_client.show_server(server_id)['server']
domain = server['OS-EXT-SRV-ATTR:instance_name']
def shutdown_server_domain(self, server, host):
    """Shut down a server's libvirt domain directly on its host.

    Looks up the server's libvirt instance name through the admin
    servers client, runs ``virsh shutdown`` for it over an SSH client
    opened against ``host``, then hands off to
    ``_wait_for_domain_shutdown`` with that same SSH client.

    :param server: dict describing the server; only ``server['id']``
        is read.
    :param host: host passed to ``clients.SSHClient``.
    """
    shown = self.admin_servers_client.show_server(server['id'])
    instance_name = shown['server']['OS-EXT-SRV-ATTR:instance_name']
    host_ssh = clients.SSHClient(host)
    shutdown_cmd = 'virsh shutdown %s' % instance_name
    host_ssh.execute(shutdown_cmd, sudo=True)
    self._wait_for_domain_shutdown(host_ssh, instance_name)
ssh_cl = clients.SSHClient(host)
cmd = "virsh shutdown %s " % domain
ssh_cl.execute(cmd, sudo=True)
msg, wait_counter = domain, 0
cmd = "virsh list --name"
while domain in msg and wait_counter < 6:
sleep(10)
msg = ssh_cl.execute(cmd, sudo=True)
wait_counter += 1
def _wait_for_domain_shutdown(self, ssh_client, domain_name):
    """Poll the host until the domain is gone from ``virsh list --name``.

    Runs ``virsh list --name`` through ``ssh_client`` repeatedly,
    pausing ``CONF.compute.build_interval`` seconds between polls, and
    returns as soon as ``domain_name`` no longer appears in the output.

    :param ssh_client: client whose ``execute(cmd, sudo=True)`` runs a
        command on the host and returns its output.
    :param domain_name: libvirt domain name to look for in the output.
    :raises lib_exc.TimeoutException: if the domain is still listed
        once ``CONF.compute.build_timeout`` seconds have elapsed.
    """
    timeout = CONF.compute.build_timeout
    interval = CONF.compute.build_interval
    start_time = int(time.time())
    while int(time.time()) - start_time <= timeout:
        domains = ssh_client.execute('virsh list --name', sudo=True)
        if domain_name not in domains:
            return
        # Pause before polling again. The sleep used to sit after an
        # if/else whose branches were `continue` and `break`, so it
        # never ran and the loop hammered the host over SSH without
        # any delay.
        time.sleep(interval)
    raise lib_exc.TimeoutException(
        'Failed to shutdown domain within the required time.')
def get_server_blockdevice_path(self, server_id, device_name):
host = self.get_host_for_server(server_id)

View File

@ -36,8 +36,8 @@ class ServerEvacuation(base.BaseWhiteboxComputeTest):
raise cls.skipException(msg)
def test_evacuate_to_shutoff(self):
server_id = self.create_test_server(wait_until="ACTIVE")['id']
host_a = self.get_host_for_server(server_id)
server = self.create_test_server(wait_until="ACTIVE")
host_a = self.get_host_for_server(server['id'])
# set compute service down in host-A
host_a_svc = clients.NovaServiceManager(
@ -46,29 +46,31 @@ class ServerEvacuation(base.BaseWhiteboxComputeTest):
with host_a_svc.stopped():
# as compute service is down at src host,
# shutdown server using virsh
self.shutdown_server_on_host(server_id, host_a)
self.evacuate_server(server_id)
self.shutdown_server_domain(server, host_a)
self.evacuate_server(server['id'])
# after evacuation server stays stopped at destination
self.assertNotEqual(self.get_host_for_server(server_id), host_a)
server = self.os_admin.servers_client.show_server(server_id)['server']
self.assertNotEqual(self.get_host_for_server(server['id']), host_a)
server = self.os_admin.servers_client.show_server(
server['id'])['server']
self.assertEqual(server['status'], 'SHUTOFF')
def test_evacuate_with_target_host(self):
server_id = self.create_test_server(wait_until="ACTIVE")['id']
host_a = self.get_host_for_server(server_id)
host_b = self.get_host_other_than(server_id)
server = self.create_test_server(wait_until="ACTIVE")
host_a = self.get_host_for_server(server['id'])
host_b = self.get_host_other_than(server['id'])
host_a_svc = clients.NovaServiceManager(
host_a, 'nova-compute', self.os_admin.services_client)
with host_a_svc.stopped():
self.shutdown_server_on_host(server_id, host_a)
self.shutdown_server_domain(server, host_a)
# pass target host
self.evacuate_server(server_id, host=host_b)
self.evacuate_server(server['id'], host=host_b)
self.assertEqual(self.get_host_for_server(server_id), host_b)
server = self.os_admin.servers_client.show_server(server_id)['server']
self.assertEqual(self.get_host_for_server(server['id']), host_b)
server = self.os_admin.servers_client.show_server(
server['id'])['server']
self.assertEqual(server['status'], 'SHUTOFF')
def test_evacuate_attached_vol(self):
@ -93,7 +95,7 @@ class ServerEvacuation(base.BaseWhiteboxComputeTest):
host_a, 'nova-compute', self.os_admin.services_client)
with host_a_svc.stopped():
self.shutdown_server_on_host(server_id, host_a)
self.shutdown_server_domain(server, host_a)
self.evacuate_server(server_id)
self.assertNotEqual(self.get_host_for_server(server_id), host_a)
@ -116,7 +118,7 @@ class ServerEvacuation(base.BaseWhiteboxComputeTest):
vol_id = server['os-extended-volumes:volumes_attached'][0]['id']
with host_a_svc.stopped():
self.shutdown_server_on_host(server_id, host_a)
self.shutdown_server_domain(server, host_a)
self.evacuate_server(server_id)
self.assertNotEqual(self.get_host_for_server(server_id), host_a)
@ -140,8 +142,8 @@ class ServerEvacuationV294(base.BaseWhiteboxComputeTest):
raise cls.skipException(msg)
def test_evacuate_to_active(self):
server_id = self.create_test_server(wait_until="ACTIVE")['id']
host_a = self.get_host_for_server(server_id)
server = self.create_test_server(wait_until="ACTIVE")
host_a = self.get_host_for_server(server['id'])
# set compute service down in host-A
host_a_svc = clients.NovaServiceManager(
@ -150,10 +152,11 @@ class ServerEvacuationV294(base.BaseWhiteboxComputeTest):
with host_a_svc.stopped():
# as compute service is down at src host,
# shutdown server using virsh
self.shutdown_server_on_host(server_id, host_a)
self.evacuate_server(server_id)
self.shutdown_server_domain(server, host_a)
self.evacuate_server(server['id'])
# after evacuation server starts by itself at destination
self.assertNotEqual(self.get_host_for_server(server_id), host_a)
server = self.os_admin.servers_client.show_server(server_id)['server']
self.assertNotEqual(self.get_host_for_server(server['id']), host_a)
server = self.os_admin.servers_client.show_server(
server['id'])['server']
self.assertEqual(server['status'], 'ACTIVE')

View File

@ -238,7 +238,7 @@ class VDPAEvacuateInstance(VDPASmokeTests):
host_a, 'nova-compute', self.os_admin.services_client)
with host_a_svc.stopped():
self.shutdown_server_on_host(server_id, host_a)
self.shutdown_server_domain(server, host_a)
self.evacuate_server(server_id)
self.assertNotEqual(self.get_host_for_server(server_id), host_a)

View File

@ -647,7 +647,7 @@ class VGPUServerEvacuation(VGPUTest):
src_host, 'nova-compute', self.os_admin.services_client)
with host_a_svc.stopped():
self.shutdown_server_on_host(server['id'], src_host)
self.shutdown_server_domain(server, src_host)
self.evacuate_server(server['id'])
self.assertEqual(self.get_host_for_server(server['id']), dest_host)