Merge "libvirt: Rework 'EBUSY' (SIGKILL) error handling code path"

This commit is contained in:
Zuul 2019-05-31 15:41:26 +00:00 committed by Gerrit Code Review
commit 0793b4f641
2 changed files with 67 additions and 13 deletions

View File

@ -14882,7 +14882,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
instance)
def test_private_destroy_ebusy_timeout(self):
# Tests that _destroy will retry 3 times to destroy the guest when an
# Tests that _destroy will retry 6 times to destroy the guest when an
# EBUSY is raised, but eventually times out and raises the libvirtError
ex = fakelibvirt.make_libvirtError(
fakelibvirt.libvirtError,
@ -14902,7 +14902,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
instance)
self.assertEqual(3, mock_guest.poweroff.call_count)
self.assertEqual(6, mock_guest.poweroff.call_count)
def test_private_destroy_ebusy_multiple_attempt_ok(self):
# Tests that the _destroy attempt loop is broken when EBUSY is no
@ -14928,6 +14928,37 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.assertEqual(2, mock_guest.poweroff.call_count)
@mock.patch.object(libvirt_driver.LOG, 'warning')
@mock.patch.object(fakelibvirt.Connection, 'getLibVersion',
return_value=versionutils.convert_version_to_int(
libvirt_driver.MIN_LIBVIRT_BETTER_SIGKILL_HANDLING))
def test_min_libvirt_better_sigkill_handling_warning(self,
mock_warning,
mock_get_libversion):
ex = fakelibvirt.make_libvirtError(
fakelibvirt.libvirtError,
("Failed to terminate process 26425 with SIGKILL: "
"Device or resource busy"),
error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
int1=errno.EBUSY)
mock_guest = mock.Mock(libvirt_guest.Guest, id=1)
mock_guest.poweroff = mock.Mock(side_effect=ex)
instance = objects.Instance(**self.test_instance)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
with mock.patch.object(drvr._host, 'get_guest',
return_value=mock_guest):
raised = self.assertRaises(fakelibvirt.libvirtError,
drvr._destroy,
instance)
self.assertEqual(fakelibvirt.VIR_ERR_SYSTEM_ERROR,
raised.get_error_code())
mock_warning.assert_called_once()
mock_guest.poweroff.assert_called_once()
@mock.patch.object(fakelibvirt.libvirtError, 'get_error_code')
@mock.patch.object(host.Host, '_get_domain',
side_effect=exception.InstanceNotFound(

View File

@ -282,6 +282,11 @@ MIN_QEMU_FILE_BACKED_DISCARD_VERSION = (2, 10, 0)
MIN_LIBVIRT_NATIVE_TLS_VERSION = (4, 4, 0)
MIN_QEMU_NATIVE_TLS_VERSION = (2, 11, 0)
# If the host has this libvirt version, then we skip the retry loop of
# instance destroy() call, as libvirt itself increased the wait time
# before the SIGKILL signal takes effect.
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING = (4, 7, 0)
VGPU_RESOURCE_SEMAPHORE = "vgpu_resources"
@ -1005,18 +1010,36 @@ class LibvirtDriver(driver.ComputeDriver):
# steal time from the cloud host. ie 15 wallclock
# seconds may have passed, but the VM might have only
# have a few seconds of scheduled run time.
LOG.warning('Error from libvirt during destroy. '
'Code=%(errcode)s Error=%(e)s; '
'attempt %(attempt)d of 3',
{'errcode': errcode, 'e': e,
'attempt': attempt},
instance=instance)
#
# TODO(kchamart): Once MIN_LIBVIRT_VERSION
# reaches v4.7.0, (a) rewrite the above note,
# and (b) remove the following code that retries
# _destroy() API call (which gives SIGKILL 30
# seconds to take effect) -- because from v4.7.0
# onwards, libvirt _automatically_ increases the
# timeout to 30 seconds. This was added in the
# following libvirt commits:
#
# - 9a4e4b942 (process: wait longer 5->30s on
# hard shutdown)
#
# - be2ca0444 (process: wait longer on kill
# per assigned Hostdev)
with excutils.save_and_reraise_exception() as ctxt:
# Try up to 3 times before giving up.
if attempt < 3:
ctxt.reraise = False
self._destroy(instance, attempt + 1)
return
if not self._host.has_min_version(
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING):
LOG.warning('Error from libvirt during '
'destroy. Code=%(errcode)s '
'Error=%(e)s; attempt '
'%(attempt)d of 6 ',
{'errcode': errcode, 'e': e,
'attempt': attempt},
instance=instance)
# Try up to 6 times before giving up.
if attempt < 6:
ctxt.reraise = False
self._destroy(instance, attempt + 1)
return
if not is_okay:
with excutils.save_and_reraise_exception():