libvirt: Rework 'EBUSY' (SIGKILL) error handling code path

Change ID I128bf6b939 (libvirt: handle code=38 + sigkill (ebusy) in
_destroy()) handled the case where a QEMU process "refuses to die" within
a given timeout period set by libvirt.

Originally, libvirt sent SIGTERM (allowing the process to clean-up
resources), then waited 10 seconds, if the guest didn't go away.  Then
it sent, the more lethal, SIGKILL and waited another 5 seconds for it to
take effect.

From libvirt v4.7.0 onwards, libvirt increased[1][2] the time it waits
for a guest hard shutdown to complete.  It now waits for 30 seconds for
SIGKILL to work (instead of 5).  Also, additional wait time is added if
there are assigned PCI devices, as some of those tend to slow things
down.

In this change:

  - Increment the counter to retry the _destroy() call from 3 to 6, thus
    increasing the total time from 15 to 30 seconds, before SIGKILL
    takes effect.  And it matches the (more graceful) behaviour of
    libvirt v4.7.0.  This also gives breathing room for Nova instances
    running in environments with large compute nodes with high instance
    creation or delete churn, where the current timout may not be
    sufficient.

  - Retry the _destroy() API call _only_ if MIN_LIBVIRT_VERSION is lower
    than 4.7.0.

[1] https://libvirt.org/git/?p=libvirt.git;a=commitdiff;h=9a4e4b9
    (process: wait longer 5->30s on hard shutdown)
[2] https://libvirt.org/git/?p=libvirt.git;a=commit;h=be2ca04 ("process:
    wait longer on kill per assigned Hostdev")

Related-bug: #1353939

Change-Id: If2035cac931c42c440d61ba97ebc7e9e92141a28
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
This commit is contained in:
Kashyap Chamarthy 2019-02-25 13:26:24 +01:00
parent d4f58f5eb6
commit 10d50ca4e2
2 changed files with 67 additions and 13 deletions

View File

@ -14883,7 +14883,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
instance)
def test_private_destroy_ebusy_timeout(self):
# Tests that _destroy will retry 3 times to destroy the guest when an
# Tests that _destroy will retry 6 times to destroy the guest when an
# EBUSY is raised, but eventually times out and raises the libvirtError
ex = fakelibvirt.make_libvirtError(
fakelibvirt.libvirtError,
@ -14903,7 +14903,7 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.assertRaises(fakelibvirt.libvirtError, drvr._destroy,
instance)
self.assertEqual(3, mock_guest.poweroff.call_count)
self.assertEqual(6, mock_guest.poweroff.call_count)
def test_private_destroy_ebusy_multiple_attempt_ok(self):
# Tests that the _destroy attempt loop is broken when EBUSY is no
@ -14929,6 +14929,37 @@ class LibvirtConnTestCase(test.NoDBTestCase,
self.assertEqual(2, mock_guest.poweroff.call_count)
@mock.patch.object(libvirt_driver.LOG, 'warning')
@mock.patch.object(fakelibvirt.Connection, 'getLibVersion',
return_value=versionutils.convert_version_to_int(
libvirt_driver.MIN_LIBVIRT_BETTER_SIGKILL_HANDLING))
def test_min_libvirt_better_sigkill_handling_warning(self,
mock_warning,
mock_get_libversion):
ex = fakelibvirt.make_libvirtError(
fakelibvirt.libvirtError,
("Failed to terminate process 26425 with SIGKILL: "
"Device or resource busy"),
error_code=fakelibvirt.VIR_ERR_SYSTEM_ERROR,
int1=errno.EBUSY)
mock_guest = mock.Mock(libvirt_guest.Guest, id=1)
mock_guest.poweroff = mock.Mock(side_effect=ex)
instance = objects.Instance(**self.test_instance)
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
with mock.patch.object(drvr._host, 'get_guest',
return_value=mock_guest):
raised = self.assertRaises(fakelibvirt.libvirtError,
drvr._destroy,
instance)
self.assertEqual(fakelibvirt.VIR_ERR_SYSTEM_ERROR,
raised.get_error_code())
mock_warning.assert_called_once()
mock_guest.poweroff.assert_called_once()
@mock.patch.object(fakelibvirt.libvirtError, 'get_error_code')
@mock.patch.object(host.Host, '_get_domain',
side_effect=exception.InstanceNotFound(

View File

@ -282,6 +282,11 @@ MIN_QEMU_FILE_BACKED_DISCARD_VERSION = (2, 10, 0)
MIN_LIBVIRT_NATIVE_TLS_VERSION = (4, 4, 0)
MIN_QEMU_NATIVE_TLS_VERSION = (2, 11, 0)
# If the host has this libvirt version, then we skip the retry loop of
# instance destroy() call, as libvirt itself increased the wait time
# before the SIGKILL signal takes effect.
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING = (4, 7, 0)
VGPU_RESOURCE_SEMAPHORE = "vgpu_resources"
@ -1005,18 +1010,36 @@ class LibvirtDriver(driver.ComputeDriver):
# steal time from the cloud host. ie 15 wallclock
# seconds may have passed, but the VM might have only
# have a few seconds of scheduled run time.
LOG.warning('Error from libvirt during destroy. '
'Code=%(errcode)s Error=%(e)s; '
'attempt %(attempt)d of 3',
{'errcode': errcode, 'e': e,
'attempt': attempt},
instance=instance)
#
# TODO(kchamart): Once MIN_LIBVIRT_VERSION
# reaches v4.7.0, (a) rewrite the above note,
# and (b) remove the following code that retries
# _destroy() API call (which gives SIGKILL 30
# seconds to take effect) -- because from v4.7.0
# onwards, libvirt _automatically_ increases the
# timeout to 30 seconds. This was added in the
# following libvirt commits:
#
# - 9a4e4b942 (process: wait longer 5->30s on
# hard shutdown)
#
# - be2ca0444 (process: wait longer on kill
# per assigned Hostdev)
with excutils.save_and_reraise_exception() as ctxt:
# Try up to 3 times before giving up.
if attempt < 3:
ctxt.reraise = False
self._destroy(instance, attempt + 1)
return
if not self._host.has_min_version(
MIN_LIBVIRT_BETTER_SIGKILL_HANDLING):
LOG.warning('Error from libvirt during '
'destroy. Code=%(errcode)s '
'Error=%(e)s; attempt '
'%(attempt)d of 6 ',
{'errcode': errcode, 'e': e,
'attempt': attempt},
instance=instance)
# Try up to 6 times before giving up.
if attempt < 6:
ctxt.reraise = False
self._destroy(instance, attempt + 1)
return
if not is_okay:
with excutils.save_and_reraise_exception():