libvirt: check job status for VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED event
Change Ic5cab99944df9e501ba2032eb96911c36304494d added handling for
the VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED event during live migration
but failed to distinguish between the live migration actually succeeding
or failing before queueing the EVENT_LIFECYCLE_MIGRATION_COMPLETED
event up to the ComputeManager.handle_lifecycle_event method.
As a result, failed live migrations will inadvertently trigger
activation of the port bindings on the destination host, which
deactivates the source host port bindings, and then
_rollback_live_migration will delete those activated dest host port
bindings and leave the source host port bindings deactivated.
In this change, if we get the VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED
event, we attempt to get the job status to determine the course to
take and only queue the EVENT_LIFECYCLE_MIGRATION_COMPLETED event,
which triggers the dest host port activation, if we can determine
the live migration job completed successfully. Otherwise we simply
report the guest as paused, the same as before Ic5cab9994.
Change-Id: I6a4252b0c12c41c233299f30ce8294fef21c7b40
Closes-Bug: #1788014
(cherry picked from commit aa87b9c288)
This commit is contained in:
parent
48bb9a9663
commit
27bfd0bc62
@ -75,6 +75,7 @@ VIR_DOMAIN_EVENT_STOPPED = 5
|
||||
VIR_DOMAIN_EVENT_SHUTDOWN = 6
|
||||
VIR_DOMAIN_EVENT_PMSUSPENDED = 7
|
||||
|
||||
VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED = 1
|
||||
VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY = 7
|
||||
|
||||
VIR_DOMAIN_UNDEFINE_MANAGED_SAVE = 1
|
||||
|
@ -216,7 +216,8 @@ class HostTestCase(test.NoDBTestCase):
|
||||
self.assertEqual(event.EVENT_LIFECYCLE_POSTCOPY_STARTED,
|
||||
expected_event.transition)
|
||||
|
||||
def test_event_lifecycle_callback_suspended_migrated(self):
|
||||
@mock.patch('nova.virt.libvirt.guest.Guest.get_job_info')
|
||||
def test_event_lifecycle_callback_suspended_migrated(self, get_job_info):
|
||||
"""Tests the suspended lifecycle event with libvirt with migrated"""
|
||||
hostimpl = mock.MagicMock()
|
||||
conn = mock.MagicMock()
|
||||
@ -226,22 +227,47 @@ class HostTestCase(test.NoDBTestCase):
|
||||
</domain>
|
||||
"""
|
||||
dom = fakelibvirt.Domain(conn, fake_dom_xml, running=True)
|
||||
# See https://libvirt.org/html/libvirt-libvirt-domain.html for values.
|
||||
VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED = 1
|
||||
with mock.patch.object(host.libvirt,
|
||||
'VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED', new=1,
|
||||
create=True):
|
||||
host.Host._event_lifecycle_callback(
|
||||
conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED,
|
||||
detail=VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED, opaque=hostimpl)
|
||||
jobinfo = libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED)
|
||||
get_job_info.return_value = jobinfo
|
||||
host.Host._event_lifecycle_callback(
|
||||
conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED,
|
||||
detail=fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED,
|
||||
opaque=hostimpl)
|
||||
expected_event = hostimpl._queue_event.call_args[0][0]
|
||||
self.assertEqual(event.EVENT_LIFECYCLE_MIGRATION_COMPLETED,
|
||||
expected_event.transition)
|
||||
get_job_info.assert_called_once_with()
|
||||
|
||||
@mock.patch('nova.virt.libvirt.guest.Guest.get_job_info')
|
||||
@mock.patch('nova.virt.libvirt.migration.find_job_type')
|
||||
def test_event_lifecycle_callback_suspended_migrated_job_failed(
|
||||
self, find_job_type, get_job_info):
|
||||
"""Tests the suspended lifecycle event with libvirt with migrated"""
|
||||
hostimpl = mock.MagicMock()
|
||||
conn = mock.MagicMock()
|
||||
fake_dom_xml = """
|
||||
<domain type='kvm'>
|
||||
<uuid>cef19ce0-0ca2-11df-855d-b19fbce37686</uuid>
|
||||
</domain>
|
||||
"""
|
||||
dom = fakelibvirt.Domain(conn, fake_dom_xml, running=True)
|
||||
jobinfo = libvirt_guest.JobInfo(type=fakelibvirt.VIR_DOMAIN_JOB_NONE)
|
||||
get_job_info.return_value = jobinfo
|
||||
# If the job type is VIR_DOMAIN_JOB_NONE we'll attempt to figure out
|
||||
# the actual job status, so in this case we mock it to be a failure.
|
||||
find_job_type.return_value = fakelibvirt.VIR_DOMAIN_JOB_FAILED
|
||||
host.Host._event_lifecycle_callback(
|
||||
conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED,
|
||||
detail=fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED,
|
||||
opaque=hostimpl)
|
||||
expected_event = hostimpl._queue_event.call_args[0][0]
|
||||
# FIXME(mriedem): This should be EVENT_LIFECYCLE_MIGRATION_COMPLETED
|
||||
# once bug 1788014 is fixed and we properly check job status for the
|
||||
# VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED case.
|
||||
# self.assertEqual(event.EVENT_LIFECYCLE_MIGRATION_COMPLETED,
|
||||
# expected_event.transition)
|
||||
self.assertEqual(event.EVENT_LIFECYCLE_PAUSED,
|
||||
expected_event.transition)
|
||||
get_job_info.assert_called_once_with()
|
||||
find_job_type.assert_called_once_with(
|
||||
test.MatchType(libvirt_guest.Guest), instance=None,
|
||||
logging_ok=False)
|
||||
|
||||
def test_event_emit_delayed_call_delayed(self):
|
||||
ev = event.LifecycleEvent(
|
||||
|
@ -1039,6 +1039,14 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
|
||||
self.assertEqual(migration.find_job_type(self.guest, self.instance),
|
||||
fakelibvirt.VIR_DOMAIN_JOB_FAILED)
|
||||
|
||||
@mock.patch('nova.virt.libvirt.migration.LOG',
|
||||
new_callable=mock.NonCallableMock) # asserts not called
|
||||
@mock.patch('nova.virt.libvirt.guest.Guest.is_active', return_value=True)
|
||||
def test_live_migration_find_type_no_logging(self, mock_active, _mock_log):
|
||||
self.assertEqual(fakelibvirt.VIR_DOMAIN_JOB_FAILED,
|
||||
migration.find_job_type(self.guest, self.instance,
|
||||
logging_ok=False))
|
||||
|
||||
def test_live_migration_abort_too_long(self):
|
||||
# Elapsed time is over completion timeout
|
||||
self.assertTrue(migration.should_trigger_timeout_action(
|
||||
|
@ -58,6 +58,7 @@ from nova import utils
|
||||
from nova.virt import event as virtevent
|
||||
from nova.virt.libvirt import config as vconfig
|
||||
from nova.virt.libvirt import guest as libvirt_guest
|
||||
from nova.virt.libvirt import migration as libvirt_migrate
|
||||
from nova.virt.libvirt import utils as libvirt_utils
|
||||
|
||||
libvirt = None
|
||||
@ -214,12 +215,27 @@ class Host(object):
|
||||
elif event == libvirt.VIR_DOMAIN_EVENT_SUSPENDED:
|
||||
if detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY:
|
||||
transition = virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED
|
||||
# FIXME(mriedem): VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED is also sent
|
||||
# when live migration of the guest fails, so we cannot simply rely
|
||||
# on the event itself but need to check if the job itself was
|
||||
# successful.
|
||||
# elif detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED:
|
||||
# transition = virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED
|
||||
elif detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED:
|
||||
# VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED is also sent when live
|
||||
# migration of the guest fails, so we cannot simply rely
|
||||
# on the event itself but need to check if the job itself was
|
||||
# successful.
|
||||
# NOTE(mriedem): The job check logic here is copied from
|
||||
# LibvirtDriver._live_migration_monitor.
|
||||
guest = libvirt_guest.Guest(dom)
|
||||
info = guest.get_job_info()
|
||||
if info.type == libvirt.VIR_DOMAIN_JOB_NONE:
|
||||
# Either still running, or failed or completed,
|
||||
# lets untangle the mess.
|
||||
info.type = libvirt_migrate.find_job_type(
|
||||
guest, instance=None, logging_ok=False)
|
||||
|
||||
if info.type == libvirt.VIR_DOMAIN_JOB_COMPLETED:
|
||||
transition = virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED
|
||||
else:
|
||||
# Failed or some other status we don't know about, so just
|
||||
# opt to report the guest is paused.
|
||||
transition = virtevent.EVENT_LIFECYCLE_PAUSED
|
||||
else:
|
||||
transition = virtevent.EVENT_LIFECYCLE_PAUSED
|
||||
elif event == libvirt.VIR_DOMAIN_EVENT_RESUMED:
|
||||
|
@ -384,11 +384,13 @@ def _update_vif_xml(xml_doc, migrate_data, get_vif_config):
|
||||
return xml_doc
|
||||
|
||||
|
||||
def find_job_type(guest, instance):
|
||||
def find_job_type(guest, instance, logging_ok=True):
|
||||
"""Determine the (likely) current migration job type
|
||||
|
||||
:param guest: a nova.virt.libvirt.guest.Guest
|
||||
:param instance: a nova.objects.Instance
|
||||
:param logging_ok: If logging in this method is OK. If called from a
|
||||
native thread then logging is generally prohibited.
|
||||
|
||||
Annoyingly when job type == NONE and migration is
|
||||
no longer running, we don't know whether we stopped
|
||||
@ -398,25 +400,29 @@ def find_job_type(guest, instance):
|
||||
|
||||
:returns: a libvirt job type constant
|
||||
"""
|
||||
def _log(func, msg, *args, **kwargs):
|
||||
if logging_ok:
|
||||
func(msg, *args, **kwargs)
|
||||
|
||||
try:
|
||||
if guest.is_active():
|
||||
LOG.debug("VM running on src, migration failed",
|
||||
instance=instance)
|
||||
_log(LOG.debug, "VM running on src, migration failed",
|
||||
instance=instance)
|
||||
return libvirt.VIR_DOMAIN_JOB_FAILED
|
||||
else:
|
||||
LOG.debug("VM is shutoff, migration finished",
|
||||
instance=instance)
|
||||
_log(LOG.debug, "VM is shutoff, migration finished",
|
||||
instance=instance)
|
||||
return libvirt.VIR_DOMAIN_JOB_COMPLETED
|
||||
except libvirt.libvirtError as ex:
|
||||
LOG.debug("Error checking domain status %(ex)s",
|
||||
{"ex": ex}, instance=instance)
|
||||
_log(LOG.debug, "Error checking domain status %(ex)s", {"ex": ex},
|
||||
instance=instance)
|
||||
if ex.get_error_code() == libvirt.VIR_ERR_NO_DOMAIN:
|
||||
LOG.debug("VM is missing, migration finished",
|
||||
instance=instance)
|
||||
_log(LOG.debug, "VM is missing, migration finished",
|
||||
instance=instance)
|
||||
return libvirt.VIR_DOMAIN_JOB_COMPLETED
|
||||
else:
|
||||
LOG.info("Error %(ex)s, migration failed",
|
||||
{"ex": ex}, instance=instance)
|
||||
_log(LOG.info, "Error %(ex)s, migration failed", {"ex": ex},
|
||||
instance=instance)
|
||||
return libvirt.VIR_DOMAIN_JOB_FAILED
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user