libvirt: retry undefining network filters during _post_live_migration

Sometimes post live migration fails because libvirt raises an error
saying the network filter is still in use.

Use the live_migration_retry_count config option (like in
pre-live-migrate) to retry the operation until it succeeds or we
time out.

Also add some debug logging to _post_live_migration in the compute
manager before calling driver.unfilter_instance and driver.cleanup
(the latter also calls unfilter_instance), so that when we hit this
we can see which path we're coming from.

Closes-Bug: #1438803

Change-Id: Idffbe2857fbb23fafab1591dea82f5d64edac4bc
This commit is contained in:
Matt Riedemann 2015-03-31 14:38:47 -07:00
parent 326ebe1aad
commit 20a95915c9
3 changed files with 95 additions and 11 deletions

View File

@ -5357,6 +5357,8 @@ class ComputeManager(manager.Manager):
"live_migration._post.start",
network_info=network_info)
# Releasing security group ingress rule.
LOG.debug('Calling driver.unfilter_instance from _post_live_migration',
instance=instance)
self.driver.unfilter_instance(instance,
network_info)
@ -5386,6 +5388,8 @@ class ComputeManager(manager.Manager):
block_migration, migrate_data)
if do_cleanup:
LOG.debug('Calling driver.cleanup from _post_live_migration',
instance=instance)
self.driver.cleanup(ctxt, instance, network_info,
destroy_disks=destroy_disks,
migrate_data=migrate_data,

View File

@ -17,6 +17,7 @@ import re
import uuid
from xml.dom import minidom
from eventlet import greenthread
from lxml import etree
import mock
from mox3 import mox
@ -515,6 +516,7 @@ class IptablesFirewallTestCase(test.NoDBTestCase):
self.assertEqual(1, len(rules))
@mock.patch.object(firewall, 'libvirt', fakelibvirt)
class NWFilterTestCase(test.NoDBTestCase):
def setUp(self):
super(NWFilterTestCase, self).setUp()
@ -636,6 +638,66 @@ class NWFilterTestCase(test.NoDBTestCase):
self.fw.unfilter_instance(instance_ref, network_info)
self.assertEqual(original_filter_count - len(fakefilter.filters), 1)
@mock.patch.object(fakelibvirt.virConnect, "nwfilterLookupByName")
@mock.patch.object(greenthread, 'sleep')
def test_unfilter_instance_retry_and_error(self, mock_sleep, mock_lookup):
    """An in-use nwfilter is retried and the error is finally raised.

    With live_migration_retry_count set to 2 and undefine() failing on
    every call, the test expects two lookups, two undefine() calls, a
    single sleep(1) in between, and libvirtError raised to the caller.
    """
    self.flags(live_migration_retry_count=2)

    # Every undefine() call reports that the filter is still in use.
    in_use_error = fakelibvirt.libvirtError('nwfilter is in use')
    in_use_error.err = (fakelibvirt.VIR_ERR_OPERATION_INVALID,)
    undefine_mock = mock.Mock(side_effect=in_use_error)
    mock_lookup.return_value = mock.MagicMock(undefine=undefine_mock)

    instance = self._create_instance()
    nw_info = _fake_network_info(self.stubs, 1)

    self.assertRaises(fakelibvirt.libvirtError,
                      self.fw.unfilter_instance,
                      instance, nw_info)

    self.assertEqual(2, mock_lookup.call_count)
    self.assertEqual(2, undefine_mock.call_count)
    mock_sleep.assert_called_once_with(1)
@mock.patch.object(fakelibvirt.virConnect, "nwfilterLookupByName")
@mock.patch.object(greenthread, 'sleep')
def test_unfilter_instance_retry_not_found(self, mock_sleep, mock_lookup):
    """The retry loop exits quietly once the nwfilter is not found.

    The first undefine() call fails with an in-use error and the second
    with a not-found error; unfilter_instance is expected to return
    without raising after one sleep(1). No retry count is set here, so
    the configured default applies.
    """
    in_use_error = fakelibvirt.libvirtError('nwfilter is in use')
    in_use_error.err = (fakelibvirt.VIR_ERR_OPERATION_INVALID,)
    not_found_error = fakelibvirt.libvirtError(
        'no nwfilter with matching name')
    not_found_error.err = (fakelibvirt.VIR_ERR_NO_NWFILTER,)

    # Consecutive undefine() calls raise the errors in this order.
    undefine_mock = mock.Mock(
        side_effect=[in_use_error, not_found_error])
    mock_lookup.return_value = mock.MagicMock(undefine=undefine_mock)

    instance = self._create_instance()
    nw_info = _fake_network_info(self.stubs, 1)

    self.fw.unfilter_instance(instance, nw_info)

    self.assertEqual(2, mock_lookup.call_count)
    self.assertEqual(2, undefine_mock.call_count)
    mock_sleep.assert_called_once_with(1)
@mock.patch.object(fakelibvirt.virConnect, "nwfilterLookupByName")
@mock.patch.object(greenthread, 'sleep')
def test_unfilter_instance_retry_and_pass(self, mock_sleep, mock_lookup):
    """An in-use error is retried and a later successful undefine passes.

    The first undefine() call raises an in-use error and the second
    returns None; the test expects two lookups, two undefine() calls
    and a single sleep(1), with no exception reaching the caller.
    """
    in_use_error = fakelibvirt.libvirtError('nwfilter is in use')
    in_use_error.err = (fakelibvirt.VIR_ERR_OPERATION_INVALID,)

    # Fail once, then succeed on the following call.
    undefine_mock = mock.Mock(side_effect=[in_use_error, None])
    mock_lookup.return_value = mock.MagicMock(undefine=undefine_mock)

    instance = self._create_instance()
    nw_info = _fake_network_info(self.stubs, 1)

    self.fw.unfilter_instance(instance, nw_info)

    self.assertEqual(2, mock_lookup.call_count)
    self.assertEqual(2, undefine_mock.call_count)
    mock_sleep.assert_called_once_with(1)
def test_redefining_nwfilters(self):
fakefilter = NWFilterFakes()
self.fw._conn.nwfilterDefineXML = fakefilter.filterDefineXMLMock

View File

@ -17,6 +17,7 @@
import uuid
from eventlet import greenthread
from lxml import etree
from oslo_config import cfg
from oslo_log import log as logging
@ -31,6 +32,7 @@ from nova.virt import netutils
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
CONF.import_opt('use_ipv6', 'nova.netconf')
CONF.import_opt('live_migration_retry_count', 'nova.compute.manager')
libvirt = None
@ -269,17 +271,33 @@ class NWFilterFirewall(base_firewall.FirewallDriver):
nic_id = vif['address'].replace(':', '')
instance_filter_name = self._instance_filter_name(instance, nic_id)
try:
_nw = self._conn.nwfilterLookupByName(instance_filter_name)
_nw.undefine()
except libvirt.libvirtError as e:
errcode = e.get_error_code()
if errcode == libvirt.VIR_ERR_OPERATION_INVALID:
# This happens when the instance filter is still in
# use (ie. when the instance has not terminated properly)
raise
LOG.debug('The nwfilter(%s) is not found.',
instance_filter_name, instance=instance)
# nwfilters may be defined in a separate thread in the case
# of libvirt non-blocking mode, so we wait for completion
max_retry = CONF.live_migration_retry_count
for cnt in range(max_retry):
try:
_nw = self._conn.nwfilterLookupByName(instance_filter_name)
_nw.undefine()
break
except libvirt.libvirtError as e:
if cnt == max_retry - 1:
raise
errcode = e.get_error_code()
if errcode == libvirt.VIR_ERR_OPERATION_INVALID:
# This happens when the instance filter is still in use
# (ie. when the instance has not terminated properly)
LOG.info(_LI('Failed to undefine network filter '
'%(name)s. Try %(cnt)d of '
'%(max_retry)d.'),
{'name': instance_filter_name,
'cnt': cnt + 1,
'max_retry': max_retry},
instance=instance)
greenthread.sleep(1)
else:
LOG.debug('The nwfilter(%s) is not found.',
instance_filter_name, instance=instance)
break
@staticmethod
def _instance_filter_name(instance, nic_id=None):