From d3360948f6dde1639200829a946a434f83a91f6e Mon Sep 17 00:00:00 2001 From: Lucian Petrut Date: Thu, 18 Oct 2018 17:49:22 +0300 Subject: [PATCH] Retry plugging ports when clustered instances start After a while, the Failover Cluster stops retrying to bring failed instances back up. For example, if the CSV is down for more than a few minutes, the cluster groups will be put in the "Failed" state, while the VMs won't be registered on any Hyper-V node. The issue is that we're only handling cluster group owner changes (moved instances). If the admin fixes the underlying problem and manually brings the cluster groups back up, the instances are recreated but we aren't handling this, so their ports won't get reconnected. This change plugs the ports again whenever a clustered instance starts. Closes-Bug: #1799163 Change-Id: I5caa65d7b7922dc9632b18acedaf1aedeec3fcc3 --- compute_hyperv/nova/cluster/clusterops.py | 12 ++++++++++++ compute_hyperv/nova/cluster/driver.py | 6 ++++++ .../tests/unit/cluster/test_clusterops.py | 16 ++++++++++++++++ compute_hyperv/tests/unit/cluster/test_driver.py | 9 +++++++++ 4 files changed, 43 insertions(+) diff --git a/compute_hyperv/nova/cluster/clusterops.py b/compute_hyperv/nova/cluster/clusterops.py index 7b46e1d1..65112e97 100644 --- a/compute_hyperv/nova/cluster/clusterops.py +++ b/compute_hyperv/nova/cluster/clusterops.py @@ -25,6 +25,7 @@ from nova import network from nova import objects from nova import utils from nova.virt import block_device +from nova.virt import event as virtevent from os_win import exceptions as os_win_exc from os_win import utilsfactory from oslo_log import log as logging @@ -230,3 +231,14 @@ class ClusterOps(object): instance.host = new_host instance.node = new_host instance.save(expected_task_state=[None]) + + def instance_state_change_callback(self, event): + if event.transition == virtevent.EVENT_LIFECYCLE_STARTED: + # In some cases, we may not be able to plug the vifs when the + # instances are failed over (e.g.
if the instances end up in + # "failed" state, without actually being registered in Hyper-V, + # being brought back online afterwards) + instance = self._get_instance_by_name(event.name) + nw_info = self._network_api.get_instance_nw_info(self._context, + instance) + self._vmops.plug_vifs(instance, nw_info) diff --git a/compute_hyperv/nova/cluster/driver.py b/compute_hyperv/nova/cluster/driver.py index 5e44ffbf..cbc1a984 100644 --- a/compute_hyperv/nova/cluster/driver.py +++ b/compute_hyperv/nova/cluster/driver.py @@ -31,6 +31,12 @@ class HyperVClusterDriver(driver.HyperVDriver): self._clops.start_failover_listener_daemon() self._clops.reclaim_failovered_instances() + def _set_event_handler_callbacks(self): + super(HyperVClusterDriver, self)._set_event_handler_callbacks() + + self._event_handler.add_callback( + self._clops.instance_state_change_callback) + def spawn(self, context, instance, image_meta, injected_files, admin_password, allocations, network_info=None, block_device_info=None): diff --git a/compute_hyperv/tests/unit/cluster/test_clusterops.py b/compute_hyperv/tests/unit/cluster/test_clusterops.py index d6971ee0..f3590f78 100644 --- a/compute_hyperv/tests/unit/cluster/test_clusterops.py +++ b/compute_hyperv/tests/unit/cluster/test_clusterops.py @@ -20,6 +20,7 @@ from nova.compute import task_states from nova.compute import vm_states from nova.network.neutronv2 import api as network_api from nova import objects +from nova.virt import event as virtevent from os_win import exceptions as os_win_exc from compute_hyperv.nova.cluster import clusterops @@ -50,6 +51,7 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase): self.clusterops = clusterops.ClusterOps() self.clusterops._context = self.context self._clustutils = self.clusterops._clustutils + self._network_api = self.clusterops._network_api def test_get_instance_host(self): mock_instance = fake_instance.fake_instance_obj(self.context) @@ -341,3 +343,17 @@ class 
ClusterOpsTestCase(test_base.HyperVBaseTestCase): self.assertEqual(mock.sentinel.host, mock_instance.host) self.assertEqual(mock.sentinel.host, mock_instance.node) mock_instance.save.assert_called_once_with(expected_task_state=[None]) + + @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name') + def test_instance_state_change_callback(self, mock_get_instance_by_name): + event = mock.Mock(transition=virtevent.EVENT_LIFECYCLE_STARTED) + mock_instance = mock_get_instance_by_name.return_value + + self.clusterops.instance_state_change_callback(event) + + mock_get_instance_by_name.assert_called_once_with(event.name) + self._network_api.get_instance_nw_info.assert_called_once_with( + self.context, mock_instance) + self.clusterops._vmops.plug_vifs.assert_called_once_with( + mock_instance, + self._network_api.get_instance_nw_info.return_value) diff --git a/compute_hyperv/tests/unit/cluster/test_driver.py b/compute_hyperv/tests/unit/cluster/test_driver.py index aaa339ac..db045874 100644 --- a/compute_hyperv/tests/unit/cluster/test_driver.py +++ b/compute_hyperv/tests/unit/cluster/test_driver.py @@ -29,6 +29,7 @@ class HyperVClusterTestCase(test_base.HyperVBaseTestCase): _autospec_classes = [ driver.clusterops.ClusterOps, + base_driver.eventhandler.InstanceEventHandler, base_driver.hostops.api.API, driver.livemigrationops.ClusterLiveMigrationOps, ] @@ -57,6 +58,14 @@ class HyperVClusterTestCase(test_base.HyperVBaseTestCase): self.assertPublicAPISignatures(nova_base_driver.ComputeDriver, driver.HyperVClusterDriver) + def test_set_event_handler_callbacks(self): + self.driver._set_event_handler_callbacks() + + self.driver._event_handler.add_callback.assert_has_calls( + [mock.call(self.driver.emit_event), + mock.call(self.driver._vmops.instance_state_change_callback), + mock.call(self.driver._clops.instance_state_change_callback)]) + @mock.patch.object(base_driver.HyperVDriver, 'spawn') def test_spawn(self, mock_superclass_spawn): self.driver.spawn(self.context, 
mock.sentinel.fake_instance,