From d3360948f6dde1639200829a946a434f83a91f6e Mon Sep 17 00:00:00 2001
From: Lucian Petrut <lpetrut@cloudbasesolutions.com>
Date: Thu, 18 Oct 2018 17:49:22 +0300
Subject: [PATCH] Retry plugging ports when clustered instances start

After a while, the Failover Cluster stops retrying to bring failed
instances back up. For example, if the CSV is down for more than a few
minutes, the cluster groups will be set to the "Failed" state, while
the VMs won't be registered on any Hyper-V node.

The issue is that we're only handling cluster group owner changes
(moved instances). If the admin fixes the issue and manually brings
the cluster groups back up, the instances are recreated but we aren't
handling this, so ports won't get reconnected.

This change retries plugging the ports whenever a clustered instance
starts.

Closes-Bug: #1799163

Change-Id: I5caa65d7b7922dc9632b18acedaf1aedeec3fcc3
---
 compute_hyperv/nova/cluster/clusterops.py        | 12 ++++++++++++
 compute_hyperv/nova/cluster/driver.py            |  6 ++++++
 .../tests/unit/cluster/test_clusterops.py        | 16 ++++++++++++++++
 compute_hyperv/tests/unit/cluster/test_driver.py |  9 +++++++++
 4 files changed, 43 insertions(+)

diff --git a/compute_hyperv/nova/cluster/clusterops.py b/compute_hyperv/nova/cluster/clusterops.py
index 7b46e1d1..65112e97 100644
--- a/compute_hyperv/nova/cluster/clusterops.py
+++ b/compute_hyperv/nova/cluster/clusterops.py
@@ -25,6 +25,7 @@ from nova import network
 from nova import objects
 from nova import utils
 from nova.virt import block_device
+from nova.virt import event as virtevent
 from os_win import exceptions as os_win_exc
 from os_win import utilsfactory
 from oslo_log import log as logging
@@ -230,3 +231,14 @@ class ClusterOps(object):
         instance.host = new_host
         instance.node = new_host
         instance.save(expected_task_state=[None])
+
+    def instance_state_change_callback(self, event):
+        """Re-plug the vifs of a clustered instance once it has started."""
+        if event.transition != virtevent.EVENT_LIFECYCLE_STARTED:
+            return
+        # A failover may leave the vifs unplugged, e.g. when the instance
+        # is "failed" and unregistered in Hyper-V before coming back online.
+        vm = self._get_instance_by_name(event.name)
+        network_info = self._network_api.get_instance_nw_info(
+            self._context, vm)
+        self._vmops.plug_vifs(vm, network_info)
diff --git a/compute_hyperv/nova/cluster/driver.py b/compute_hyperv/nova/cluster/driver.py
index 5e44ffbf..cbc1a984 100644
--- a/compute_hyperv/nova/cluster/driver.py
+++ b/compute_hyperv/nova/cluster/driver.py
@@ -31,6 +31,12 @@ class HyperVClusterDriver(driver.HyperVDriver):
         self._clops.start_failover_listener_daemon()
         self._clops.reclaim_failovered_instances()
 
+    def _set_event_handler_callbacks(self):
+        # Register the base driver callbacks, then the cluster specific one.
+        super(HyperVClusterDriver, self)._set_event_handler_callbacks()
+        cluster_callback = self._clops.instance_state_change_callback
+        self._event_handler.add_callback(cluster_callback)
+
     def spawn(self, context, instance, image_meta, injected_files,
               admin_password, allocations, network_info=None,
               block_device_info=None):
diff --git a/compute_hyperv/tests/unit/cluster/test_clusterops.py b/compute_hyperv/tests/unit/cluster/test_clusterops.py
index d6971ee0..f3590f78 100644
--- a/compute_hyperv/tests/unit/cluster/test_clusterops.py
+++ b/compute_hyperv/tests/unit/cluster/test_clusterops.py
@@ -20,6 +20,7 @@ from nova.compute import task_states
 from nova.compute import vm_states
 from nova.network.neutronv2 import api as network_api
 from nova import objects
+from nova.virt import event as virtevent
 from os_win import exceptions as os_win_exc
 
 from compute_hyperv.nova.cluster import clusterops
@@ -50,6 +51,7 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase):
         self.clusterops = clusterops.ClusterOps()
         self.clusterops._context = self.context
         self._clustutils = self.clusterops._clustutils
+        self._network_api = self.clusterops._network_api
 
     def test_get_instance_host(self):
         mock_instance = fake_instance.fake_instance_obj(self.context)
@@ -341,3 +343,17 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase):
         self.assertEqual(mock.sentinel.host, mock_instance.host)
         self.assertEqual(mock.sentinel.host, mock_instance.node)
         mock_instance.save.assert_called_once_with(expected_task_state=[None])
+
+    @mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
+    def test_instance_state_change_callback(self, mock_get_instance_by_name):
+        # A "started" lifecycle event must get the instance vifs plugged.
+        get_nw_info = self._network_api.get_instance_nw_info
+        instance = mock_get_instance_by_name.return_value
+        event = mock.Mock(transition=virtevent.EVENT_LIFECYCLE_STARTED)
+
+        self.clusterops.instance_state_change_callback(event)
+
+        mock_get_instance_by_name.assert_called_once_with(event.name)
+        get_nw_info.assert_called_once_with(self.context, instance)
+        self.clusterops._vmops.plug_vifs.assert_called_once_with(
+            instance, get_nw_info.return_value)
diff --git a/compute_hyperv/tests/unit/cluster/test_driver.py b/compute_hyperv/tests/unit/cluster/test_driver.py
index aaa339ac..db045874 100644
--- a/compute_hyperv/tests/unit/cluster/test_driver.py
+++ b/compute_hyperv/tests/unit/cluster/test_driver.py
@@ -29,6 +29,7 @@ class HyperVClusterTestCase(test_base.HyperVBaseTestCase):
 
     _autospec_classes = [
         driver.clusterops.ClusterOps,
+        base_driver.eventhandler.InstanceEventHandler,
         base_driver.hostops.api.API,
         driver.livemigrationops.ClusterLiveMigrationOps,
     ]
@@ -57,6 +58,14 @@ class HyperVClusterTestCase(test_base.HyperVBaseTestCase):
         self.assertPublicAPISignatures(nova_base_driver.ComputeDriver,
                                        driver.HyperVClusterDriver)
 
+    def test_set_event_handler_callbacks(self):
+        drv = self.driver
+        drv._set_event_handler_callbacks()
+        expected = (drv.emit_event, drv._vmops.instance_state_change_callback,
+                    drv._clops.instance_state_change_callback)
+        drv._event_handler.add_callback.assert_has_calls(
+            [mock.call(callback) for callback in expected])
+
     @mock.patch.object(base_driver.HyperVDriver, 'spawn')
     def test_spawn(self, mock_superclass_spawn):
         self.driver.spawn(self.context, mock.sentinel.fake_instance,