Improve clustered instance failover handling

Instances can bounce between hosts many times in a short interval,
especially if the CSVs (Cluster Shared Volumes) go down (as many as
20 times in less than 2 minutes).

We're not handling this properly. The failover handling logic is
prone to race conditions: multiple hosts may attempt to claim the
same instance, leaving it in an inconsistent state.

We're introducing distributed locks, preventing races between hosts.
We're also validating the failover events, as the instance may have
moved again by the time we process the event.
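
As a rough illustration, a cross-host lock decorator such as
@coordination.synchronized('failover-{instance_name}') can be built on
top of tooz along the following lines. This is a simplified sketch, not
the actual compute_hyperv.nova.coordination module: the backend URL,
member id handling and error handling shown here are assumptions.

    import functools
    import inspect
    import uuid

    from tooz import coordination


    class Coordinator(object):
        """Thin wrapper around a tooz coordinator (illustrative only)."""

        def __init__(self, backend_url='file:///tmp/compute-hyperv-locks'):
            # Assumed default; in practice the backend URL would come from
            # configuration (e.g. an etcd, mysql or file based backend).
            self._backend_url = backend_url
            self._coordinator = None

        def start(self):
            member_id = uuid.uuid4().hex.encode('ascii')
            self._coordinator = coordination.get_coordinator(
                self._backend_url, member_id)
            self._coordinator.start()

        def stop(self):
            if self._coordinator:
                self._coordinator.stop()
                self._coordinator = None

        def get_lock(self, name):
            # Every host contending for the same instance uses the same
            # lock name, making the lock effectively cluster wide.
            return self._coordinator.get_lock(name.encode('ascii'))


    COORDINATOR = Coordinator()


    def synchronized(lock_name_template):
        """Serialize calls across hosts, naming the lock after the args."""
        def wrapper(func):
            @functools.wraps(func)
            def inner(*args, **kwargs):
                # Build the lock name from the decorated function's call
                # arguments, e.g. 'failover-instance-00000001'.
                call_args = inspect.getcallargs(func, *args, **kwargs)
                lock_name = lock_name_template.format(**call_args)
                with COORDINATOR.get_lock(lock_name):
                    return func(*args, **kwargs)
            return inner
        return wrapper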

A distributed lock backend will have to be configured.
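
For example, assuming the lock options follow the pattern used by other
OpenStack projects' coordination modules (a [coordination] config section
with a backend_url option, which is an assumption here rather than
something spelled out in this patch), an etcd based setup could look like:

    [coordination]
    # Any tooz supported backend may be used, e.g. etcd, mysql, zookeeper
    # or file based locks (the latter only being suitable when all the
    # contending services run on the same node).
    backend_url = etcd3+http://127.0.0.1:2379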

We're also now waiting for "pending" cluster groups: while a group is
in this state, the VM may not even be registered in Hyper-V, so any
action we take on it would fail.

Closes-Bug: #1795299
Closes-Bug: #1796673

Change-Id: I3dbdcf208bb7a96bd516b41e4725a5fcb37280d6
Lucian Petrut 2018-10-09 17:20:16 +03:00
parent c96671885f
commit 9f5628e55b
7 changed files with 150 additions and 13 deletions


@ -16,6 +16,7 @@
"""Management class for Cluster VM operations."""
import functools
import time
from nova.compute import power_state
from nova.compute import task_states
@ -26,11 +27,13 @@ from nova import objects
from nova import utils
from nova.virt import block_device
from nova.virt import event as virtevent
from os_win import constants as os_win_const
from os_win import exceptions as os_win_exc
from os_win import utilsfactory
from oslo_log import log as logging
import compute_hyperv.nova.conf
from compute_hyperv.nova import coordination
from compute_hyperv.nova import hostops
from compute_hyperv.nova import serialconsoleops
from compute_hyperv.nova import vmops
@ -106,6 +109,7 @@ class ClusterOps(object):
                instance.name, instance.host,
                self._this_node)

    @coordination.synchronized('failover-{instance_name}')
    def _failover_migrate(self, instance_name, new_host):
        """This method will check if the generated event is a legitimate
        failover to this node. If it is, it will proceed to prepare the
@ -126,6 +130,31 @@ class ClusterOps(object):
                     'new_host': new_host,
                     'old_host': old_host})

        # While the cluster group is in "pending" state, it may not even be
        # registered in Hyper-V, so there's not much we can do. We'll have to
        # wait for it to be handled by the Failover Cluster service.
        self._wait_for_pending_instance(instance_name)

        current_host = self._clustutils.get_vm_host(instance_name)
        instance_moved_again = current_host.upper() != new_host.upper()
        if instance_moved_again:
            LOG.warning("While processing instance %(instance)s failover to "
                        "%(host)s, it has moved to %(current_host)s.",
                        dict(host=new_host,
                             current_host=current_host,
                             instance=instance_name))
            new_host = current_host

        host_changed = old_host.upper() != new_host.upper()
        migrated_here = new_host.upper() == self._this_node.upper()
        migrated_from_here = old_host.upper() == self._this_node.upper()

        if not host_changed:
            LOG.warning("The source node is the same as the destination "
                        "node: %(host)s. The instance %(instance)s may have "
                        "bounced between hosts due to a failure.",
                        dict(host=old_host, instance=instance_name))

        if instance.task_state == task_states.MIGRATING:
            # In case of live migration triggered by the user, we get the
            # event that the instance changed host but we do not want
@ -135,11 +164,11 @@ class ClusterOps(object):
        nw_info = self._network_api.get_instance_nw_info(self._context,
                                                         instance)
        if old_host and old_host.upper() == self._this_node.upper():
            LOG.debug('Actions at source node.')
        if host_changed and migrated_from_here:
            LOG.debug('Cleaning up moved instance: %s.', instance_name)
            self._vmops.unplug_vifs(instance, nw_info)
            return
        elif new_host.upper() != self._this_node.upper():
        if not migrated_here:
            LOG.debug('Instance %s did not failover to this node.',
                      instance_name)
            return
@ -148,10 +177,26 @@ class ClusterOps(object):
                 {'instance': instance_name})
        self._nova_failover_server(instance, new_host)
        self._failover_migrate_networks(instance, old_host)
        if host_changed:
            self._failover_migrate_networks(instance, old_host)
        self._vmops.plug_vifs(instance, nw_info)
        self._serial_console_ops.start_console_handler(instance_name)

    def _wait_for_pending_instance(self, instance_name):
        # TODO(lpetrut): switch to an event listener. We'd probably want to
        # avoid having one event listener per failed over instance, as there
        # can be many of them.
        group_state = self._clustutils.get_cluster_group_state_info(
            instance_name)['state']
        while group_state == os_win_const.CLUSTER_GROUP_PENDING:
            LOG.debug("Waiting for pending instance cluster group: %s",
                      instance_name)
            time.sleep(2)

            group_state = self._clustutils.get_cluster_group_state_info(
                instance_name)['state']

    def _failover_migrate_networks(self, instance, source):
        """This is called after a VM failovered to this node.

        This will change the owner of the neutron ports to this node.


@ -21,6 +21,8 @@ from compute_hyperv.nova import driver
class HyperVClusterDriver(driver.HyperVDriver):
    use_coordination = True

    def __init__(self, virtapi):
        super(HyperVClusterDriver, self).__init__(virtapi)


@ -30,6 +30,7 @@ from os_win import utilsfactory
from oslo_log import log as logging
import six
from compute_hyperv.nova import coordination
from compute_hyperv.nova import eventhandler
from compute_hyperv.nova import hostops
from compute_hyperv.nova import imagecache
@ -108,6 +109,8 @@ class HyperVDriver(driver.ComputeDriver):
"supports_trusted_certs": True,
}
use_coordination = False
def __init__(self, virtapi):
# check if the current version of Windows is supported before any
# further driver initialisation.
@ -151,6 +154,9 @@ class HyperVDriver(driver.ComputeDriver):
        return False

    def init_host(self, host):
        if self.use_coordination:
            coordination.COORDINATOR.start()

        self._serialconsoleops.start_console_handlers()
        self._set_event_handler_callbacks()


@ -85,6 +85,7 @@ class NoDBTestCase(base.BaseTestCase):
"""
TIMEOUT_SCALING_FACTOR = 1
MOCK_TOOZ = True
def setUp(self):
"""Run before each test method to initialize test environment."""
@ -111,6 +112,11 @@ class NoDBTestCase(base.BaseTestCase):
self.useFixture(nova_fixtures.PoisonFunctions())
if self.MOCK_TOOZ:
self.patch('compute_hyperv.nova.coordination.Coordinator.start')
self.patch('compute_hyperv.nova.coordination.Coordinator.stop')
self.patch('compute_hyperv.nova.coordination.Coordinator.get_lock')
def _clear_attrs(self):
# Delete attributes that don't start with _ so they don't pin
# memory around unnecessarily for the duration of the test
@ -124,6 +130,12 @@ class NoDBTestCase(base.BaseTestCase):
for k, v in six.iteritems(kw):
CONF.set_override(k, v, group)
def patch(self, path, *args, **kwargs):
patcher = mock.patch(path, *args, **kwargs)
result = patcher.start()
self.addCleanup(patcher.stop)
return result
def assertPublicAPISignatures(self, baseinst, inst):
def get_public_apis(inst):
methods = {}


@ -21,6 +21,7 @@ from nova.compute import vm_states
from nova.network.neutronv2 import api as network_api
from nova import objects
from nova.virt import event as virtevent
from os_win import constants as os_win_const
from os_win import exceptions as os_win_exc
from compute_hyperv.nova.cluster import clusterops
@ -144,10 +145,12 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase):
mock_instance1.name, mock_instance1.host,
self.clusterops._this_node)
@mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance')
@mock.patch.object(clusterops, 'LOG')
@mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
def test_failover_migrate_no_instance(self, mock_get_instance_by_name,
mock_LOG):
mock_LOG,
mock_wait_pending_instance):
mock_get_instance_by_name.return_value = None
self.clusterops._failover_migrate(mock.sentinel.instance_name,
@ -159,35 +162,40 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase):
self.assertFalse(
self.clusterops._network_api.get_instance_nw_info.called)
@mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance')
@mock.patch.object(clusterops, 'LOG')
@mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
def test_failover_migrate_migrating(self, mock_get_instance_by_name,
mock_LOG):
mock_LOG, mock_wait_pending_instance):
instance = mock_get_instance_by_name.return_value
instance.task_state = task_states.MIGRATING
self.clusterops._failover_migrate(mock.sentinel.instance_name,
mock.sentinel.new_host)
'new_host')
mock_LOG.debug.assert_called_once_with(
'Instance %s is live migrating.', mock.sentinel.instance_name)
@mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance')
@mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
def test_failover_migrate_at_source_node(self, mock_get_instance_by_name):
def test_failover_migrate_at_source_node(self, mock_get_instance_by_name,
mock_wait_pending_instance):
instance = mock_get_instance_by_name.return_value
instance.host = 'old_host'
self.clusterops._this_node = instance.host
self.clusterops._failover_migrate(mock.sentinel.instance_name,
mock.sentinel.new_host)
'new_host')
self.clusterops._vmops.unplug_vifs.assert_called_once_with(instance,
self.clusterops._network_api.get_instance_nw_info.return_value)
@mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance')
@mock.patch.object(clusterops, 'LOG')
@mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
def test_failover_migrate_not_this_node(self, mock_get_instance_by_name,
mock_LOG):
mock_LOG,
mock_wait_pending_instance):
self.clusterops._this_node = 'new_host'
self.clusterops._failover_migrate(mock.sentinel.instance_name,
@ -197,21 +205,28 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase):
'Instance %s did not failover to this node.',
mock.sentinel.instance_name)
@mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance')
@mock.patch.object(clusterops.ClusterOps, '_failover_migrate_networks')
@mock.patch.object(clusterops.ClusterOps, '_nova_failover_server')
@mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
def test_failover_migrate_this_node(self, mock_get_instance_by_name,
mock_nova_failover_server,
mock_failover_migrate_networks):
def test_failover_migrate_changed_host(self, mock_get_instance_by_name,
mock_nova_failover_server,
mock_failover_migrate_networks,
mock_wait_pending_instance):
instance = mock_get_instance_by_name.return_value
old_host = 'old_host'
new_host = 'new_host'
instance.host = old_host
self.clusterops._this_node = new_host
self._clustutils.get_vm_host.return_value = new_host
self.clusterops._failover_migrate(mock.sentinel.instance_name,
new_host)
mock_wait_pending_instance.assert_called_once_with(
mock.sentinel.instance_name)
self._clustutils.get_vm_host.assert_called_once_with(
mock.sentinel.instance_name)
mock_get_instance_by_name.assert_called_once_with(
mock.sentinel.instance_name)
get_inst_nw_info = self.clusterops._network_api.get_instance_nw_info
@ -225,6 +240,50 @@ class ClusterOpsTestCase(test_base.HyperVBaseTestCase):
c_handler = self.clusterops._serial_console_ops.start_console_handler
c_handler.assert_called_once_with(mock.sentinel.instance_name)
@mock.patch.object(clusterops.ClusterOps, '_wait_for_pending_instance')
@mock.patch.object(clusterops.ClusterOps, '_failover_migrate_networks')
@mock.patch.object(clusterops.ClusterOps, '_nova_failover_server')
@mock.patch.object(clusterops.ClusterOps, '_get_instance_by_name')
def test_failover_same_node(self, mock_get_instance_by_name,
mock_nova_failover_server,
mock_failover_migrate_networks,
mock_wait_pending_instance):
# In some cases, the instances may bounce between hosts. We're testing
# the case in which the instance is actually returning to the initial
# host during the time in which we're processing events.
instance = mock_get_instance_by_name.return_value
old_host = 'old_host'
new_host = 'new_host'
instance.host = old_host
self.clusterops._this_node = old_host
self._clustutils.get_vm_host.return_value = old_host
self.clusterops._failover_migrate(mock.sentinel.instance_name,
new_host)
get_inst_nw_info = self.clusterops._network_api.get_instance_nw_info
get_inst_nw_info.assert_called_once_with(self.clusterops._context,
instance)
mock_nova_failover_server.assert_called_once_with(instance, old_host)
self.clusterops._vmops.unplug_vifs.assert_not_called()
self.clusterops._vmops.plug_vifs.assert_called_once_with(
instance, get_inst_nw_info.return_value)
mock_failover_migrate_networks.assert_not_called()
c_handler = self.clusterops._serial_console_ops.start_console_handler
c_handler.assert_called_once_with(mock.sentinel.instance_name)
@mock.patch('time.sleep')
def test_wait_for_pending_instance(self, mock_sleep):
self._clustutils.get_cluster_group_state_info.side_effect = [
dict(state=os_win_const.CLUSTER_GROUP_PENDING),
dict(state=os_win_const.CLUSTER_GROUP_ONLINE)]
self.clusterops._wait_for_pending_instance(mock.sentinel.instance_name)
self._clustutils.get_cluster_group_state_info.assert_has_calls(
[mock.call(mock.sentinel.instance_name)] * 2)
mock_sleep.assert_called_once_with(2)
def test_failover_migrate_networks(self):
mock_instance = fake_instance.fake_instance_obj(self.context)
fake_source = mock.MagicMock()


@ -50,6 +50,8 @@ class MockToozLock(tooz.locking.Lock):
@mock.patch('tooz.coordination.get_coordinator')
class CoordinatorTestCase(test_base.HyperVBaseTestCase):
MOCK_TOOZ = False
def test_coordinator_start(self, get_coordinator):
crd = get_coordinator.return_value


@ -0,0 +1,11 @@
---
upgrade:
  - |
    When using the cluster driver, a distributed lock backend will have to be
    configured.
fixes:
  - |
    In order to fix race conditions that can occur when handling instance
    failovers, the cluster driver is now using distributed locks. A
    distributed lock backend (e.g. etcd, mysql, file based, etc) will have to
    be configured.