From 19cb8280232fd3b0ba0000a475d061ea9fb10e1a Mon Sep 17 00:00:00 2001
From: Jim Rollenhagen
Date: Wed, 13 Feb 2019 12:59:53 -0500
Subject: [PATCH] ironic: check fresh data when sync_power_state doesn't line up

We return cached data to sync_power_state to avoid pummeling the
ironic API. However, this can lead to a race condition where an
instance is powered on, but nova thinks it should be off and calls
stop(). Check again without the cache when this happens to make sure
we don't unnecessarily kill an instance.

Closes-Bug: #1815791
Change-Id: I907b69eb689cf6c169a4869cfc7889308ca419d5
---
 nova/compute/manager.py                      | 75 ++++++++++++-------
 nova/tests/unit/compute/test_compute_mgr.py  | 23 +++++-
 nova/tests/unit/virt/ironic/test_driver.py   | 28 +++++++
 nova/virt/driver.py                          |  7 +-
 nova/virt/fake.py                            |  2 +-
 nova/virt/hyperv/driver.py                   |  2 +-
 nova/virt/ironic/driver.py                   | 23 ++++--
 nova/virt/libvirt/driver.py                  |  3 +-
 nova/virt/powervm/driver.py                  |  3 +-
 nova/virt/vmwareapi/driver.py                |  2 +-
 nova/virt/xenapi/driver.py                   |  2 +-
 nova/virt/zvm/driver.py                      |  2 +-
 .../notes/bug-1815791-f84a913eef9e3b21.yaml  | 11 +++
 13 files changed, 140 insertions(+), 43 deletions(-)
 create mode 100644 releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index d3504b8fccf3..c14476764629 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -7541,6 +7541,53 @@ class ComputeManager(manager.Manager):
             # silently ignore.
             pass

+    def _stop_unexpected_shutdown_instance(self, context, vm_state,
+                                           db_instance, orig_db_power_state,
+                                           vm_power_state):
+        # this is an exceptional case; make sure our data is up
+        # to date before slamming through a power off
+        # TODO(jroll) remove the check for TypeError here;
+        # it's only here to be able to backport this without
+        # breaking out-of-tree drivers.
+        try:
+            vm_instance = self.driver.get_info(db_instance,
+                                               use_cache=False)
+            vm_power_state = vm_instance.state
+        except TypeError:
+            LOG.warning("Your virt driver appears to not support the "
+                        "'use_cache' parameter to the 'get_info' method; "
+                        "please update your virt driver.")
+
+        # if it still looks off, go ahead and call stop()
+        if vm_power_state in (power_state.SHUTDOWN,
+                              power_state.CRASHED):
+
+            LOG.warning("Instance shutdown by itself. Calling the "
+                        "stop API. Current vm_state: %(vm_state)s, "
+                        "current task_state: %(task_state)s, "
+                        "original DB power_state: %(db_power_state)s, "
+                        "current VM power_state: %(vm_power_state)s",
+                        {'vm_state': vm_state,
+                         'task_state': db_instance.task_state,
+                         'db_power_state': orig_db_power_state,
+                         'vm_power_state': vm_power_state},
+                        instance=db_instance)
+            try:
+                # Note(maoy): here we call the API instead of
+                # brutally updating the vm_state in the database
+                # to allow all the hooks and checks to be performed.
+                if db_instance.shutdown_terminate:
+                    self.compute_api.delete(context, db_instance)
+                else:
+                    self.compute_api.stop(context, db_instance)
+            except Exception:
+                # Note(maoy): there is no need to propagate the error
+                # because the same power_state will be retrieved next
+                # time and retried.
+                # For example, there might be another task scheduled.
+                LOG.exception("error during stop() in sync_power_state.",
+                              instance=db_instance)
+
     def _sync_instance_power_state(self, context, db_instance,
                                    vm_power_state, use_slave=False):
         """Align instance power state between the database and hypervisor.
@@ -7610,31 +7657,9 @@ class ComputeManager(manager.Manager):
             # The only rational power state should be RUNNING
             if vm_power_state in (power_state.SHUTDOWN,
                                   power_state.CRASHED):
-                LOG.warning("Instance shutdown by itself. Calling the "
-                            "stop API. Current vm_state: %(vm_state)s, "
-                            "current task_state: %(task_state)s, "
-                            "original DB power_state: %(db_power_state)s, "
-                            "current VM power_state: %(vm_power_state)s",
-                            {'vm_state': vm_state,
-                             'task_state': db_instance.task_state,
-                             'db_power_state': orig_db_power_state,
-                             'vm_power_state': vm_power_state},
-                            instance=db_instance)
-                try:
-                    # Note(maoy): here we call the API instead of
-                    # brutally updating the vm_state in the database
-                    # to allow all the hooks and checks to be performed.
-                    if db_instance.shutdown_terminate:
-                        self.compute_api.delete(context, db_instance)
-                    else:
-                        self.compute_api.stop(context, db_instance)
-                except Exception:
-                    # Note(maoy): there is no need to propagate the error
-                    # because the same power_state will be retrieved next
-                    # time and retried.
-                    # For example, there might be another task scheduled.
-                    LOG.exception("error during stop() in sync_power_state.",
-                                  instance=db_instance)
+                self._stop_unexpected_shutdown_instance(
+                    context, vm_state, db_instance, orig_db_power_state,
+                    vm_power_state)
             elif vm_power_state == power_state.SUSPENDED:
                 LOG.warning("Instance is suspended unexpectedly. Calling "
                             "the stop API.", instance=db_instance)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 327afa91108e..492b7a6dd64d 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -1791,10 +1791,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
                                              power_state.RUNNING)
         mock_refresh.assert_called_once_with(use_slave=False)

+    @mock.patch.object(fake_driver.FakeDriver, 'get_info')
     @mock.patch.object(objects.Instance, 'refresh')
     @mock.patch.object(objects.Instance, 'save')
     def test_sync_instance_power_state_running_stopped(self, mock_save,
-                                                       mock_refresh):
+                                                       mock_refresh,
+                                                       mock_get_info):
+        mock_get_info.return_value = hardware.InstanceInfo(
+            state=power_state.SHUTDOWN)
         instance = self._get_sync_instance(power_state.RUNNING,
                                            vm_states.ACTIVE)
         self.compute._sync_instance_power_state(self.context, instance,
@@ -1802,11 +1806,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         self.assertEqual(instance.power_state, power_state.SHUTDOWN)
         mock_refresh.assert_called_once_with(use_slave=False)
         self.assertTrue(mock_save.called)
+        mock_get_info.assert_called_once_with(instance, use_cache=False)

-    def _test_sync_to_stop(self, power_state, vm_state, driver_power_state,
+    def _test_sync_to_stop(self, vm_power_state, vm_state, driver_power_state,
                            stop=True, force=False, shutdown_terminate=False):
         instance = self._get_sync_instance(
-            power_state, vm_state, shutdown_terminate=shutdown_terminate)
+            vm_power_state, vm_state, shutdown_terminate=shutdown_terminate)

         with test.nested(
             mock.patch.object(objects.Instance, 'refresh'),
@@ -1814,7 +1819,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             mock.patch.object(self.compute.compute_api, 'stop'),
             mock.patch.object(self.compute.compute_api, 'delete'),
             mock.patch.object(self.compute.compute_api, 'force_stop'),
-        ) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force):
+            mock.patch.object(self.compute.driver, 'get_info')
+        ) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force,
+              mock_get_info):
+            mock_get_info.return_value = hardware.InstanceInfo(
+                state=driver_power_state)
+
             self.compute._sync_instance_power_state(self.context, instance,
                                                     driver_power_state)
         if shutdown_terminate:
@@ -1824,6 +1834,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
                 mock_force.assert_called_once_with(self.context, instance)
             else:
                 mock_stop.assert_called_once_with(self.context, instance)
+            if (vm_state == vm_states.ACTIVE and
+                    vm_power_state in (power_state.SHUTDOWN,
+                                       power_state.CRASHED)):
+                mock_get_info.assert_called_once_with(instance,
+                                                      use_cache=False)
         mock_refresh.assert_called_once_with(use_slave=False)
         self.assertTrue(mock_save.called)
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index 976474cd89fc..77358432edc5 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -1100,6 +1100,34 @@ class IronicDriverTestCase(test.NoDBTestCase):
         mock_gbiu.assert_called_once_with(instance.uuid,
                                           fields=ironic_driver._NODE_FIELDS)

+    @mock.patch.object(ironic_driver.IronicDriver, '_get_node_list')
+    @mock.patch.object(objects.InstanceList, 'get_uuids_by_host')
+    @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type')
+    @mock.patch.object(FAKE_CLIENT.node, 'get_by_instance_uuid')
+    def test_get_info_skip_cache(self, mock_gbiu, mock_svc_by_hv,
+                                 mock_uuids_by_host, mock_get_node_list):
+        properties = {'memory_mb': 512, 'cpus': 2}
+        power_state = ironic_states.POWER_ON
+        node = _get_cached_node(
+            instance_uuid=self.instance_uuid, properties=properties,
+            power_state=power_state)
+
+        mock_gbiu.return_value = node
+        mock_svc_by_hv.return_value = []
+        mock_get_node_list.return_value = [node]
+
+        # ironic_states.POWER_ON should be mapped to
+        # nova_states.RUNNING
+        instance = fake_instance.fake_instance_obj('fake-context',
+                                                   uuid=self.instance_uuid)
+        mock_uuids_by_host.return_value = [instance.uuid]
+        result = self.driver.get_info(instance, use_cache=False)
+        self.assertEqual(hardware.InstanceInfo(state=nova_states.RUNNING),
+                         result)
+        # verify we hit the ironic API for fresh data
+        mock_gbiu.assert_called_once_with(instance.uuid,
+                                          fields=ironic_driver._NODE_FIELDS)
+
     @mock.patch.object(ironic_driver.IronicDriver, '_get_node_list')
     @mock.patch.object(objects.InstanceList, 'get_uuids_by_host')
     @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type')
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index f63652cb7515..6e939cf24c0d 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -157,10 +157,15 @@ class ComputeDriver(object):
         """
         pass

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Get the current status of an instance.

         :param instance: nova.objects.instance.Instance object
+        :param use_cache: boolean to indicate if the driver should be allowed
+                          to use cached data to return instance status.
+                          This only applies to drivers which cache instance
+                          state information. For drivers that do not use a
+                          cache, this parameter can be ignored.
         :returns: An InstanceInfo object
         """
         # TODO(Vek): Need to pass context in for access to auth_token
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 6b0f59b11783..4f3abe9a5b91 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -350,7 +350,7 @@ class FakeDriver(driver.ComputeDriver):
             raise exception.InterfaceDetachFailed(
                 instance_uuid=instance.uuid)

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         if instance.uuid not in self.instances:
             raise exception.InstanceNotFound(instance_id=instance.uuid)
         i = self.instances[instance.uuid]
diff --git a/nova/virt/hyperv/driver.py b/nova/virt/hyperv/driver.py
index a228bbd1cee8..f91f8d724285 100644
--- a/nova/virt/hyperv/driver.py
+++ b/nova/virt/hyperv/driver.py
@@ -176,7 +176,7 @@ class HyperVDriver(driver.ComputeDriver):
         """Cleanup after instance being destroyed by Hypervisor."""
         self.unplug_vifs(instance, network_info)

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         return self._vmops.get_info(instance)

     def attach_volume(self, context, connection_info, instance, mountpoint,
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 4aecc864d893..15f95f0c6ef0 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -877,15 +877,30 @@ class IronicDriver(virt_driver.ComputeDriver):
                 return node
         return _sync_node_from_cache()

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Get the current state and resource usage for this instance.

         If the instance is not found this method returns (a dictionary
         with) NOSTATE and all resources == 0.

         :param instance: the instance object.
+        :param use_cache: boolean to indicate if the driver should be allowed
+                          to use cached data to return instance status.
+                          If false, pull fresh data from ironic.
         :returns: an InstanceInfo object
         """
+        def _fetch_from_ironic(self, instance):
+            try:
+                node = self._validate_instance_and_node(instance)
+                return hardware.InstanceInfo(
+                    state=map_power_state(node.power_state))
+            except exception.InstanceNotFound:
+                return hardware.InstanceInfo(
+                    state=map_power_state(ironic_states.NOSTATE))
+
+        if not use_cache:
+            return _fetch_from_ironic(self, instance)
+
         # we should already have a cache for our nodes, refreshed on every
         # RT loop. but if we don't have a cache, generate it.
         if not self.node_cache:
@@ -896,11 +911,7 @@ class IronicDriver(virt_driver.ComputeDriver):
                 break
         else:
             # if we can't find the instance, fall back to ironic
-            try:
-                node = self._validate_instance_and_node(instance)
-            except exception.InstanceNotFound:
-                return hardware.InstanceInfo(
-                    state=map_power_state(ironic_states.NOSTATE))
+            return _fetch_from_ironic(self, instance)

         return hardware.InstanceInfo(state=map_power_state(node.power_state))

diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 6d1603373ed0..9b1a10cc7ac5 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -5525,7 +5525,7 @@ class LibvirtDriver(driver.ComputeDriver):
                   {'xml': xml}, instance=instance)
         return xml

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Retrieve information from libvirt for a specific instance.

         If a libvirt error is encountered during lookup, we might raise a
         NotFound exception or Error exception depending on how severe the
         libvirt error is.

         :param instance: nova.objects.instance.Instance object
+        :param use_cache: unused in this driver
         :returns: An InstanceInfo object
         """
         guest = self._host.get_guest(instance)
diff --git a/nova/virt/powervm/driver.py b/nova/virt/powervm/driver.py
index c3b3939dda79..3f65ba8608b9 100644
--- a/nova/virt/powervm/driver.py
+++ b/nova/virt/powervm/driver.py
@@ -116,10 +116,11 @@ class PowerVMDriver(driver.ComputeDriver):
                  {'op': op, 'display_name': instance.display_name,
                   'name': instance.name}, instance=instance)

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Get the current status of an instance.

         :param instance: nova.objects.instance.Instance object
+        :param use_cache: unused in this driver
         :returns: An InstanceInfo object.
         """
         return vm.get_vm_info(self.adapter, instance)
diff --git a/nova/virt/vmwareapi/driver.py b/nova/virt/vmwareapi/driver.py
index dba10d2e2097..7ec719d347e6 100644
--- a/nova/virt/vmwareapi/driver.py
+++ b/nova/virt/vmwareapi/driver.py
@@ -583,7 +583,7 @@ class VMwareVCDriver(driver.ComputeDriver):
         """Poll for rebooting instances."""
         self._vmops.poll_rebooting_instances(timeout, instances)

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Return info about the VM instance."""
         return self._vmops.get_info(instance)

diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py
index 8ae3de362195..dbe19252e20d 100644
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@@ -369,7 +369,7 @@ class XenAPIDriver(driver.ComputeDriver):
         """Unplug VIFs from networks."""
         self._vmops.unplug_vifs(instance, network_info)

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Return data about VM instance."""
         return self._vmops.get_info(instance)

diff --git a/nova/virt/zvm/driver.py b/nova/virt/zvm/driver.py
index 8405f276bc4d..2d5d5846fc75 100644
--- a/nova/virt/zvm/driver.py
+++ b/nova/virt/zvm/driver.py
@@ -122,7 +122,7 @@ class ZVMDriver(driver.ComputeDriver):
     def get_available_nodes(self, refresh=False):
         return self._hypervisor.get_available_nodes(refresh=refresh)

-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         _guest = guest.Guest(self._hypervisor, instance)
         return _guest.get_info()

diff --git a/releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml b/releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml
new file mode 100644
index 000000000000..f52352506b1e
--- /dev/null
+++ b/releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixes a race condition that could allow a newly created Ironic
+    instance to be powered off after deployment, without letting
+    the user power it back on.
+upgrade:
+  - |
+    Adds a ``use_cache`` parameter to the virt driver ``get_info``
+    method. Out-of-tree drivers should add support for this
+    parameter.
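
Note (not part of the applyable diff): the compute-manager change above is a
"re-check before acting" pattern: drop the cache, re-read the power state, and
only then call stop(). A minimal standalone sketch of that flow follows; the
`driver` and `compute_api` arguments and the string power states are
hypothetical stand-ins, not nova objects.

    # Sketch of the re-check pattern from _stop_unexpected_shutdown_instance.
    # `driver` and `compute_api` are hypothetical stand-ins, not nova code.
    SHUTDOWN, CRASHED = 'shutdown', 'crashed'

    def stop_if_really_down(driver, compute_api, context, instance,
                            vm_power_state):
        try:
            # Bypass the driver's cache. A driver that predates the new
            # kwarg raises TypeError; keep the cached state in that case.
            vm_power_state = driver.get_info(instance, use_cache=False).state
        except TypeError:
            pass
        # Only stop the instance if fresh data still says it is down.
        if vm_power_state in (SHUTDOWN, CRASHED):
            compute_api.stop(context, instance)

This also shows why the TypeError guard exists in the patch: it keeps the
change backportable without breaking drivers that still implement the old
get_info(instance) signature.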
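Note (not part of the applyable diff): per the upgrade release note, an
out-of-tree driver only needs to accept the new keyword, and drivers without a
cache may simply ignore it. A hedged sketch of a caching driver honoring it;
the CachingDriver class and its caching internals are illustrative, only the
get_info signature comes from this patch.

    # Illustrative out-of-tree driver; only the get_info signature is
    # prescribed by this patch, the caching internals are hypothetical.
    class CachingDriver(object):
        def __init__(self):
            self._cache = {}  # instance uuid -> last known info object

        def _poll_hypervisor(self, instance):
            raise NotImplementedError  # hypervisor-specific lookup

        def get_info(self, instance, use_cache=True):
            # use_cache=False goes straight to the hypervisor, mirroring
            # how the ironic driver skips its node cache above.
            if use_cache and instance.uuid in self._cache:
                return self._cache[instance.uuid]
            info = self._poll_hypervisor(instance)
            self._cache[instance.uuid] = info
            return info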