ironic: check fresh data when sync_power_state doesn't line up
We return cached data to sync_power_state to avoid pummeling the ironic
API. However, this can lead to a race condition where an instance is
powered on, but nova thinks it should be off and calls stop(). Check
again without the cache when this happens to make sure we don't
unnecessarily kill an instance.

Closes-Bug: #1815791
Change-Id: I907b69eb689cf6c169a4869cfc7889308ca419d5
parent 104d79d8dd
commit 19cb828023
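The change is easiest to read as a small pattern: before trusting a cached
SHUTDOWN/CRASHED reading, ask the driver for fresh data, and tolerate drivers
that do not yet accept the new keyword argument. A minimal sketch of that
pattern follows (illustrative only; the function name and arguments here are
made up, and the real logic is ComputeManager._stop_unexpected_shutdown_instance
in the diff below):

    def confirm_shutdown_before_stop(driver, instance, cached_state,
                                     shutdown_states):
        # Ask the driver to bypass its cache; ironic is the driver that
        # actually caches, so this forces a fresh API read there.
        try:
            fresh_state = driver.get_info(instance, use_cache=False).state
        except TypeError:
            # Older or out-of-tree driver without the new kwarg: keep the
            # cached reading and behave exactly as before this change.
            fresh_state = cached_state
        # Only report the instance as down if the fresh reading agrees.
        return fresh_state in shutdown_states

Only when this check still says the instance is down does the manager go on to
call the stop (or delete) API, which is the call that previously raced against
ironic's cache.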
@@ -7541,6 +7541,53 @@ class ComputeManager(manager.Manager):
             # silently ignore.
             pass
 
+    def _stop_unexpected_shutdown_instance(self, context, vm_state,
+                                           db_instance, orig_db_power_state,
+                                           vm_power_state):
+        # this is an exceptional case; make sure our data is up
+        # to date before slamming through a power off
+        # TODO(jroll) remove the check for TypeError here;
+        # it's only here to be able to backport this without
+        # breaking out-of-tree drivers.
+        try:
+            vm_instance = self.driver.get_info(db_instance,
+                                               use_cache=False)
+            vm_power_state = vm_instance.state
+        except TypeError:
+            LOG.warning("Your virt driver appears to not support the "
+                        "'use_cache' parameter to the 'get_info' method; "
+                        "please update your virt driver.")
+
+        # if it still looks off, go ahead and call stop()
+        if vm_power_state in (power_state.SHUTDOWN,
+                              power_state.CRASHED):
+
+            LOG.warning("Instance shutdown by itself. Calling the "
+                        "stop API. Current vm_state: %(vm_state)s, "
+                        "current task_state: %(task_state)s, "
+                        "original DB power_state: %(db_power_state)s, "
+                        "current VM power_state: %(vm_power_state)s",
+                        {'vm_state': vm_state,
+                         'task_state': db_instance.task_state,
+                         'db_power_state': orig_db_power_state,
+                         'vm_power_state': vm_power_state},
+                        instance=db_instance)
+            try:
+                # Note(maoy): here we call the API instead of
+                # brutally updating the vm_state in the database
+                # to allow all the hooks and checks to be performed.
+                if db_instance.shutdown_terminate:
+                    self.compute_api.delete(context, db_instance)
+                else:
+                    self.compute_api.stop(context, db_instance)
+            except Exception:
+                # Note(maoy): there is no need to propagate the error
+                # because the same power_state will be retrieved next
+                # time and retried.
+                # For example, there might be another task scheduled.
+                LOG.exception("error during stop() in sync_power_state.",
+                              instance=db_instance)
+
     def _sync_instance_power_state(self, context, db_instance, vm_power_state,
                                    use_slave=False):
         """Align instance power state between the database and hypervisor.
@@ -7610,31 +7657,9 @@ class ComputeManager(manager.Manager):
             # The only rational power state should be RUNNING
             if vm_power_state in (power_state.SHUTDOWN,
                                   power_state.CRASHED):
-                LOG.warning("Instance shutdown by itself. Calling the "
-                            "stop API. Current vm_state: %(vm_state)s, "
-                            "current task_state: %(task_state)s, "
-                            "original DB power_state: %(db_power_state)s, "
-                            "current VM power_state: %(vm_power_state)s",
-                            {'vm_state': vm_state,
-                             'task_state': db_instance.task_state,
-                             'db_power_state': orig_db_power_state,
-                             'vm_power_state': vm_power_state},
-                            instance=db_instance)
-                try:
-                    # Note(maoy): here we call the API instead of
-                    # brutally updating the vm_state in the database
-                    # to allow all the hooks and checks to be performed.
-                    if db_instance.shutdown_terminate:
-                        self.compute_api.delete(context, db_instance)
-                    else:
-                        self.compute_api.stop(context, db_instance)
-                except Exception:
-                    # Note(maoy): there is no need to propagate the error
-                    # because the same power_state will be retrieved next
-                    # time and retried.
-                    # For example, there might be another task scheduled.
-                    LOG.exception("error during stop() in sync_power_state.",
-                                  instance=db_instance)
+                self._stop_unexpected_shutdown_instance(
+                    context, vm_state, db_instance, orig_db_power_state,
+                    vm_power_state)
             elif vm_power_state == power_state.SUSPENDED:
                 LOG.warning("Instance is suspended unexpectedly. Calling "
                             "the stop API.", instance=db_instance)
@@ -1791,10 +1791,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
                                                 power_state.RUNNING)
         mock_refresh.assert_called_once_with(use_slave=False)
 
+    @mock.patch.object(fake_driver.FakeDriver, 'get_info')
     @mock.patch.object(objects.Instance, 'refresh')
     @mock.patch.object(objects.Instance, 'save')
     def test_sync_instance_power_state_running_stopped(self, mock_save,
-                                                       mock_refresh):
+                                                       mock_refresh,
+                                                       mock_get_info):
+        mock_get_info.return_value = hardware.InstanceInfo(
+            state=power_state.SHUTDOWN)
         instance = self._get_sync_instance(power_state.RUNNING,
                                            vm_states.ACTIVE)
         self.compute._sync_instance_power_state(self.context, instance,
@@ -1802,11 +1806,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
         self.assertEqual(instance.power_state, power_state.SHUTDOWN)
         mock_refresh.assert_called_once_with(use_slave=False)
         self.assertTrue(mock_save.called)
+        mock_get_info.assert_called_once_with(instance, use_cache=False)
 
-    def _test_sync_to_stop(self, power_state, vm_state, driver_power_state,
+    def _test_sync_to_stop(self, vm_power_state, vm_state, driver_power_state,
                            stop=True, force=False, shutdown_terminate=False):
         instance = self._get_sync_instance(
-            power_state, vm_state, shutdown_terminate=shutdown_terminate)
+            vm_power_state, vm_state, shutdown_terminate=shutdown_terminate)
 
         with test.nested(
             mock.patch.object(objects.Instance, 'refresh'),
@@ -1814,7 +1819,12 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
             mock.patch.object(self.compute.compute_api, 'stop'),
             mock.patch.object(self.compute.compute_api, 'delete'),
             mock.patch.object(self.compute.compute_api, 'force_stop'),
-        ) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force):
+            mock.patch.object(self.compute.driver, 'get_info')
+        ) as (mock_refresh, mock_save, mock_stop, mock_delete, mock_force,
+              mock_get_info):
+            mock_get_info.return_value = hardware.InstanceInfo(
+                state=driver_power_state)
+
             self.compute._sync_instance_power_state(self.context, instance,
                                                     driver_power_state)
             if shutdown_terminate:
@@ -1824,6 +1834,11 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
                 mock_force.assert_called_once_with(self.context, instance)
             else:
                 mock_stop.assert_called_once_with(self.context, instance)
+            if (vm_state == vm_states.ACTIVE and
+                    vm_power_state in (power_state.SHUTDOWN,
+                                       power_state.CRASHED)):
+                mock_get_info.assert_called_once_with(instance,
+                                                      use_cache=False)
             mock_refresh.assert_called_once_with(use_slave=False)
             self.assertTrue(mock_save.called)
@@ -1100,6 +1100,34 @@ class IronicDriverTestCase(test.NoDBTestCase):
         mock_gbiu.assert_called_once_with(instance.uuid,
                                           fields=ironic_driver._NODE_FIELDS)
 
+    @mock.patch.object(ironic_driver.IronicDriver, '_get_node_list')
+    @mock.patch.object(objects.InstanceList, 'get_uuids_by_host')
+    @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type')
+    @mock.patch.object(FAKE_CLIENT.node, 'get_by_instance_uuid')
+    def test_get_info_skip_cache(self, mock_gbiu, mock_svc_by_hv,
+                                 mock_uuids_by_host, mock_get_node_list):
+        properties = {'memory_mb': 512, 'cpus': 2}
+        power_state = ironic_states.POWER_ON
+        node = _get_cached_node(
+            instance_uuid=self.instance_uuid, properties=properties,
+            power_state=power_state)
+
+        mock_gbiu.return_value = node
+        mock_svc_by_hv.return_value = []
+        mock_get_node_list.return_value = [node]
+
+        # ironic_states.POWER_ON should be mapped to
+        # nova_states.RUNNING
+        instance = fake_instance.fake_instance_obj('fake-context',
+                                                   uuid=self.instance_uuid)
+        mock_uuids_by_host.return_value = [instance.uuid]
+        result = self.driver.get_info(instance, use_cache=False)
+        self.assertEqual(hardware.InstanceInfo(state=nova_states.RUNNING),
+                         result)
+        # verify we hit the ironic API for fresh data
+        mock_gbiu.assert_called_once_with(instance.uuid,
+                                          fields=ironic_driver._NODE_FIELDS)
+
     @mock.patch.object(ironic_driver.IronicDriver, '_get_node_list')
     @mock.patch.object(objects.InstanceList, 'get_uuids_by_host')
     @mock.patch.object(objects.ServiceList, 'get_all_computes_by_hv_type')
@@ -157,10 +157,15 @@ class ComputeDriver(object):
         """
         pass
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Get the current status of an instance.
 
         :param instance: nova.objects.instance.Instance object
+        :param use_cache: boolean to indicate if the driver should be allowed
+                          to use cached data to return instance status.
+                          This only applies to drivers which cache instance
+                          state information. For drivers that do not use a
+                          cache, this parameter can be ignored.
         :returns: An InstanceInfo object
         """
         # TODO(Vek): Need to pass context in for access to auth_token
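For an out-of-tree driver that keeps its own instance-state cache, honouring
the new parameter is a small change. The sketch below is hypothetical (the
class name, _info_cache, and _query_backend are placeholders, not Nova APIs);
drivers that do not cache can simply accept and ignore the argument, as the
in-tree libvirt, Hyper-V, VMware, XenAPI, PowerVM and zVM changes below do:

    class MyCachingDriver(driver.ComputeDriver):
        # Hypothetical out-of-tree driver illustrating the new contract.

        def __init__(self, virtapi):
            super(MyCachingDriver, self).__init__(virtapi)
            self._info_cache = {}  # instance uuid -> InstanceInfo

        def get_info(self, instance, use_cache=True):
            # Serve from the driver's own cache only when the caller allows it.
            if use_cache and instance.uuid in self._info_cache:
                return self._info_cache[instance.uuid]
            # Otherwise (or on a cache miss) query the backend and refresh.
            info = self._query_backend(instance)  # placeholder backend call
            self._info_cache[instance.uuid] = info
            return info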
@@ -350,7 +350,7 @@ class FakeDriver(driver.ComputeDriver):
             raise exception.InterfaceDetachFailed(
                 instance_uuid=instance.uuid)
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         if instance.uuid not in self.instances:
             raise exception.InstanceNotFound(instance_id=instance.uuid)
         i = self.instances[instance.uuid]
@@ -176,7 +176,7 @@ class HyperVDriver(driver.ComputeDriver):
         """Cleanup after instance being destroyed by Hypervisor."""
         self.unplug_vifs(instance, network_info)
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         return self._vmops.get_info(instance)
 
     def attach_volume(self, context, connection_info, instance, mountpoint,
@@ -877,15 +877,30 @@ class IronicDriver(virt_driver.ComputeDriver):
                 return node
         return _sync_node_from_cache()
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Get the current state and resource usage for this instance.
 
         If the instance is not found this method returns (a dictionary
         with) NOSTATE and all resources == 0.
 
         :param instance: the instance object.
+        :param use_cache: boolean to indicate if the driver should be allowed
+                          to use cached data to return instance status.
+                          If false, pull fresh data from ironic.
         :returns: an InstanceInfo object
         """
+        def _fetch_from_ironic(self, instance):
+            try:
+                node = self._validate_instance_and_node(instance)
+                return hardware.InstanceInfo(
+                    state=map_power_state(node.power_state))
+            except exception.InstanceNotFound:
+                return hardware.InstanceInfo(
+                    state=map_power_state(ironic_states.NOSTATE))
+
+        if not use_cache:
+            return _fetch_from_ironic(self, instance)
+
         # we should already have a cache for our nodes, refreshed on every
         # RT loop. but if we don't have a cache, generate it.
         if not self.node_cache:
@@ -896,11 +911,7 @@ class IronicDriver(virt_driver.ComputeDriver):
                 break
         else:
             # if we can't find the instance, fall back to ironic
-            try:
-                node = self._validate_instance_and_node(instance)
-            except exception.InstanceNotFound:
-                return hardware.InstanceInfo(
-                    state=map_power_state(ironic_states.NOSTATE))
+            return _fetch_from_ironic(self, instance)
 
         return hardware.InstanceInfo(state=map_power_state(node.power_state))
@@ -5525,7 +5525,7 @@ class LibvirtDriver(driver.ComputeDriver):
                   {'xml': xml}, instance=instance)
         return xml
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Retrieve information from libvirt for a specific instance.
 
         If a libvirt error is encountered during lookup, we might raise a
@@ -5533,6 +5533,7 @@ class LibvirtDriver(driver.ComputeDriver):
         libvirt error is.
 
         :param instance: nova.objects.instance.Instance object
+        :param use_cache: unused in this driver
         :returns: An InstanceInfo object
         """
         guest = self._host.get_guest(instance)
@@ -116,10 +116,11 @@ class PowerVMDriver(driver.ComputeDriver):
                  {'op': op, 'display_name': instance.display_name,
                   'name': instance.name}, instance=instance)
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Get the current status of an instance.
 
         :param instance: nova.objects.instance.Instance object
+        :param use_cache: unused in this driver
         :returns: An InstanceInfo object.
         """
         return vm.get_vm_info(self.adapter, instance)
@@ -583,7 +583,7 @@ class VMwareVCDriver(driver.ComputeDriver):
         """Poll for rebooting instances."""
         self._vmops.poll_rebooting_instances(timeout, instances)
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Return info about the VM instance."""
         return self._vmops.get_info(instance)
@@ -369,7 +369,7 @@ class XenAPIDriver(driver.ComputeDriver):
         """Unplug VIFs from networks."""
         self._vmops.unplug_vifs(instance, network_info)
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         """Return data about VM instance."""
         return self._vmops.get_info(instance)
@@ -122,7 +122,7 @@ class ZVMDriver(driver.ComputeDriver):
     def get_available_nodes(self, refresh=False):
         return self._hypervisor.get_available_nodes(refresh=refresh)
 
-    def get_info(self, instance):
+    def get_info(self, instance, use_cache=True):
         _guest = guest.Guest(self._hypervisor, instance)
         return _guest.get_info()
releasenotes/notes/bug-1815791-f84a913eef9e3b21.yaml (new file, 11 lines)
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    Fixes a race condition that could allow a newly created Ironic
+    instance to be powered off after deployment, without letting
+    the user power it back on.
+upgrade:
+  - |
+    Adds a ``use_cache`` parameter to the virt driver ``get_info``
+    method. Out of tree drivers should add support for this
+    parameter.
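For driver maintainers, the upgrade note usually amounts to a one-line
signature change when the driver does not cache power state, mirroring the
in-tree drivers patched above (sketch only; ThirdPartyDriver and
_lookup_instance_info are placeholder names):

    class ThirdPartyDriver(driver.ComputeDriver):
        def get_info(self, instance, use_cache=True):
            # No local cache, so use_cache can be accepted and ignored.
            return self._lookup_instance_info(instance)  # placeholder lookup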