Recover from REBOOT-* state on compute manager start-up

If a compute manager is stopped / fails during certain REBOOT-*
operations then the instance will be left stuck in a transitional
task_state

This change handles two possible task states REBOOT_PENDING and
REBOOT_STARTED (for both soft and hard reboots). And either clears
these states or retries the reboot depending on the instance state.

Both task states are set after the request has gotten to the compute
manager so we can handle these safely knowing the operation has
ended/failed with the restart of the compute manager.

We retry the reboot where the state is PENDING and where the state is
STARTED and the instance is not running.

Where the instance is running and the state is STARTED we simply
transition the instance to an ACTIVE state. The user can retry the
reboot if required.

Related to blueprint recover-stuck-state

Change-Id: Ib318b3d444a67616441302f3daa8de20ee946f3f
This commit is contained in:
David McNally 2013-11-22 16:18:53 +00:00
parent 5137045db2
commit cc0be157d0
12 changed files with 327 additions and 19 deletions

View File

@ -59,7 +59,11 @@ _STATE_MAP = {
vm_states.ACTIVE: {
'default': 'ACTIVE',
task_states.REBOOTING: 'REBOOT',
task_states.REBOOT_PENDING: 'REBOOT',
task_states.REBOOT_STARTED: 'REBOOT',
task_states.REBOOTING_HARD: 'HARD_REBOOT',
task_states.REBOOT_PENDING_HARD: 'HARD_REBOOT',
task_states.REBOOT_STARTED_HARD: 'HARD_REBOOT',
task_states.UPDATING_PASSWORD: 'PASSWORD',
task_states.REBUILDING: 'REBUILD',
task_states.REBUILD_BLOCK_DEVICE_MAPPING: 'REBUILD',

View File

@ -2050,9 +2050,8 @@ class API(base.Base):
@wrap_check_policy
@check_instance_lock
@check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
vm_states.PAUSED, vm_states.SUSPENDED,
vm_states.ERROR],
@check_instance_state(vm_state=set(
vm_states.ALLOW_SOFT_REBOOT + vm_states.ALLOW_HARD_REBOOT),
task_state=[None, task_states.REBOOTING,
task_states.REBOOTING_HARD,
task_states.RESUMING,
@ -2062,10 +2061,7 @@ class API(base.Base):
def reboot(self, context, instance, reboot_type):
"""Reboot the given instance."""
if (reboot_type == 'SOFT' and
(instance['vm_state'] in [vm_states.STOPPED,
vm_states.PAUSED,
vm_states.SUSPENDED,
vm_states.ERROR])):
(instance['vm_state'] not in vm_states.ALLOW_SOFT_REBOOT)):
raise exception.InstanceInvalidState(
attr='vm_state',
instance_uuid=instance['uuid'],

View File

@ -848,6 +848,34 @@ class ComputeManager(manager.Manager):
finally:
return
try_reboot, reboot_type = self._retry_reboot(context, instance)
current_power_state = self._get_power_state(context, instance)
if try_reboot:
LOG.debug(_("Instance in transitional state (%(task_state)s) at "
"start-up and power state is (%(power_state)s), "
"triggering reboot"),
{'task_state': instance['task_state'],
'power_state': current_power_state},
instance=instance)
self.compute_rpcapi.reboot_instance(context, instance,
block_device_info=None,
reboot_type=reboot_type)
return
elif (current_power_state == power_state.RUNNING and
instance.task_state in [task_states.REBOOT_STARTED,
task_states.REBOOT_STARTED_HARD]):
LOG.warning(_("Instance in transitional state "
"(%(task_state)s) at start-up and power state "
"is (%(power_state)s), clearing task state"),
{'task_state': instance['task_state'],
'power_state': current_power_state},
instance=instance)
instance = self._instance_update(context, instance.uuid,
vm_state=vm_states.ACTIVE,
task_state=None)
net_info = compute_utils.get_nw_info_for_instance(instance)
try:
self.driver.plug_vifs(instance, net_info)
@ -917,6 +945,27 @@ class ComputeManager(manager.Manager):
LOG.warning(_('Hypervisor driver does not support '
'firewall rules'), instance=instance)
def _retry_reboot(self, context, instance):
current_power_state = self._get_power_state(context, instance)
current_task_state = instance.task_state
retry_reboot = False
reboot_type = compute_utils.get_reboot_type(current_task_state,
current_power_state)
pending_soft = (current_task_state == task_states.REBOOT_PENDING and
instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
pending_hard = (current_task_state == task_states.REBOOT_PENDING_HARD
and instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
started_not_running = (current_task_state in
[task_states.REBOOT_STARTED,
task_states.REBOOT_STARTED_HARD] and
current_power_state != power_state.RUNNING)
if pending_soft or pending_hard or started_not_running:
retry_reboot = True
return retry_reboot, reboot_type
def handle_lifecycle_event(self, event):
LOG.info(_("Lifecycle event %(state)d on VM %(uuid)s") %
{'state': event.get_transition(),
@ -2528,6 +2577,17 @@ class ComputeManager(manager.Manager):
def reboot_instance(self, context, instance, block_device_info,
reboot_type):
"""Reboot an instance on this host."""
# acknowledge the request made it to the manager
if reboot_type == "SOFT":
instance.task_state = task_states.REBOOT_PENDING
expected_states = (task_states.REBOOTING,
task_states.REBOOT_PENDING,
task_states.REBOOT_STARTED)
else:
instance.task_state = task_states.REBOOT_PENDING_HARD
expected_states = (task_states.REBOOTING_HARD,
task_states.REBOOT_PENDING_HARD,
task_states.REBOOT_STARTED_HARD)
context = context.elevated()
LOG.audit(_("Rebooting instance"), context=context, instance=instance)
@ -2541,7 +2601,7 @@ class ComputeManager(manager.Manager):
current_power_state = self._get_power_state(context, instance)
instance.power_state = current_power_state
instance.save()
instance.save(expected_task_state=expected_states)
if instance['power_state'] != power_state.RUNNING:
state = instance['power_state']
@ -2562,7 +2622,13 @@ class ComputeManager(manager.Manager):
else:
new_vm_state = vm_states.ACTIVE
new_power_state = None
if reboot_type == "SOFT":
instance.task_state = task_states.REBOOT_STARTED
expected_state = task_states.REBOOT_PENDING
else:
instance.task_state = task_states.REBOOT_STARTED_HARD
expected_state = task_states.REBOOT_PENDING_HARD
instance.save(expected_task_state=expected_state)
self.driver.reboot(context, instance,
network_info,
reboot_type,

View File

@ -55,7 +55,11 @@ RESIZE_CONFIRMING = 'resize_confirming'
# possible task states during reboot()
REBOOTING = 'rebooting'
REBOOT_PENDING = 'reboot_pending'
REBOOT_STARTED = 'reboot_started'
REBOOTING_HARD = 'rebooting_hard'
REBOOT_PENDING_HARD = 'reboot_pending_hard'
REBOOT_STARTED_HARD = 'reboot_started_hard'
# possible task states during pause()
PAUSING = 'pausing'

View File

@ -23,6 +23,8 @@ from oslo.config import cfg
from nova import block_device
from nova.compute import flavors
from nova.compute import power_state
from nova.compute import task_states
from nova import exception
from nova.network import model as network_model
from nova import notifications
@ -448,6 +450,16 @@ def usage_volume_info(vol_usage):
return usage_info
def get_reboot_type(task_state, current_power_state):
"""Checks if the current instance state requires a HARD reboot."""
if current_power_state != power_state.RUNNING:
return 'HARD'
soft_types = [task_states.REBOOT_STARTED, task_states.REBOOT_PENDING,
task_states.REBOOTING]
reboot_type = 'SOFT' if task_state in soft_types else 'HARD'
return reboot_type
class EventReporter(object):
"""Context manager to report instance action events."""

View File

@ -46,3 +46,7 @@ ERROR = 'error'
SHELVED = 'shelved' # VM is powered off, resources still on hypervisor
SHELVED_OFFLOADED = 'shelved_offloaded' # VM and associated resources are
# not on hypervisor
ALLOW_SOFT_REBOOT = [ACTIVE] # states we can soft reboot from
ALLOW_HARD_REBOOT = ALLOW_SOFT_REBOOT + [STOPPED, PAUSED, SUSPENDED, ERROR]
# states we allow hard reboot from

View File

@ -955,7 +955,10 @@ class ServersControllerTest(ControllerTest):
expected_attrs=[]):
self.assertIsNotNone(search_opts)
self.assertIn('task_state', search_opts)
self.assertEqual(search_opts['task_state'], [task_state])
self.assertEqual([task_states.REBOOT_PENDING,
task_states.REBOOT_STARTED,
task_states.REBOOTING],
search_opts['task_state'])
db_list = [fakes.stub_instance(100, uuid=server_uuid,
task_state=task_state)]
return instance_obj._make_instance_list(

View File

@ -876,7 +876,10 @@ class ServersControllerTest(ControllerTest):
limit=None, marker=None, want_objects=False):
self.assertIsNotNone(search_opts)
self.assertIn('task_state', search_opts)
self.assertEqual(search_opts['task_state'], [task_state])
self.assertEqual([task_states.REBOOT_PENDING,
task_states.REBOOT_STARTED,
task_states.REBOOTING],
search_opts['task_state'])
db_list = [fakes.stub_instance(100, uuid=server_uuid,
task_state=task_state)]
return instance_obj._make_instance_list(

View File

@ -360,7 +360,9 @@ class MiscFunctionsTest(test.TestCase):
def test_task_and_vm_state_from_status(self):
fixture1 = 'reboot'
actual = common.task_and_vm_state_from_status(fixture1)
expected = [vm_states.ACTIVE], [task_states.REBOOTING]
expected = [vm_states.ACTIVE], [task_states.REBOOT_PENDING,
task_states.REBOOT_STARTED,
task_states.REBOOTING]
self.assertEqual(expected, actual)
fixture2 = 'resize'

View File

@ -2287,6 +2287,21 @@ class ComputeTestCase(BaseTestCase):
def _test_reboot(self, soft,
test_delete=False, test_unrescue=False,
fail_reboot=False, fail_running=False):
reboot_type = soft and 'SOFT' or 'HARD'
task_pending = (soft and task_states.REBOOT_PENDING
or task_states.REBOOT_PENDING_HARD)
task_started = (soft and task_states.REBOOT_STARTED
or task_states.REBOOT_STARTED_HARD)
expected_task = (soft and task_states.REBOOTING
or task_states.REBOOTING_HARD)
expected_tasks = (soft and (task_states.REBOOTING,
task_states.REBOOT_PENDING,
task_states.REBOOT_STARTED)
or (task_states.REBOOTING_HARD,
task_states.REBOOT_PENDING_HARD,
task_states.REBOOT_STARTED_HARD))
# This is a true unit test, so we don't need the network stubs.
fake_network.unset_stub_network_methods(self.stubs)
@ -2309,6 +2324,7 @@ class ComputeTestCase(BaseTestCase):
**dict(uuid='fake-instance',
power_state=power_state.NOSTATE,
vm_state=vm_states.ACTIVE,
task_state=expected_task,
launched_at=timeutils.utcnow()))
instance = instance_obj.Instance._from_db_object(
econtext, instance_obj.Instance(), db_instance)
@ -2317,11 +2333,13 @@ class ComputeTestCase(BaseTestCase):
**dict(uuid='updated-instance1',
power_state=10003,
vm_state=vm_states.ACTIVE,
task_state=expected_task,
launched_at=timeutils.utcnow()))
updated_dbinstance2 = fake_instance.fake_db_instance(
**dict(uuid='updated-instance2',
power_state=10003,
vm_state=vm_states.ACTIVE,
task_state=expected_task,
launched_at=timeutils.utcnow()))
if test_unrescue:
@ -2334,7 +2352,6 @@ class ComputeTestCase(BaseTestCase):
fake_power_state1 = 10001
fake_power_state2 = power_state.RUNNING
fake_power_state3 = 10002
reboot_type = soft and 'SOFT' or 'HARD'
# Beginning of calls we expect.
@ -2352,13 +2369,22 @@ class ComputeTestCase(BaseTestCase):
self.compute._get_power_state(econtext,
instance).AndReturn(fake_power_state1)
db.instance_update_and_get_original(econtext, instance['uuid'],
{'power_state': fake_power_state1},
update_cells=False,
columns_to_join=['system_metadata']
).AndReturn((None,
updated_dbinstance1))
{'task_state': task_pending,
'expected_task_state': expected_tasks,
'power_state': fake_power_state1},
update_cells=False,
columns_to_join=['system_metadata']
).AndReturn((None,
updated_dbinstance1))
expected_nw_info = fake_nw_model
db.instance_update_and_get_original(econtext,
updated_dbinstance1['uuid'],
{'task_state': task_started,
'expected_task_state': task_pending},
update_cells=False,
columns_to_join=['system_metadata']
).AndReturn((None,
updated_dbinstance1))
# Annoying. driver.reboot is wrapped in a try/except, and
# doesn't re-raise. It eats exception generated by mox if

View File

@ -274,6 +274,10 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
'_set_instance_error_state')
self.compute._get_power_state(mox.IgnoreArg(),
instance).AndReturn(power_state.SHUTDOWN)
self.compute._get_power_state(mox.IgnoreArg(),
instance).AndReturn(power_state.SHUTDOWN)
self.compute._get_power_state(mox.IgnoreArg(),
instance).AndReturn(power_state.SHUTDOWN)
self.compute.driver.plug_vifs(instance, mox.IgnoreArg())
self.compute._get_instance_volume_block_device_info(mox.IgnoreArg(),
instance).AndReturn('fake-bdm')
@ -330,7 +334,10 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
'_get_instance_volume_block_device_info')
self.mox.StubOutWithMock(self.compute.driver, 'get_info')
self.mox.StubOutWithMock(instance, 'save')
self.mox.StubOutWithMock(self.compute, '_retry_reboot')
self.compute._retry_reboot(self.context, instance).AndReturn(
(False, None))
compute_utils.get_nw_info_for_instance(instance).AndReturn(
network_model.NetworkInfo())
self.compute.driver.plug_vifs(instance, [])
@ -341,6 +348,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
instance.save()
self.compute.driver.get_info(instance).AndReturn(
{'state': power_state.SHUTDOWN})
self.compute.driver.get_info(instance).AndReturn(
{'state': power_state.SHUTDOWN})
self.mox.ReplayAll()
@ -472,6 +481,91 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.compute._init_instance(self.context, instance)
self.mox.VerifyAll()
def _test_init_instance_retries_reboot(self, instance, reboot_type,
return_power_state):
with contextlib.nested(
mock.patch.object(self.compute, '_get_power_state',
return_value=return_power_state),
mock.patch.object(self.compute.compute_rpcapi, 'reboot_instance'),
mock.patch.object(compute_utils, 'get_nw_info_for_instance')
) as (
_get_power_state,
reboot_instance,
get_nw_info_for_instance
):
self.compute._init_instance(self.context, instance)
call = mock.call(self.context, instance, block_device_info=None,
reboot_type=reboot_type)
reboot_instance.assert_has_calls([call])
def test_init_instance_retries_reboot_pending(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_PENDING
for state in vm_states.ALLOW_SOFT_REBOOT:
instance.vm_state = state
self._test_init_instance_retries_reboot(instance, 'SOFT',
power_state.RUNNING)
def test_init_instance_retries_reboot_pending_hard(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_PENDING_HARD
for state in vm_states.ALLOW_HARD_REBOOT:
# NOTE(dave-mcnally) while a reboot of a vm in error state is
# possible we don't attempt to recover an error during init
if state == vm_states.ERROR:
continue
instance.vm_state = state
self._test_init_instance_retries_reboot(instance, 'HARD',
power_state.RUNNING)
def test_init_instance_retries_reboot_started(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.vm_state = vm_states.ACTIVE
instance.task_state = task_states.REBOOT_STARTED
self._test_init_instance_retries_reboot(instance, 'HARD',
power_state.NOSTATE)
def test_init_instance_retries_reboot_started_hard(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.vm_state = vm_states.ACTIVE
instance.task_state = task_states.REBOOT_STARTED_HARD
self._test_init_instance_retries_reboot(instance, 'HARD',
power_state.NOSTATE)
def _test_init_instance_cleans_reboot_state(self, instance):
with contextlib.nested(
mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.RUNNING),
mock.patch.object(self.compute, '_instance_update'),
mock.patch.object(compute_utils, 'get_nw_info_for_instance')
) as (
_get_power_state,
_instance_update,
get_nw_info_for_instance
):
self.compute._init_instance(self.context, instance)
call = mock.call(self.context, 'foo', vm_state='active',
task_state=None)
_instance_update.assert_has_calls([call])
def test_init_instance_cleans_image_state_reboot_started(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.vm_state = vm_states.ACTIVE
instance.task_state = task_states.REBOOT_STARTED
self._test_init_instance_cleans_reboot_state(instance)
def test_init_instance_cleans_image_state_reboot_started_hard(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.vm_state = vm_states.ACTIVE
instance.task_state = task_states.REBOOT_STARTED_HARD
self._test_init_instance_cleans_reboot_state(instance)
def test_get_instances_on_driver(self):
fake_context = context.get_admin_context()
@ -935,6 +1029,74 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
events[1])
do_test()
def test_retry_reboot_pending_soft(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_PENDING
instance.vm_state = vm_states.ACTIVE
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.RUNNING):
allow_reboot, reboot_type = self.compute._retry_reboot(
context, instance)
self.assertTrue(allow_reboot)
self.assertEqual(reboot_type, 'SOFT')
def test_retry_reboot_pending_hard(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_PENDING_HARD
instance.vm_state = vm_states.ACTIVE
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.RUNNING):
allow_reboot, reboot_type = self.compute._retry_reboot(
context, instance)
self.assertTrue(allow_reboot)
self.assertEqual(reboot_type, 'HARD')
def test_retry_reboot_starting_soft_off(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_STARTED
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.NOSTATE):
allow_reboot, reboot_type = self.compute._retry_reboot(
context, instance)
self.assertTrue(allow_reboot)
self.assertEqual(reboot_type, 'HARD')
def test_retry_reboot_starting_hard_off(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_STARTED_HARD
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.NOSTATE):
allow_reboot, reboot_type = self.compute._retry_reboot(
context, instance)
self.assertTrue(allow_reboot)
self.assertEqual(reboot_type, 'HARD')
def test_retry_reboot_starting_hard_on(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = task_states.REBOOT_STARTED_HARD
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.RUNNING):
allow_reboot, reboot_type = self.compute._retry_reboot(
context, instance)
self.assertFalse(allow_reboot)
self.assertEqual(reboot_type, 'HARD')
def test_retry_reboot_no_reboot(self):
instance = instance_obj.Instance(self.context)
instance.uuid = 'foo'
instance.task_state = 'bar'
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.RUNNING):
allow_reboot, reboot_type = self.compute._retry_reboot(
context, instance)
self.assertFalse(allow_reboot)
self.assertEqual(reboot_type, 'HARD')
class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
def setUp(self):

View File

@ -23,6 +23,8 @@ import mock
from oslo.config import cfg
from nova.compute import flavors
from nova.compute import power_state
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova import context
from nova import db
@ -721,3 +723,27 @@ class ComputeUtilsGetNWInfo(test.TestCase):
self.assertIsNone(inst['info_cache'])
result = compute_utils.get_nw_info_for_instance(inst)
self.assertEqual(jsonutils.dumps([]), result.json())
class ComputeUtilsGetRebootTypes(test.TestCase):
def setUp(self):
super(ComputeUtilsGetRebootTypes, self).setUp()
self.context = context.RequestContext('fake', 'fake')
def test_get_reboot_type_started_soft(self):
reboot_type = compute_utils.get_reboot_type(task_states.REBOOT_STARTED,
power_state.RUNNING)
self.assertEqual(reboot_type, 'SOFT')
def test_get_reboot_type_pending_soft(self):
reboot_type = compute_utils.get_reboot_type(task_states.REBOOT_PENDING,
power_state.RUNNING)
self.assertEqual(reboot_type, 'SOFT')
def test_get_reboot_type_hard(self):
reboot_type = compute_utils.get_reboot_type('foo', power_state.RUNNING)
self.assertEqual(reboot_type, 'HARD')
def test_get_reboot_not_running_hard(self):
reboot_type = compute_utils.get_reboot_type('foo', 'bar')
self.assertEqual(reboot_type, 'HARD')