Recover from IMAGE-* state on compute manager start-up
If a compute manager is stopped or fails during certain IMAGE-* operations, the instance is left stuck in a transitional task_state. This change handles three possible task states: IMAGE_PENDING_UPLOAD, IMAGE_UPLOADING, and IMAGE_SNAPSHOT. Both IMAGE_PENDING_UPLOAD and IMAGE_UPLOADING are task states set after the request has reached the compute manager, so we can clear them safely, knowing the operation ended/failed when the compute manager restarted. With IMAGE_SNAPSHOT it is possible that the state was set in the API, so we revert the instance to a task_state of None; the change also makes provision to set the task_state to IMAGE_SNAPSHOT once the request comes through to the compute manager, so the current operation is clear once it begins again after the compute manager is back up and running. Related to blueprint recover-stuck-state. Change-Id: I665c0ec735968aa0d7553dfc0cfb2373f93451f6
This commit is contained in:
parent
ff84a42910
commit
56b4d81e1a
@ -909,7 +909,7 @@ class _TargetedMessageMethods(_BaseMessageMethods):
|
||||
def snapshot_instance(self, message, instance, image_id):
|
||||
"""Snapshot an instance in its cell."""
|
||||
instance.refresh()
|
||||
instance.task_state = task_states.IMAGE_SNAPSHOT
|
||||
instance.task_state = task_states.IMAGE_SNAPSHOT_PENDING
|
||||
instance.save(expected_task_state=[None])
|
||||
self.compute_rpcapi.snapshot_instance(message.ctxt,
|
||||
instance,
|
||||
|
@ -1855,8 +1855,7 @@ class API(base.Base):
|
||||
|
||||
# NOTE(comstud): Any changes to this method should also be made
|
||||
# to the snapshot_instance() method in nova/cells/messaging.py
|
||||
|
||||
instance.task_state = task_states.IMAGE_SNAPSHOT
|
||||
instance.task_state = task_states.IMAGE_SNAPSHOT_PENDING
|
||||
instance.save(expected_task_state=[None])
|
||||
|
||||
self.compute_rpcapi.snapshot_instance(context, instance,
|
||||
|
@ -676,6 +676,16 @@ class ComputeManager(manager.Manager):
|
||||
vm_state=vm_states.ERROR)
|
||||
return
|
||||
|
||||
if (instance.vm_state != vm_states.ERROR and
|
||||
instance.task_state in [task_states.IMAGE_PENDING_UPLOAD,
|
||||
task_states.IMAGE_UPLOADING,
|
||||
task_states.IMAGE_SNAPSHOT]):
|
||||
LOG.debug(_("Instance in transitional state %s at start-up "
|
||||
"clearing task state"),
|
||||
instance['task_state'], instance=instance)
|
||||
instance = self._instance_update(context, instance.uuid,
|
||||
task_state=None)
|
||||
|
||||
net_info = compute_utils.get_nw_info_for_instance(instance)
|
||||
try:
|
||||
self.driver.plug_vifs(instance, net_info)
|
||||
@ -2393,6 +2403,26 @@ class ComputeManager(manager.Manager):
|
||||
:param instance: an Instance dict
|
||||
:param image_id: glance.db.sqlalchemy.models.Image.Id
|
||||
"""
|
||||
# NOTE(dave-mcnally) the task state will already be set by the api
|
||||
# but if the compute manager has crashed/been restarted prior to the
|
||||
# request getting here the task state may have been cleared so we set
|
||||
# it again and things continue normally
|
||||
try:
|
||||
instance.task_state = task_states.IMAGE_SNAPSHOT
|
||||
instance.save(
|
||||
expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
|
||||
except exception.InstanceNotFound:
|
||||
# possibly the instance no longer exists, so there is no point in continuing
|
||||
LOG.debug(_("Instance not found, could not set state %s "
|
||||
"for instance."),
|
||||
task_states.IMAGE_SNAPSHOT, instance=instance)
|
||||
return
|
||||
|
||||
except exception.UnexpectedDeletingTaskStateError:
|
||||
LOG.debug(_("Instance being deleted, snapshot cannot continue"),
|
||||
instance=instance)
|
||||
return
|
||||
|
||||
self._snapshot_instance(context, image_id, instance,
|
||||
task_states.IMAGE_SNAPSHOT)
|
||||
|
||||
|
@ -33,6 +33,7 @@ SPAWNING = 'spawning'
|
||||
|
||||
# possible task states during snapshot()
|
||||
IMAGE_SNAPSHOT = 'image_snapshot'
|
||||
IMAGE_SNAPSHOT_PENDING = 'image_snapshot_pending'
|
||||
IMAGE_PENDING_UPLOAD = 'image_pending_upload'
|
||||
IMAGE_UPLOADING = 'image_uploading'
|
||||
|
||||
|
@ -1288,7 +1288,8 @@ class CellsTargetedMethodsTestCase(test.TestCase):
|
||||
self.mox.StubOutWithMock(meth_cls.compute_rpcapi, 'snapshot_instance')
|
||||
|
||||
def check_state(expected_task_state=None):
|
||||
self.assertEqual(task_states.IMAGE_SNAPSHOT, inst.task_state)
|
||||
self.assertEqual(task_states.IMAGE_SNAPSHOT_PENDING,
|
||||
inst.task_state)
|
||||
|
||||
inst.refresh()
|
||||
inst.save(expected_task_state=[None]).WithSideEffects(check_state)
|
||||
|
@ -2515,7 +2515,7 @@ class ComputeTestCase(BaseTestCase):
|
||||
None, True, None, False)
|
||||
instance = db.instance_update(
|
||||
self.context, instance['uuid'],
|
||||
{"task_state": task_states.IMAGE_SNAPSHOT})
|
||||
{"task_state": task_states.IMAGE_SNAPSHOT_PENDING})
|
||||
return self._objectify(instance)
|
||||
|
||||
def test_snapshot(self):
|
||||
|
@ -1328,7 +1328,8 @@ class _ComputeAPIUnitTestMixIn(object):
|
||||
mock_method.AndReturn(fake_image)
|
||||
|
||||
def check_state(expected_task_state=None):
|
||||
expected_state = (is_snapshot and task_states.IMAGE_SNAPSHOT or
|
||||
expected_state = (is_snapshot and
|
||||
task_states.IMAGE_SNAPSHOT_PENDING or
|
||||
task_states.IMAGE_BACKUP)
|
||||
self.assertEqual(expected_state, instance.task_state)
|
||||
|
||||
|
@ -384,6 +384,37 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
|
||||
instance.task_state = task_states.SPAWNING
|
||||
self._test_init_instance_sets_building_tasks_error(instance)
|
||||
|
||||
def _test_init_instance_cleans_image_states(self, instance):
|
||||
with contextlib.nested(
|
||||
mock.patch.object(self.compute, '_instance_update')
|
||||
) as (
|
||||
_instance_update,
|
||||
):
|
||||
self.compute._init_instance(self.context, instance)
|
||||
call = mock.call(self.context, 'foo', task_state=None)
|
||||
_instance_update.assert_has_calls([call])
|
||||
|
||||
def test_init_instance_cleans_image_state_pending_upload(self):
|
||||
instance = instance_obj.Instance(self.context)
|
||||
instance.uuid = 'foo'
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.IMAGE_PENDING_UPLOAD
|
||||
self._test_init_instance_cleans_image_states(instance)
|
||||
|
||||
def test_init_instance_cleans_image_state_uploading(self):
|
||||
instance = instance_obj.Instance(self.context)
|
||||
instance.uuid = 'foo'
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.IMAGE_UPLOADING
|
||||
self._test_init_instance_cleans_image_states(instance)
|
||||
|
||||
def test_init_instance_cleans_image_state_snapshot(self):
|
||||
instance = instance_obj.Instance(self.context)
|
||||
instance.uuid = 'foo'
|
||||
instance.vm_state = vm_states.ACTIVE
|
||||
instance.task_state = task_states.IMAGE_SNAPSHOT
|
||||
self._test_init_instance_cleans_image_states(instance)
|
||||
|
||||
def test_get_instances_on_driver(self):
|
||||
fake_context = context.get_admin_context()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user