Recover from IMAGE-* state on compute manager start-up

If a compute manager is stopped / fails during certain IMAGE-*
operations, then the instance will be left stuck in a transitional
task_state.

This change handles three possible task states IMAGE_PENDING_UPLOAD,
IMAGE_UPLOADING, and IMAGE_SNAPSHOT.

Both IMAGE_PENDING_UPLOAD and IMAGE_UPLOADING are task states set
after the request has gotten to the compute manager and so we can
clear these safely knowing the operation has ended/failed with the
restart of the compute manager.

With IMAGE_SNAPSHOT it's possible this state was set in the API, so
we revert the instance to a task_state of None. The change also
makes provision to set the task_state to IMAGE_SNAPSHOT once the
request comes through to the compute manager, so the current
operation is clear once it begins again after the compute manager is
back up and running.

Related to blueprint recover-stuck-state
Change-Id: I665c0ec735968aa0d7553dfc0cfb2373f93451f6
This commit is contained in:
David McNally 2013-11-12 14:28:27 +00:00
parent ff84a42910
commit 56b4d81e1a
8 changed files with 69 additions and 6 deletions

View File

@ -909,7 +909,7 @@ class _TargetedMessageMethods(_BaseMessageMethods):
def snapshot_instance(self, message, instance, image_id):
"""Snapshot an instance in its cell."""
instance.refresh()
instance.task_state = task_states.IMAGE_SNAPSHOT
instance.task_state = task_states.IMAGE_SNAPSHOT_PENDING
instance.save(expected_task_state=[None])
self.compute_rpcapi.snapshot_instance(message.ctxt,
instance,

View File

@ -1855,8 +1855,7 @@ class API(base.Base):
# NOTE(comstud): Any changes to this method should also be made
# to the snapshot_instance() method in nova/cells/messaging.py
instance.task_state = task_states.IMAGE_SNAPSHOT
instance.task_state = task_states.IMAGE_SNAPSHOT_PENDING
instance.save(expected_task_state=[None])
self.compute_rpcapi.snapshot_instance(context, instance,

View File

@ -676,6 +676,16 @@ class ComputeManager(manager.Manager):
vm_state=vm_states.ERROR)
return
if (instance.vm_state != vm_states.ERROR and
instance.task_state in [task_states.IMAGE_PENDING_UPLOAD,
task_states.IMAGE_UPLOADING,
task_states.IMAGE_SNAPSHOT]):
LOG.debug(_("Instance in transitional state %s at start-up "
"clearing task state"),
instance['task_state'], instance=instance)
instance = self._instance_update(context, instance.uuid,
task_state=None)
net_info = compute_utils.get_nw_info_for_instance(instance)
try:
self.driver.plug_vifs(instance, net_info)
@ -2393,6 +2403,26 @@ class ComputeManager(manager.Manager):
:param instance: an Instance dict
:param image_id: glance.db.sqlalchemy.models.Image.Id
"""
# NOTE(dave-mcnally) the task state will already be set by the api
# but if the compute manager has crashed/been restarted prior to the
# request getting here the task state may have been cleared so we set
# it again and things continue normally
try:
instance.task_state = task_states.IMAGE_SNAPSHOT
instance.save(
expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
except exception.InstanceNotFound:
# possibility instance no longer exists, no point in continuing
LOG.debug(_("Instance not found, could not set state %s "
"for instance."),
task_states.IMAGE_SNAPSHOT, instance=instance)
return
except exception.UnexpectedDeletingTaskStateError:
LOG.debug(_("Instance being deleted, snapshot cannot continue"),
instance=instance)
return
self._snapshot_instance(context, image_id, instance,
task_states.IMAGE_SNAPSHOT)

View File

@ -33,6 +33,7 @@ SPAWNING = 'spawning'
# possible task states during snapshot()
IMAGE_SNAPSHOT = 'image_snapshot'
IMAGE_SNAPSHOT_PENDING = 'image_snapshot_pending'
IMAGE_PENDING_UPLOAD = 'image_pending_upload'
IMAGE_UPLOADING = 'image_uploading'

View File

@ -1288,7 +1288,8 @@ class CellsTargetedMethodsTestCase(test.TestCase):
self.mox.StubOutWithMock(meth_cls.compute_rpcapi, 'snapshot_instance')
def check_state(expected_task_state=None):
self.assertEqual(task_states.IMAGE_SNAPSHOT, inst.task_state)
self.assertEqual(task_states.IMAGE_SNAPSHOT_PENDING,
inst.task_state)
inst.refresh()
inst.save(expected_task_state=[None]).WithSideEffects(check_state)

View File

@ -2515,7 +2515,7 @@ class ComputeTestCase(BaseTestCase):
None, True, None, False)
instance = db.instance_update(
self.context, instance['uuid'],
{"task_state": task_states.IMAGE_SNAPSHOT})
{"task_state": task_states.IMAGE_SNAPSHOT_PENDING})
return self._objectify(instance)
def test_snapshot(self):

View File

@ -1328,7 +1328,8 @@ class _ComputeAPIUnitTestMixIn(object):
mock_method.AndReturn(fake_image)
def check_state(expected_task_state=None):
expected_state = (is_snapshot and task_states.IMAGE_SNAPSHOT or
expected_state = (is_snapshot and
task_states.IMAGE_SNAPSHOT_PENDING or
task_states.IMAGE_BACKUP)
self.assertEqual(expected_state, instance.task_state)

View File

@ -384,6 +384,37 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
instance.task_state = task_states.SPAWNING
self._test_init_instance_sets_building_tasks_error(instance)
def _test_init_instance_cleans_image_states(self, instance):
    """Check that _init_instance resets the task state of ``instance``.

    Patches ``_instance_update`` on the compute manager under test, runs
    ``_init_instance`` and asserts that the patched method was called
    with ``task_state=None`` for the uuid ``'foo'``.

    :param instance: the instance passed to ``_init_instance``; callers
                     are expected to give it the uuid ``'foo'``
    """
    # A single context manager does not need contextlib.nested (which is
    # deprecated in Python 2.7 and gone in Python 3); a plain ``with``
    # binds the same mock object.
    with mock.patch.object(self.compute,
                           '_instance_update') as _instance_update:
        self.compute._init_instance(self.context, instance)
        call = mock.call(self.context, 'foo', task_state=None)
        _instance_update.assert_has_calls([call])
def test_init_instance_cleans_image_state_pending_upload(self):
    """An ACTIVE instance in IMAGE_PENDING_UPLOAD goes through the
    shared image-state clean-up check.
    """
    stuck = instance_obj.Instance(self.context)
    stuck.task_state = task_states.IMAGE_PENDING_UPLOAD
    stuck.vm_state = vm_states.ACTIVE
    # The shared helper asserts on this exact uuid.
    stuck.uuid = 'foo'
    self._test_init_instance_cleans_image_states(stuck)
def test_init_instance_cleans_image_state_uploading(self):
    """An ACTIVE instance in IMAGE_UPLOADING goes through the shared
    image-state clean-up check.
    """
    stuck = instance_obj.Instance(self.context)
    stuck.task_state = task_states.IMAGE_UPLOADING
    stuck.vm_state = vm_states.ACTIVE
    # The shared helper asserts on this exact uuid.
    stuck.uuid = 'foo'
    self._test_init_instance_cleans_image_states(stuck)
def test_init_instance_cleans_image_state_snapshot(self):
    """An ACTIVE instance in IMAGE_SNAPSHOT goes through the shared
    image-state clean-up check.
    """
    stuck = instance_obj.Instance(self.context)
    stuck.task_state = task_states.IMAGE_SNAPSHOT
    stuck.vm_state = vm_states.ACTIVE
    # The shared helper asserts on this exact uuid.
    stuck.uuid = 'foo'
    self._test_init_instance_cleans_image_states(stuck)
def test_get_instances_on_driver(self):
fake_context = context.get_admin_context()