Fix scheduler error handler

Fixes bug 904971

The scheduler error handler was looking for instance_id, which may or
may not exist.  Added code to determine whether the instance was
actually created in the DB and, if so, how to find its ID.

Note: there's some pretty nasty stuff in here, but it's unavoidable
without larger changes.  I'd like to hold off on those larger changes,
because the problem should be solved by some of the upcoming
scalability work.

Tests included.

Change-Id: Ief5fde8128437c9dc257af9c4d0c2950d0962ce5
Chris Behrens 2011-12-15 14:16:42 -08:00
parent 3f7353d141
commit baf7e02f29
7 changed files with 70 additions and 16 deletions
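Before the per-file diffs, a minimal, self-contained sketch of the lookup
order the fixed handler uses (simplified from the SchedulerManager hunk
below; find_instance_id is a hypothetical name, not part of this commit):

    # Hypothetical, simplified version of the new lookup logic in
    # SchedulerManager._set_instance_error (see the manager diff below).
    def find_instance_id(kwargs):
        # Prefer an explicit uuid (or legacy id) passed by the caller,
        # as start_instance still does.
        instance_id = kwargs.get('instance_uuid', kwargs.get('instance_id'))
        if instance_id:
            return instance_id
        # run_instance only sends a request_spec; if the instance was
        # already created, its 'uuid' was stashed in instance_properties.
        request_spec = kwargs.get('request_spec', {})
        properties = request_spec.get('instance_properties', {})
        # May be None: the instance was never created in the DB, so
        # there is nothing to flip to ERROR.
        return properties.get('uuid')

    # Example: nothing was created yet, so the handler has nothing to do.
    assert find_instance_id({'request_spec': {'instance_properties': {}}}) is None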


@@ -310,7 +310,7 @@ class API(base.Base):
                         context, instance_type, image, base_options,
                         security_group, block_device_mapping)
                 # Tells scheduler we created the instance already.
-                base_options['id'] = instance['id']
+                base_options['uuid'] = instance['uuid']
                 rpc_method = rpc.cast
             else:
                 # We need to wait for the scheduler to create the instance
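The 'uuid' key in instance_properties acts as an "already created" marker
between the API and the scheduler. A rough sketch of that contract
(mark_precreated is a hypothetical helper, not the real compute API):

    # Hypothetical sketch: the API stamps the request_spec when it has
    # already created the DB record, so the scheduler won't create another.
    def mark_precreated(base_options, instance):
        # Tells scheduler we created the instance already.
        base_options['uuid'] = instance['uuid']

    base_options = {}
    mark_precreated(base_options, {'uuid': 'fake-uuid'})
    assert base_options.get('uuid') == 'fake-uuid'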


@@ -74,7 +74,10 @@ class ChanceScheduler(driver.Scheduler):
             driver.cast_to_compute_host(context, host,
                     'run_instance', instance_uuid=instance['uuid'], **kwargs)
             instances.append(driver.encode_instance(instance))
+            # So if we loop around, create_instance_db_entry will actually
+            # create a new entry, instead of assume it's been created
+            # already
+            del request_spec['instance_properties']['uuid']
         return instances
 
     def schedule_prep_resize(self, context, request_spec, *args, **kwargs):
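Why the del matters: schedule_run_instance loops once per requested
instance, and create_instance_db_entry treats a 'uuid' in
instance_properties as "already created". A minimal sketch of the failure
mode the del avoids (hypothetical in-memory stand-in for the DB layer):

    import uuid

    def create_instance_db_entry(request_spec):
        props = request_spec['instance_properties']
        if props.get('uuid'):
            # Marker present: return the existing instance instead of
            # creating a new one.
            return {'uuid': props['uuid']}
        instance = {'uuid': str(uuid.uuid4())}
        props['uuid'] = instance['uuid']  # write-back for error handling
        return instance

    spec = {'instance_properties': {}}
    first = create_instance_db_entry(spec)
    del spec['instance_properties']['uuid']  # the fix: reset per iteration
    second = create_instance_db_entry(spec)
    assert first['uuid'] != second['uuid']   # two distinct DB entries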


@@ -173,7 +173,12 @@ class DistributedScheduler(driver.Scheduler):
             instance = self.create_instance_db_entry(context, request_spec)
             driver.cast_to_compute_host(context, weighted_host.host,
                     'run_instance', instance_uuid=instance['uuid'], **kwargs)
-            return driver.encode_instance(instance, local=True)
+            inst = driver.encode_instance(instance, local=True)
+            # So if another instance is created, create_instance_db_entry will
+            # actually create a new entry, instead of assume it's been created
+            # already
+            del request_spec['instance_properties']['uuid']
+            return inst
 
     def _make_weighted_host_from_blob(self, blob):
         """Returns the decrypted blob as a WeightedHost object


@@ -147,9 +147,9 @@ class Scheduler(object):
     def create_instance_db_entry(self, context, request_spec):
         """Create instance DB entry based on request_spec"""
         base_options = request_spec['instance_properties']
-        if base_options.get('id'):
+        if base_options.get('uuid'):
             # Instance was already created before calling scheduler
-            return db.instance_get(context, base_options['id'])
+            return db.instance_get_by_uuid(context, base_options['uuid'])
         image = request_spec['image']
         instance_type = request_spec.get('instance_type')
         security_group = request_spec.get('security_group', 'default')
@@ -158,6 +158,10 @@ class Scheduler(object):
         instance = self.compute_api.create_db_entry_for_new_instance(
                 context, instance_type, image, base_options,
                 security_group, block_device_mapping)
+        # NOTE(comstud): This needs to be set for the generic exception
+        # checking in scheduler manager, so that it'll set this instance
+        # to ERROR properly.
+        base_options['uuid'] = instance['uuid']
         return instance
 
     def schedule(self, context, topic, method, *_args, **_kwargs):
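The NOTE(comstud) write-back is what ties the driver to the manager's
generic error handling: the uuid lands in the caller's request_spec, which
is still present in kwargs when the exception reaches _set_instance_error.
A condensed, hypothetical sketch of that round trip (no DB; RuntimeError
standing in for NoValidHost):

    def create_instance_db_entry(request_spec):
        base_options = request_spec['instance_properties']
        instance = {'uuid': 'fake-uuid'}         # stand-in for the DB create
        base_options['uuid'] = instance['uuid']  # write-back for error path
        return instance

    def schedule_run_instance(request_spec):
        create_instance_db_entry(request_spec)
        raise RuntimeError('no valid host')      # stand-in for NoValidHost

    kwargs = {'request_spec': {'instance_properties': {}}}
    try:
        schedule_run_instance(kwargs['request_spec'])
    except RuntimeError:
        props = kwargs['request_spec']['instance_properties']
        assert props.get('uuid') == 'fake-uuid'  # handler can find the instance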


@@ -108,18 +108,29 @@ class SchedulerManager(manager.Manager):
                 with utils.save_and_reraise_exception():
                     self._set_instance_error(method, context, ex, *args,
                                              **kwargs)
 
-    # NOTE (David Subiros) : If the exception is raised ruing run_instance
-    # method, the DB record probably does not exist yet.
+    # NOTE (David Subiros) : If the exception is raised during run_instance
+    # method, we may or may not have an instance_id
     def _set_instance_error(self, method, context, ex, *args, **kwargs):
         """Sets VM to Error state"""
         LOG.warning(_("Failed to schedule_%(method)s: %(ex)s") % locals())
-        if method == "start_instance" or method == "run_instance":
-            instance_id = kwargs['instance_id']
-            if instance_id:
-                LOG.warning(_("Setting instance %(instance_id)s to "
-                              "ERROR state.") % locals())
-                db.instance_update(context, instance_id,
-                                   {'vm_state': vm_states.ERROR})
+        if method != "start_instance" and method != "run_instance":
+            return
+        # FIXME(comstud): Clean this up after fully on UUIDs.
+        instance_id = kwargs.get('instance_uuid', kwargs.get('instance_id'))
+        if not instance_id:
+            # FIXME(comstud): We should make this easier.  run_instance
+            # only sends a request_spec, and an instance may or may not
+            # have been created in the API (or scheduler) already.  If it
+            # was created, there's a 'uuid' set in the instance_properties
+            # of the request_spec.
+            request_spec = kwargs.get('request_spec', {})
+            properties = request_spec.get('instance_properties', {})
+            instance_id = properties.get('uuid', {})
+        if instance_id:
+            LOG.warning(_("Setting instance %(instance_id)s to "
+                          "ERROR state.") % locals())
+            db.instance_update(context, instance_id,
+                               {'vm_state': vm_states.ERROR})
 
     # NOTE (masumotok) : This method should be moved to nova.api.ec2.admin.
     #                    Based on bexar design summit discussion,


@@ -82,6 +82,10 @@ class SimpleScheduler(chance.ChanceScheduler):
             driver.cast_to_compute_host(context, host, 'run_instance',
                     instance_uuid=instance_ref['uuid'], **_kwargs)
             instances.append(driver.encode_instance(instance_ref))
+            # So if we loop around, create_instance_db_entry will actually
+            # create a new entry, instead of assume it's been created
+            # already
+            del request_spec['instance_properties']['uuid']
         return instances
 
     def schedule_start_instance(self, context, instance_id, *_args, **_kwargs):


@@ -113,6 +113,7 @@ def _fake_create_instance_db_entry(simple_self, context, request_spec):
     instance = _create_instance_from_spec(request_spec)
     global instance_uuids
     instance_uuids.append(instance['uuid'])
+    request_spec['instance_properties']['uuid'] = instance['uuid']
     return instance

@@ -236,15 +237,41 @@ class SchedulerTestCase(test.TestCase):
         scheduler = manager.SchedulerManager()
         ins_ref = _create_instance(task_state=task_states.STARTING,
                                    vm_state=vm_states.STOPPED)
-        self.stubs = stubout.StubOutForTesting()
         self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
-        self.mox.StubOutWithMock(rpc, 'cast', use_mock_anything=True)
         ctxt = context.get_admin_context()
         scheduler.start_instance(ctxt, 'topic', instance_id=ins_ref['id'])
         # assert that the instance goes to ERROR state
         self._assert_state({'vm_state': vm_states.ERROR,
                             'task_state': task_states.STARTING})
 
+    def test_no_valid_host_exception_on_run_with_id(self):
+        """check the vm goes to ERROR state if run_instance fails"""
+
+        def NoValidHost_raiser(context, topic, *args, **kwargs):
+            raise exception.NoValidHost(_("Test NoValidHost exception"))
+        scheduler = manager.SchedulerManager()
+        ins_ref = _create_instance(task_state=task_states.STARTING,
+                                   vm_state=vm_states.STOPPED)
+        self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
+        ctxt = context.get_admin_context()
+        request_spec = {'instance_properties': {'uuid': ins_ref['uuid']}}
+        scheduler.run_instance(ctxt, 'topic', request_spec=request_spec)
+        # assert that the instance goes to ERROR state
+        self._assert_state({'vm_state': vm_states.ERROR,
+                            'task_state': task_states.STARTING})
+
+    def test_no_valid_host_exception_on_run_without_id(self):
+        """check error handler doesn't raise if instance wasn't created"""
+
+        def NoValidHost_raiser(context, topic, *args, **kwargs):
+            raise exception.NoValidHost(_("Test NoValidHost exception"))
+        scheduler = manager.SchedulerManager()
+        self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
+        ctxt = context.get_admin_context()
+        request_spec = {'instance_properties': {}}
+        scheduler.run_instance(ctxt, 'topic', request_spec=request_spec)
+        # No error
+
     def test_show_host_resources_no_project(self):
         """No instance are running on the given host."""