Fix scheduler error handler

Fixes bug 904971

The scheduler error handler was looking for instance_id, which may or
may not exist.  Added code to determine whether the instance was
actually created in the DB and, if so, how to find its ID.

Note: there's some pretty nasty stuff in here, but it's unavoidable
without larger changes.  I'd like to hold off on those larger changes,
because the problem should be solved by some of the upcoming
scalability work.

Tests included.

Change-Id: Ief5fde8128437c9dc257af9c4d0c2950d0962ce5
Chris Behrens 2011-12-15 14:16:42 -08:00
parent 3f7353d141
commit baf7e02f29
7 changed files with 70 additions and 16 deletions
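Before the per-file diffs, a minimal, self-contained sketch of the lookup
order the fixed handler uses (simplified from the SchedulerManager hunk
below; find_instance_id is a hypothetical name, not part of this commit):

    # Hypothetical, simplified version of the new lookup logic in
    # SchedulerManager._set_instance_error (see the manager diff below).
    def find_instance_id(kwargs):
        # Prefer an explicit uuid (or legacy id) passed by the caller,
        # as start_instance still does.
        instance_id = kwargs.get('instance_uuid', kwargs.get('instance_id'))
        if instance_id:
            return instance_id
        # run_instance only sends a request_spec; if the instance was
        # already created, its 'uuid' was stashed in instance_properties.
        request_spec = kwargs.get('request_spec', {})
        properties = request_spec.get('instance_properties', {})
        # May be None: the instance was never created in the DB, so
        # there is nothing to flip to ERROR.
        return properties.get('uuid')

    # Example: nothing was created yet, so the handler has nothing to do.
    assert find_instance_id({'request_spec': {'instance_properties': {}}}) is None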


@@ -310,7 +310,7 @@ class API(base.Base):
                         context, instance_type, image, base_options,
                         security_group, block_device_mapping)
                 # Tells scheduler we created the instance already.
-                base_options['id'] = instance['id']
+                base_options['uuid'] = instance['uuid']
                 rpc_method = rpc.cast
             else:
                 # We need to wait for the scheduler to create the instance
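The 'uuid' key in instance_properties acts as an "already created" marker
between the API and the scheduler. A rough sketch of that contract
(mark_precreated is a hypothetical helper, not the real compute API):

    # Hypothetical sketch: the API stamps the request_spec when it has
    # already created the DB record, so the scheduler won't create another.
    def mark_precreated(base_options, instance):
        # Tells scheduler we created the instance already.
        base_options['uuid'] = instance['uuid']

    base_options = {}
    mark_precreated(base_options, {'uuid': 'fake-uuid'})
    assert base_options.get('uuid') == 'fake-uuid'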


@@ -74,7 +74,10 @@ class ChanceScheduler(driver.Scheduler):
             driver.cast_to_compute_host(context, host,
                     'run_instance', instance_uuid=instance['uuid'], **kwargs)
             instances.append(driver.encode_instance(instance))
+            # So if we loop around, create_instance_db_entry will actually
+            # create a new entry, instead of assume it's been created
+            # already
+            del request_spec['instance_properties']['uuid']
         return instances
 
     def schedule_prep_resize(self, context, request_spec, *args, **kwargs):
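Why the del matters: schedule_run_instance loops once per requested
instance, and create_instance_db_entry treats a 'uuid' in
instance_properties as "already created". A minimal sketch of the failure
mode the del avoids (hypothetical in-memory stand-in for the DB layer):

    import uuid

    def create_instance_db_entry(request_spec):
        props = request_spec['instance_properties']
        if props.get('uuid'):
            # Marker present: return the existing instance instead of
            # creating a new one.
            return {'uuid': props['uuid']}
        instance = {'uuid': str(uuid.uuid4())}
        props['uuid'] = instance['uuid']  # write-back for error handling
        return instance

    spec = {'instance_properties': {}}
    first = create_instance_db_entry(spec)
    del spec['instance_properties']['uuid']  # the fix: reset per iteration
    second = create_instance_db_entry(spec)
    assert first['uuid'] != second['uuid']   # two distinct DB entries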


@@ -173,7 +173,12 @@ class DistributedScheduler(driver.Scheduler):
             instance = self.create_instance_db_entry(context, request_spec)
             driver.cast_to_compute_host(context, weighted_host.host,
                     'run_instance', instance_uuid=instance['uuid'], **kwargs)
-            return driver.encode_instance(instance, local=True)
+            inst = driver.encode_instance(instance, local=True)
+            # So if another instance is created, create_instance_db_entry will
+            # actually create a new entry, instead of assume it's been created
+            # already
+            del request_spec['instance_properties']['uuid']
+            return inst
 
     def _make_weighted_host_from_blob(self, blob):
         """Returns the decrypted blob as a WeightedHost object


@@ -147,9 +147,9 @@ class Scheduler(object):
     def create_instance_db_entry(self, context, request_spec):
         """Create instance DB entry based on request_spec"""
         base_options = request_spec['instance_properties']
-        if base_options.get('id'):
+        if base_options.get('uuid'):
             # Instance was already created before calling scheduler
-            return db.instance_get(context, base_options['id'])
+            return db.instance_get_by_uuid(context, base_options['uuid'])
         image = request_spec['image']
         instance_type = request_spec.get('instance_type')
         security_group = request_spec.get('security_group', 'default')
@@ -158,6 +158,10 @@ class Scheduler(object):
         instance = self.compute_api.create_db_entry_for_new_instance(
                 context, instance_type, image, base_options,
                 security_group, block_device_mapping)
+        # NOTE(comstud): This needs to be set for the generic exception
+        # checking in scheduler manager, so that it'll set this instance
+        # to ERROR properly.
+        base_options['uuid'] = instance['uuid']
         return instance
 
     def schedule(self, context, topic, method, *_args, **_kwargs):
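The NOTE(comstud) write-back is what ties the driver to the manager's
generic error handling: the uuid lands in the caller's request_spec, which
is still present in kwargs when the exception reaches _set_instance_error.
A condensed, hypothetical sketch of that round trip (no DB; RuntimeError
standing in for NoValidHost):

    def create_instance_db_entry(request_spec):
        base_options = request_spec['instance_properties']
        instance = {'uuid': 'fake-uuid'}         # stand-in for the DB create
        base_options['uuid'] = instance['uuid']  # write-back for error path
        return instance

    def schedule_run_instance(request_spec):
        create_instance_db_entry(request_spec)
        raise RuntimeError('no valid host')      # stand-in for NoValidHost

    kwargs = {'request_spec': {'instance_properties': {}}}
    try:
        schedule_run_instance(kwargs['request_spec'])
    except RuntimeError:
        props = kwargs['request_spec']['instance_properties']
        assert props.get('uuid') == 'fake-uuid'  # handler can find the instance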


@@ -108,18 +108,29 @@ class SchedulerManager(manager.Manager):
                 with utils.save_and_reraise_exception():
                     self._set_instance_error(method, context, ex, *args,
                                              **kwargs)
 
-    # NOTE (David Subiros) : If the exception is raised ruing run_instance
-    # method, the DB record probably does not exist yet.
+    # NOTE (David Subiros) : If the exception is raised during run_instance
+    # method, we may or may not have an instance_id
     def _set_instance_error(self, method, context, ex, *args, **kwargs):
         """Sets VM to Error state"""
         LOG.warning(_("Failed to schedule_%(method)s: %(ex)s") % locals())
-        if method == "start_instance" or method == "run_instance":
-            instance_id = kwargs['instance_id']
-            if instance_id:
-                LOG.warning(_("Setting instance %(instance_id)s to "
-                              "ERROR state.") % locals())
-                db.instance_update(context, instance_id,
-                                   {'vm_state': vm_states.ERROR})
+        if method != "start_instance" and method != "run_instance":
+            return
+        # FIXME(comstud): Clean this up after fully on UUIDs.
+        instance_id = kwargs.get('instance_uuid', kwargs.get('instance_id'))
+        if not instance_id:
+            # FIXME(comstud): We should make this easier.  run_instance
+            # only sends a request_spec, and an instance may or may not
+            # have been created in the API (or scheduler) already.  If it
+            # was created, there's a 'uuid' set in the instance_properties
+            # of the request_spec.
+            request_spec = kwargs.get('request_spec', {})
+            properties = request_spec.get('instance_properties', {})
+            instance_id = properties.get('uuid', {})
+        if instance_id:
+            LOG.warning(_("Setting instance %(instance_id)s to "
+                          "ERROR state.") % locals())
+            db.instance_update(context, instance_id,
+                               {'vm_state': vm_states.ERROR})
 
     # NOTE (masumotok) : This method should be moved to nova.api.ec2.admin.
     #                    Based on bexar design summit discussion,


@@ -82,6 +82,10 @@ class SimpleScheduler(chance.ChanceScheduler):
             driver.cast_to_compute_host(context, host, 'run_instance',
                     instance_uuid=instance_ref['uuid'], **_kwargs)
             instances.append(driver.encode_instance(instance_ref))
+            # So if we loop around, create_instance_db_entry will actually
+            # create a new entry, instead of assume it's been created
+            # already
+            del request_spec['instance_properties']['uuid']
         return instances
 
     def schedule_start_instance(self, context, instance_id, *_args, **_kwargs):


@@ -113,6 +113,7 @@ def _fake_create_instance_db_entry(simple_self, context, request_spec):
     instance = _create_instance_from_spec(request_spec)
     global instance_uuids
     instance_uuids.append(instance['uuid'])
+    request_spec['instance_properties']['uuid'] = instance['uuid']
     return instance

@@ -236,15 +237,41 @@ class SchedulerTestCase(test.TestCase):
         scheduler = manager.SchedulerManager()
         ins_ref = _create_instance(task_state=task_states.STARTING,
                                    vm_state=vm_states.STOPPED)
-        self.stubs = stubout.StubOutForTesting()
         self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
-        self.mox.StubOutWithMock(rpc, 'cast', use_mock_anything=True)
         ctxt = context.get_admin_context()
         scheduler.start_instance(ctxt, 'topic', instance_id=ins_ref['id'])
         # assert that the instance goes to ERROR state
         self._assert_state({'vm_state': vm_states.ERROR,
                             'task_state': task_states.STARTING})
 
+    def test_no_valid_host_exception_on_run_with_id(self):
+        """check the vm goes to ERROR state if run_instance fails"""
+
+        def NoValidHost_raiser(context, topic, *args, **kwargs):
+            raise exception.NoValidHost(_("Test NoValidHost exception"))
+        scheduler = manager.SchedulerManager()
+        ins_ref = _create_instance(task_state=task_states.STARTING,
+                                   vm_state=vm_states.STOPPED)
+        self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
+        ctxt = context.get_admin_context()
+        request_spec = {'instance_properties': {'uuid': ins_ref['uuid']}}
+        scheduler.run_instance(ctxt, 'topic', request_spec=request_spec)
+        # assert that the instance goes to ERROR state
+        self._assert_state({'vm_state': vm_states.ERROR,
+                            'task_state': task_states.STARTING})
+
+    def test_no_valid_host_exception_on_run_without_id(self):
+        """check error handler doesn't raise if instance wasn't created"""
+
+        def NoValidHost_raiser(context, topic, *args, **kwargs):
+            raise exception.NoValidHost(_("Test NoValidHost exception"))
+        scheduler = manager.SchedulerManager()
+        self.stubs.Set(TestDriver, 'schedule', NoValidHost_raiser)
+        ctxt = context.get_admin_context()
+        request_spec = {'instance_properties': {}}
+        scheduler.run_instance(ctxt, 'topic', request_spec=request_spec)
+        # No error
+
     def test_show_host_resources_no_project(self):
         """No instance are running on the given host."""