Add the instance update calls from Compute

Now that the Scheduler has RPC calls for receiving updates from compute
whenever a compute node's instances change significantly, this change
adds the compute-side calls that send those updates.
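
As a rough sketch of the resulting flow (illustrative only; the client
method and argument names are taken from the diff below, but this is
not code from the commit):

    from nova import context as nova_context
    from nova import objects
    from nova.scheduler import client as scheduler_client

    def example_notifications(host, instance, deleted_uuid):
        ctxt = nova_context.get_admin_context()
        client = scheduler_client.SchedulerClient()
        # Instance created, resized, rebuilt, unshelved or migrated in:
        client.update_instance_info(
            ctxt, host, objects.InstanceList(objects=[instance]))
        # Instance deleted or shelve-offloaded from this host:
        client.delete_instance_info(ctxt, host, deleted_uuid)
        # Periodic full sync of the host's current instance UUIDs:
        uuids = [inst.uuid for inst in
                 objects.InstanceList.get_by_host(ctxt, host)]
        client.sync_instance_info(ctxt, host, uuids)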

DocImpact
We add a new CONF option 'scheduler_instance_sync_interval'. This
option is an integer giving the number of seconds the compute node
waits between sending sync messages to the Scheduler; it defaults to
120 (2 minutes). If the CONF option 'scheduler_tracks_instance_changes'
is False, the sync calls are not made at all. If an operator sees in
the logs that out-of-sync situations are rare, this value can be
increased to reduce the number of RPC messages sent. Likewise, if sync
issues turn out to be a problem, it can be lowered so the check runs
more frequently.
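
As an illustration only (the values are hypothetical tuning choices,
not defaults changed by this commit), an operator who rarely sees
out-of-sync messages might set the following in nova.conf:

    [DEFAULT]
    # Pre-existing option; per-instance updates stay enabled.
    scheduler_tracks_instance_changes = True
    # Send the periodic UUID sync every 5 minutes instead of every 2.
    scheduler_instance_sync_interval = 300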

Partially-Implements: blueprint isolate-scheduler-db
(https://blueprints.launchpad.net/nova/+spec/isolate-scheduler-db)

Change-Id: I5e93ec949e2fa0e29c52beb20201696caad85776
Author: EdLeafe
Date: 2015-03-02 18:00:00 +00:00
parent 936f914035
commit b798f7c711
4 changed files with 153 additions and 26 deletions

View File

@@ -82,6 +82,7 @@ from nova.openstack.common import periodic_task
from nova import paths
from nova import rpc
from nova import safe_utils
from nova.scheduler import client as scheduler_client
from nova.scheduler import rpcapi as scheduler_rpcapi
from nova import utils
from nova.virt import block_device as driver_block_device
@@ -171,7 +172,14 @@ interval_opts = [
cfg.IntOpt('block_device_allocate_retries_interval',
default=3,
help='Waiting time interval (seconds) between block'
' device allocation retries on failures')
' device allocation retries on failures'),
cfg.IntOpt('scheduler_instance_sync_interval',
default=120,
help='Waiting time interval (seconds) between sending the '
'scheduler a list of current instance UUIDs to verify '
'that its view of instances is in sync with nova. If the '
'CONF option `scheduler_tracks_instance_changes` is '
'False, changing this option will have no effect.'),
]
timeout_opts = [
@@ -241,7 +249,8 @@ CONF.import_opt('html5_proxy_base_url', 'nova.rdp', group='rdp')
CONF.import_opt('enabled', 'nova.console.serial', group='serial_console')
CONF.import_opt('base_url', 'nova.console.serial', group='serial_console')
CONF.import_opt('destroy_after_evacuate', 'nova.utils', group='workarounds')
CONF.import_opt('scheduler_tracks_instance_changes',
'nova.scheduler.host_manager')
LOG = logging.getLogger(__name__)
@@ -632,10 +641,12 @@ class ComputeManager(manager.Manager):
self.consoleauth_rpcapi = consoleauth.rpcapi.ConsoleAuthAPI()
self.cells_rpcapi = cells_rpcapi.CellsAPI()
self.scheduler_rpcapi = scheduler_rpcapi.SchedulerAPI()
self.scheduler_client = scheduler_client.SchedulerClient()
self._resource_tracker_dict = {}
self.instance_events = InstanceEvents()
self._sync_power_pool = eventlet.GreenPool()
self._syncs_in_progress = {}
self.send_instance_updates = CONF.scheduler_tracks_instance_changes
if CONF.max_concurrent_builds != 0:
self._build_semaphore = eventlet.semaphore.Semaphore(
CONF.max_concurrent_builds)
@@ -864,6 +875,7 @@ class ComputeManager(manager.Manager):
else:
self.consoleauth_rpcapi.delete_tokens_for_instance(context,
instance.uuid)
self._delete_scheduler_instance_info(context, instance.uuid)
def _init_instance(self, context, instance):
'''Initialize this instance during service init.'''
@@ -1217,6 +1229,7 @@ class ComputeManager(manager.Manager):
finally:
if CONF.defer_iptables_apply:
self.driver.filter_defer_apply_off()
self._update_scheduler_instance_info(context, instances)
def cleanup_host(self):
self.driver.cleanup_host(host=self.host)
@@ -1969,6 +1982,40 @@ class ComputeManager(manager.Manager):
        instance.save(expected_task_state=task_states.SPAWNING)
        return instance

    def _update_scheduler_instance_info(self, context, instance):
        """Sends an InstanceList with created or updated Instance objects to
        the Scheduler client.

        In the case of init_host, the value passed will already be an
        InstanceList. Other calls will send individual Instance objects that
        have been created or resized. In this case, we create an InstanceList
        object containing that Instance.
        """
        if not self.send_instance_updates:
            return
        if isinstance(instance, objects.Instance):
            instance = objects.InstanceList(objects=[instance])
        context = context.elevated()
        self.scheduler_client.update_instance_info(context, self.host,
                                                   instance)

    def _delete_scheduler_instance_info(self, context, instance_uuid):
        """Sends the uuid of the deleted Instance to the Scheduler client."""
        if not self.send_instance_updates:
            return
        context = context.elevated()
        self.scheduler_client.delete_instance_info(context, self.host,
                                                   instance_uuid)

    @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
    def _sync_scheduler_instance_info(self, context):
        if not self.send_instance_updates:
            return
        context = context.elevated()
        instances = objects.InstanceList.get_by_host(context, self.host)
        uuids = [instance.uuid for instance in instances]
        self.scheduler_client.sync_instance_info(context, self.host, uuids)

    def _notify_about_instance_usage(self, context, instance, event_suffix,
                                     network_info=None, system_metadata=None,
                                     extra_usage_info=None, fault=None):
@@ -2275,6 +2322,7 @@ class ComputeManager(manager.Manager):
self._notify_about_instance_usage(context, instance,
'create.end', fault=e)
self._update_scheduler_instance_info(context, instance)
self._notify_about_instance_usage(context, instance, 'create.end',
extra_usage_info={'message': _('Success')},
network_info=network_info)
@@ -2939,7 +2987,7 @@ class ComputeManager(manager.Manager):
instance.progress = 0
instance.save()
self.stop_instance(context, instance)
self._update_scheduler_instance_info(context, instance)
self._notify_about_instance_usage(
context, instance, "rebuild.end",
network_info=network_info,
@@ -3985,6 +4033,7 @@ class ComputeManager(manager.Manager):
instance.launched_at = timeutils.utcnow()
instance.save(expected_task_state=task_states.RESIZE_FINISH)
self._update_scheduler_instance_info(context, instance)
self._notify_about_instance_usage(
context, instance, "finish_resize.end",
network_info=network_info)
@@ -4298,6 +4347,7 @@ class ComputeManager(manager.Manager):
instance.task_state = None
instance.save(expected_task_state=[task_states.SHELVING,
task_states.SHELVING_OFFLOADING])
self._delete_scheduler_instance_info(context, instance.uuid)
self._notify_about_instance_usage(context, instance,
'shelve_offload.end')
@@ -4383,6 +4433,7 @@ class ComputeManager(manager.Manager):
self._unshelve_instance_key_restore(instance, scrubbed_keys)
self._update_instance_after_spawn(context, instance)
instance.save(expected_task_state=task_states.SPAWNING)
self._update_scheduler_instance_info(context, instance)
self._notify_about_instance_usage(context, instance, 'unshelve.end')
@messaging.expected_exceptions(NotImplementedError)
@@ -5245,6 +5296,7 @@ class ComputeManager(manager.Manager):
# host even before next periodic task.
self.update_available_resource(ctxt)
self._update_scheduler_instance_info(ctxt, instance)
self._notify_about_instance_usage(ctxt, instance,
"live_migration._post.end",
network_info=network_info)

View File

@@ -26,6 +26,7 @@ from nova.api.ec2 import cloud
from nova.api.ec2 import ec2utils
from nova.compute import api as compute_api
from nova.compute import flavors
from nova.compute import manager as compute_manager
from nova.compute import utils as compute_utils
from nova import context
from nova import db
@@ -153,6 +154,12 @@ class CinderCloudTestCase(test.TestCase):
self.volume_api = volume.API()
self.volume_api.reset_fake_api(self.context)
self.stubs.Set(compute_manager.ComputeManager,
'_update_scheduler_instance_info', dumb)
self.stubs.Set(compute_manager.ComputeManager,
'_delete_scheduler_instance_info', dumb)
self.stubs.Set(compute_manager.ComputeManager,
'_sync_scheduler_instance_info', dumb)
self.useFixture(cast_as_call.CastAsCall(self.stubs))
# make sure we can map ami-00000001/2 to a uuid in FakeImageService

View File

@@ -39,6 +39,7 @@ from nova.api.metadata import password
from nova import availability_zones
from nova.compute import api as compute_api
from nova.compute import flavors
from nova.compute import manager as compute_manager
from nova.compute import power_state
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import utils as compute_utils
@@ -201,6 +202,12 @@ class CloudTestCase(test.TestCase):
is_admin=True)
self.volume_api = volume.API()
self.stubs.Set(compute_manager.ComputeManager,
'_update_scheduler_instance_info', dumb)
self.stubs.Set(compute_manager.ComputeManager,
'_delete_scheduler_instance_info', dumb)
self.stubs.Set(compute_manager.ComputeManager,
'_sync_scheduler_instance_info', dumb)
self.useFixture(cast_as_call.CastAsCall(self.stubs))
# make sure we can map ami-00000001/2 to a uuid in FakeImageService

View File

@@ -26,6 +26,7 @@ from oslo_utils import importutils
from oslo_utils import timeutils
from oslo_utils import uuidutils
import nova
from nova.compute import build_results
from nova.compute import manager
from nova.compute import power_state
@@ -383,7 +384,6 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
def test_init_host(self):
our_host = self.compute.host
fake_context = 'fake-context'
inst = fake_instance.fake_db_instance(
vm_state=vm_states.ACTIVE,
info_cache=dict(test_instance_info_cache.fake_info_cache,
@@ -393,19 +393,19 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
def _do_mock_calls(defer_iptables_apply):
self.compute.driver.init_host(host=our_host)
context.get_admin_context().AndReturn(fake_context)
context.get_admin_context().AndReturn(self.context)
db.instance_get_all_by_host(
fake_context, our_host, columns_to_join=['info_cache'],
self.context, our_host, columns_to_join=['info_cache'],
use_slave=False
).AndReturn(startup_instances)
if defer_iptables_apply:
self.compute.driver.filter_defer_apply_on()
self.compute._destroy_evacuated_instances(fake_context)
self.compute._init_instance(fake_context,
self.compute._destroy_evacuated_instances(self.context)
self.compute._init_instance(self.context,
mox.IsA(objects.Instance))
self.compute._init_instance(fake_context,
self.compute._init_instance(self.context,
mox.IsA(objects.Instance))
self.compute._init_instance(fake_context,
self.compute._init_instance(self.context,
mox.IsA(objects.Instance))
if defer_iptables_apply:
self.compute.driver.filter_defer_apply_off()
@@ -460,7 +460,6 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
def test_init_host_with_deleted_migration(self):
our_host = self.compute.host
not_our_host = 'not-' + our_host
fake_context = 'fake-context'
deleted_instance = fake_instance.fake_instance_obj(
self.context, host=not_our_host, uuid='fake-uuid')
@@ -475,8 +474,8 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.mox.StubOutWithMock(self.compute, '_get_instance_nw_info')
self.compute.driver.init_host(host=our_host)
context.get_admin_context().AndReturn(fake_context)
db.instance_get_all_by_host(fake_context, our_host,
context.get_admin_context().AndReturn(self.context)
db.instance_get_all_by_host(self.context, our_host,
columns_to_join=['info_cache'],
use_slave=False
).AndReturn([])
@@ -484,13 +483,13 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
# simulate failed instance
self.compute._get_instances_on_driver(
fake_context, {'deleted': False}).AndReturn([deleted_instance])
self.compute._get_instance_nw_info(fake_context, deleted_instance
self.context, {'deleted': False}).AndReturn([deleted_instance])
self.compute._get_instance_nw_info(self.context, deleted_instance
).AndRaise(exception.InstanceNotFound(
instance_id=deleted_instance['uuid']))
# ensure driver.destroy is called so that driver may
# clean up any dangling files
self.compute.driver.destroy(fake_context, deleted_instance,
self.compute.driver.destroy(self.context, deleted_instance,
mox.IgnoreArg(), mox.IgnoreArg(), mox.IgnoreArg())
self.mox.ReplayAll()
@@ -1034,8 +1033,6 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.assertIsNone(init_return)
def test_get_instances_on_driver(self):
fake_context = context.get_admin_context()
driver_instances = []
for x in xrange(10):
driver_instances.append(fake_instance.fake_db_instance())
@@ -1047,7 +1044,7 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.compute.driver.list_instance_uuids().AndReturn(
[inst['uuid'] for inst in driver_instances])
db.instance_get_all_by_filters(
fake_context,
self.context,
{'uuid': [inst['uuid'] for
inst in driver_instances]},
'created_at', 'desc', columns_to_join=None,
@@ -1057,17 +1054,16 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.mox.ReplayAll()
result = self.compute._get_instances_on_driver(fake_context)
result = self.compute._get_instances_on_driver(self.context)
self.assertEqual([x['uuid'] for x in driver_instances],
[x['uuid'] for x in result])
@mock.patch('nova.virt.driver.ComputeDriver.list_instance_uuids')
@mock.patch('nova.db.api.instance_get_all_by_filters')
def test_get_instances_on_driver_empty(self, mock_list, mock_db):
fake_context = context.get_admin_context()
mock_list.return_value = []
result = self.compute._get_instances_on_driver(fake_context)
result = self.compute._get_instances_on_driver(self.context)
# instance_get_all_by_filters should not be called
self.assertEqual(0, mock_db.call_count)
self.assertEqual([],
@@ -1078,7 +1074,6 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
# 'list_instance_uuids'
self.compute.host = 'host'
filters = {'host': self.compute.host}
fake_context = context.get_admin_context()
self.flags(instance_name_template='inst-%i')
@@ -1102,14 +1097,14 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
self.compute.driver.list_instances().AndReturn(
[inst['name'] for inst in driver_instances])
db.instance_get_all_by_filters(
fake_context, filters,
self.context, filters,
'created_at', 'desc', columns_to_join=None,
limit=None, marker=None,
use_slave=True).AndReturn(all_instances)
self.mox.ReplayAll()
result = self.compute._get_instances_on_driver(fake_context, filters)
result = self.compute._get_instances_on_driver(self.context, filters)
self.assertEqual([x['uuid'] for x in driver_instances],
[x['uuid'] for x in result])
@@ -2326,6 +2321,70 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
        self.assertFalse(log_mock.called)

    @mock.patch.object(nova.scheduler.client.SchedulerClient,
                       'update_instance_info')
    def test_update_scheduler_instance_info(self, mock_update):
        instance = objects.Instance(uuid='fake')
        self.compute._update_scheduler_instance_info(self.context, instance)
        self.assertEqual(mock_update.call_count, 1)
        args = mock_update.call_args[0]
        self.assertNotEqual(args[0], self.context)
        self.assertIsInstance(args[0], self.context.__class__)
        self.assertEqual(args[1], self.compute.host)
        # Send a single instance; check that the method converts to an
        # InstanceList
        self.assertIsInstance(args[2], objects.InstanceList)
        self.assertEqual(args[2].objects[0], instance)

    @mock.patch.object(nova.scheduler.client.SchedulerClient,
                       'delete_instance_info')
    def test_delete_scheduler_instance_info(self, mock_delete):
        self.compute._delete_scheduler_instance_info(self.context,
                                                     mock.sentinel.inst_uuid)
        self.assertEqual(mock_delete.call_count, 1)
        args = mock_delete.call_args[0]
        self.assertNotEqual(args[0], self.context)
        self.assertIsInstance(args[0], self.context.__class__)
        self.assertEqual(args[1], self.compute.host)
        self.assertEqual(args[2], mock.sentinel.inst_uuid)

    @mock.patch.object(nova.objects.InstanceList, 'get_by_host')
    @mock.patch.object(nova.scheduler.client.SchedulerClient,
                       'sync_instance_info')
    def test_sync_scheduler_instance_info(self, mock_sync, mock_get_by_host):
        inst1 = objects.Instance(uuid='fake1')
        inst2 = objects.Instance(uuid='fake2')
        inst3 = objects.Instance(uuid='fake3')
        mock_get_by_host.return_value = objects.InstanceList(
            objects=[inst1, inst2, inst3])
        self.compute._sync_scheduler_instance_info(self.context)
        self.assertEqual(mock_sync.call_count, 1)
        args = mock_sync.call_args[0]
        exp_uuids = [inst.uuid for inst in [inst1, inst2, inst3]]
        self.assertIsInstance(args[0], self.context.__class__)
        self.assertEqual(args[1], self.compute.host)
        self.assertEqual(args[2], exp_uuids)

    @mock.patch.object(nova.scheduler.client.SchedulerClient,
                       'sync_instance_info')
    @mock.patch.object(nova.scheduler.client.SchedulerClient,
                       'delete_instance_info')
    @mock.patch.object(nova.scheduler.client.SchedulerClient,
                       'update_instance_info')
    def test_scheduler_info_updates_off(self, mock_update, mock_delete,
                                        mock_sync):
        mgr = self.compute
        mgr.send_instance_updates = False
        mgr._update_scheduler_instance_info(self.context,
                                            mock.sentinel.instance)
        mgr._delete_scheduler_instance_info(self.context,
                                            mock.sentinel.instance_uuid)
        mgr._sync_scheduler_instance_info(self.context)
        # None of the calls should have been made
        self.assertFalse(mock_update.called)
        self.assertFalse(mock_delete.called)
        self.assertFalse(mock_sync.called)


class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
    def setUp(self):
@@ -3323,13 +3382,15 @@ class ComputeManagerBuildInstanceTestCase(test.NoDBTestCase):
self.assertIsNotNone(args[1].launched_at)
with contextlib.nested(
mock.patch.object(self.compute,
'_update_scheduler_instance_info'),
mock.patch.object(self.compute.driver, 'spawn'),
mock.patch.object(self.compute,
'_build_networks_for_instance', return_value=[]),
mock.patch.object(self.instance, 'save'),
mock.patch.object(self.compute, '_notify_about_instance_usage',
side_effect=fake_notify)
) as (mock_spawn, mock_networks, mock_save, mock_notify):
) as (mock_upd, mock_spawn, mock_networks, mock_save, mock_notify):
self.compute._build_and_run_instance(self.context, self.instance,
self.image, self.injected_files, self.admin_pass,
self.requested_networks, self.security_groups,