Add accounting for orphans to resource tracker.

Add accounting for orphaned instances to resource tracker.  Orphans
are instances that, for whatever reason, exist on the hypervisor
but are not accounted for in the Nova DB.

Such instances would cause the resource tracker to under-report usage
numbers, resulting in out-of-memory errors during instance builds.

Change-Id: Icc970e34e01ff8c7dfb000889e5ea6e2d0421c77
This commit is contained in:
Brian Elliott 2012-12-10 20:49:06 +00:00 committed by Gerrit Code Review
parent aef9802089
commit 07af4ceea7
7 changed files with 164 additions and 22 deletions

View File

@ -268,6 +268,11 @@ class ResourceTracker(object):
self._update_usage_from_migrations(resources, migrations)
# Detect and account for orphaned instances that may exist on the
# hypervisor, but are not in the DB:
orphans = self._find_orphaned_instances()
self._update_usage_from_orphans(resources, orphans)
self._report_final_resource_view(resources)
self._sync_compute_node(context, resources)
@ -364,8 +369,8 @@ class ResourceTracker(object):
def _update_usage(self, resources, usage, sign=1):
resources['memory_mb_used'] += sign * usage['memory_mb']
resources['local_gb_used'] += sign * usage['root_gb']
resources['local_gb_used'] += sign * usage['ephemeral_gb']
resources['local_gb_used'] += sign * usage.get('root_gb', 0)
resources['local_gb_used'] += sign * usage.get('ephemeral_gb', 0)
# free ram and disk may be negative, depending on policy:
resources['free_ram_mb'] = (resources['memory_mb'] -
@ -501,6 +506,40 @@ class ResourceTracker(object):
for instance in instances:
self._update_usage_from_instance(resources, instance)
def _find_orphaned_instances(self):
"""Given the set of instances and migrations already account for
by resource tracker, sanity check the hypervisor to determine
if there are any "orphaned" instances left hanging around.
Orphans could be consuming memory and should be accounted for in
usage calculations to guard against potential out of memory
errors.
"""
uuids1 = frozenset(self.tracked_instances.keys())
uuids2 = frozenset(self.tracked_migrations.keys())
uuids = uuids1 | uuids2
usage = self.driver.get_per_instance_usage()
vuuids = frozenset(usage.keys())
orphan_uuids = vuuids - uuids
orphans = [usage[uuid] for uuid in orphan_uuids]
return orphans
def _update_usage_from_orphans(self, resources, orphans):
    """Include orphaned instances in usage.

    :param resources: dict of resource totals, updated in place
    :param orphans: list of orphan dicts (with 'uuid' and
                    'memory_mb' keys) as returned by
                    _find_orphaned_instances()
    """
    for orphan in orphans:
        memory_mb = orphan['memory_mb']

        # Fixed: the message previously had an unbalanced "(" and
        # relied on locals() for interpolation, which breaks silently
        # if a local variable is renamed.
        LOG.warn(_("Detected running orphan instance: %(uuid)s "
                   "(consuming %(memory_mb)s MB memory)") %
                 {'uuid': orphan['uuid'], 'memory_mb': memory_mb})

        # Only the memory footprint of an orphan is known, so just
        # record memory usage for it:
        usage = {'memory_mb': memory_mb}
        self._update_usage(resources, usage)
def _verify_resources(self, resources):
resource_keys = ["vcpus", "memory_mb", "local_gb", "cpu_info",
"vcpus_used", "memory_mb_used", "local_gb_used"]

View File

@ -193,17 +193,17 @@ class BaseTestCase(test.TestCase):
# only used in the subsequent notification:
return (instance, instance)
def _tracker(self, host=None, unsupported=False):
def _driver(self):
return FakeVirtDriver()
def _tracker(self, host=None):
if host is None:
host = self.host
node = "fakenode"
if unsupported:
driver = UnsupportedVirtDriver()
else:
driver = FakeVirtDriver()
driver = self._driver()
tracker = resource_tracker.ResourceTracker(host, driver, node)
return tracker
@ -215,10 +215,13 @@ class UnsupportedDriverTestCase(BaseTestCase):
"""
def setUp(self):
super(UnsupportedDriverTestCase, self).setUp()
self.tracker = self._tracker(unsupported=True)
self.tracker = self._tracker()
# seed tracker with data:
self.tracker.update_available_resource(self.context)
def _driver(self):
return UnsupportedVirtDriver()
def test_disabled(self):
# disabled = no compute node stats
self.assertTrue(self.tracker.disabled)
@ -248,7 +251,7 @@ class UnsupportedDriverTestCase(BaseTestCase):
root_gb=10)
self.tracker.update_usage(self.context, instance)
def testDisabledResizeClaim(self):
def test_disabled_resize_claim(self):
instance = self._fake_instance()
instance_type = self._fake_instance_type_create()
claim = self.tracker.resize_claim(self.context, instance,
@ -258,7 +261,7 @@ class UnsupportedDriverTestCase(BaseTestCase):
self.assertEqual(instance_type['id'],
claim.migration['new_instance_type_id'])
def testDisabledResizeContextClaim(self):
def test_disabled_resize_context_claim(self):
instance = self._fake_instance()
instance_type = self._fake_instance_type_create()
with self.tracker.resize_claim(self.context, instance, instance_type) \
@ -327,18 +330,6 @@ class BaseTrackerTestCase(BaseTestCase):
self.tracker.update_available_resource(self.context)
self.limits = self._limits()
self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb')
self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb')
self._assert(FAKE_VIRT_VCPUS, 'vcpus')
self._assert(0, 'memory_mb_used')
self._assert(0, 'local_gb_used')
self._assert(0, 'vcpus_used')
self._assert(0, 'running_vms')
self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb')
self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb')
self.assertFalse(self.tracker.disabled)
self.assertEqual(0, self.tracker.compute_node['current_workload'])
def _fake_service_get_all_compute_by_host(self, ctx, host):
self.compute = self._create_compute_node()
self.service = self._create_service(host, compute=self.compute)
@ -412,6 +403,19 @@ class TrackerTestCase(BaseTrackerTestCase):
self.assertFalse(self.tracker.disabled)
self.assertTrue(self.updated)
def test_init(self):
    # The tracker was seeded via update_available_resource() during
    # setUp; verify the initial resource view matches the fake virt
    # driver's totals, with nothing in use yet.
    self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb')
    self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb')
    self._assert(FAKE_VIRT_VCPUS, 'vcpus')
    self._assert(0, 'memory_mb_used')
    self._assert(0, 'local_gb_used')
    self._assert(0, 'vcpus_used')
    self._assert(0, 'running_vms')
    # With zero usage, free resources equal the totals:
    self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb')
    self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb')
    self.assertFalse(self.tracker.disabled)
    self.assertEqual(0, self.tracker.compute_node['current_workload'])
class InstanceClaimTestCase(BaseTrackerTestCase):
@ -817,3 +821,31 @@ class ResizeClaimTestCase(BaseTrackerTestCase):
self.assertEqual('fakehost', instance['host'])
self.assertEqual('fakehost', instance['launched_on'])
self.assertEqual('fakenode', instance['node'])
class OrphanTestCase(BaseTrackerTestCase):
    """Verify that instances running on the hypervisor but missing
    from the Nova DB (orphans) are detected and included in the
    resource tracker's usage totals.
    """
    # NOTE: the redundant setUp() override that only called
    # super().setUp() was removed; the base class setUp suffices.

    def _driver(self):
        # Fake virt driver reporting two running instances that the
        # resource tracker has no DB record of:
        class OrphanVirtDriver(FakeVirtDriver):
            def get_per_instance_usage(self):
                return {
                    '1-2-3-4-5': {'memory_mb': 4, 'uuid': '1-2-3-4-5'},
                    '2-3-4-5-6': {'memory_mb': 4, 'uuid': '2-3-4-5-6'},
                }

        return OrphanVirtDriver()

    def test_usage(self):
        # 2 orphans at 4 MB each should be counted as used memory:
        self.assertEqual(8, self.tracker.compute_node['memory_mb_used'])

    def test_find(self):
        # Create one legit instance and verify the 2 orphans remain:
        self._fake_instance()
        orphans = self.tracker._find_orphaned_instances()
        self.assertEqual(2, len(orphans))

View File

@ -1016,6 +1016,33 @@ class XenAPIVMTestCase(stubs.XenAPITestBase):
pass
self.assertTrue(was['called'])
def test_per_instance_usage_running(self):
    # A spawned (running) instance should be reported by the driver
    # with its instance type's memory footprint.
    instance = self._create_instance(spawn=True)
    instance_type = instance_types.get_instance_type(3)
    expected = {instance['uuid']: {'memory_mb': instance_type['memory_mb'],
                                   'uuid': instance['uuid']}}

    actual = self.conn.get_per_instance_usage()
    self.assertEqual(expected, actual)

    # Paused instances still consume resources:
    self.conn.pause(instance)
    actual = self.conn.get_per_instance_usage()
    self.assertEqual(expected, actual)
def test_per_instance_usage_suspended(self):
    # Suspended instances do not consume memory, so they should be
    # absent from the usage report:
    instance = self._create_instance(spawn=True)
    self.conn.suspend(instance)
    actual = self.conn.get_per_instance_usage()
    self.assertEqual({}, actual)
def test_per_instance_usage_halted(self):
    # Powered-off instances do not consume memory, so they should be
    # absent from the usage report:
    instance = self._create_instance(spawn=True)
    self.conn.power_off(instance)
    actual = self.conn.get_per_instance_usage()
    self.assertEqual({}, actual)
def _create_instance(self, instance_id=1, spawn=True):
"""Creates and spawns a test instance."""
instance_values = {

View File

@ -767,6 +767,13 @@ class ComputeDriver(object):
stats = [stats]
return [s['hypervisor_hostname'] for s in stats]
def get_per_instance_usage(self):
    """Retrieve per-instance resource consumption from the hypervisor.

    Base implementation: drivers that cannot report usage return an
    empty mapping.

    :returns: dict of nova uuid => dict of usage info
    """
    return {}
def load_compute_driver(virtapi, compute_driver=None):
"""Load a compute driver module.

View File

@ -607,6 +607,14 @@ class XenAPIDriver(driver.ComputeDriver):
"""resume guest state when a host is booted"""
self._vmops.power_on(instance)
def get_per_instance_usage(self):
    """Report per-instance resource consumption for this host.

    Delegates to the vmops layer, which queries XenAPI directly.

    :returns: dict of nova uuid => dict of usage info
    """
    return self._vmops.get_per_instance_usage()
class XenAPISession(object):
"""The session to invoke XenAPI SDK calls"""

View File

@ -635,6 +635,14 @@ class SessionBase(object):
db_ref['power_state'] = 'Halted'
VM_clean_shutdown = VM_hard_shutdown
def VM_suspend(self, session, vm_ref):
    # Mark the fake VM record as suspended so tests can observe the
    # power-state transition through the stubbed XenAPI session.
    db_ref = _db_content['VM'][vm_ref]
    db_ref['power_state'] = 'Suspended'
def VM_pause(self, session, vm_ref):
    # Mark the fake VM record as paused so tests can observe the
    # power-state transition through the stubbed XenAPI session.
    db_ref = _db_content['VM'][vm_ref]
    db_ref['power_state'] = 'Paused'
def pool_eject(self, session, host_ref):
pass

View File

@ -1639,3 +1639,24 @@ class VMOps(object):
with excutils.save_and_reraise_exception():
recover_method(context, instance, destination_hostname,
block_migration)
def get_per_instance_usage(self):
    """Get usage info about each active instance.

    Walks every VM known to the XenAPI session and reports memory
    usage for those that are consuming resources (running or paused)
    and carry a nova uuid in their other_config.

    :returns: dict of nova uuid => {'memory_mb': ..., 'uuid': ...}
    """
    usage = {}
    for vm_ref, vm_rec in vm_utils.list_vms(self._session):
        # Skip VMs not managed by nova (no uuid recorded):
        nova_uuid = vm_rec['other_config'].get('nova_uuid')
        if nova_uuid is None:
            continue
        # Only running/paused VMs consume memory:
        if vm_rec['power_state'].lower() not in ('running', 'paused'):
            continue
        memory_mb = int(vm_rec['memory_static_max']) / 1024 / 1024
        usage[nova_uuid] = {'memory_mb': memory_mb, 'uuid': nova_uuid}
    return usage