Reconstructed unified diff: line breaks and indentation were restored from a
whitespace-collapsed copy (indentation is inferred from the Python syntax;
blank lines were placed so that every hunk matches its @@ line counts).

Content changes relative to the original patch (added lines only, hunk sizes
unchanged):
  * _find_orphaned_instances docstring: "already account for" is now
    "already accounted for".
  * _update_usage_from_orphans: the warning message now closes the
    parenthesis it opens, and the usage dict reuses the local memory_mb.
  * VMOps.get_per_instance_usage: the bytes-to-MB conversion uses "//" so the
    result is always an integer.

The "index" lines are kept as they were, so their post-image hashes describe
the unmodified patch.

diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 7306b5940a9f..82f8ec461bf6 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -268,6 +268,11 @@ class ResourceTracker(object):
 
         self._update_usage_from_migrations(resources, migrations)
 
+        # Detect and account for orphaned instances that may exist on the
+        # hypervisor, but are not in the DB:
+        orphans = self._find_orphaned_instances()
+        self._update_usage_from_orphans(resources, orphans)
+
         self._report_final_resource_view(resources)
         self._sync_compute_node(context, resources)
 
@@ -364,8 +369,8 @@ class ResourceTracker(object):
 
     def _update_usage(self, resources, usage, sign=1):
         resources['memory_mb_used'] += sign * usage['memory_mb']
-        resources['local_gb_used'] += sign * usage['root_gb']
-        resources['local_gb_used'] += sign * usage['ephemeral_gb']
+        resources['local_gb_used'] += sign * usage.get('root_gb', 0)
+        resources['local_gb_used'] += sign * usage.get('ephemeral_gb', 0)
 
         # free ram and disk may be negative, depending on policy:
         resources['free_ram_mb'] = (resources['memory_mb'] -
@@ -501,6 +506,40 @@ class ResourceTracker(object):
         for instance in instances:
             self._update_usage_from_instance(resources, instance)
 
+    def _find_orphaned_instances(self):
+        """Given the set of instances and migrations already accounted for
+        by resource tracker, sanity check the hypervisor to determine
+        if there are any "orphaned" instances left hanging around.
+
+        Orphans could be consuming memory and should be accounted for in
+        usage calculations to guard against potential out of memory
+        errors.
+        """
+        uuids1 = frozenset(self.tracked_instances.keys())
+        uuids2 = frozenset(self.tracked_migrations.keys())
+        uuids = uuids1 | uuids2
+
+        usage = self.driver.get_per_instance_usage()
+        vuuids = frozenset(usage.keys())
+
+        orphan_uuids = vuuids - uuids
+        orphans = [usage[uuid] for uuid in orphan_uuids]
+
+        return orphans
+
+    def _update_usage_from_orphans(self, resources, orphans):
+        """Include orphaned instances in usage."""
+        for orphan in orphans:
+            uuid = orphan['uuid']
+            memory_mb = orphan['memory_mb']
+
+            LOG.warn(_("Detected running orphan instance: %(uuid)s (consuming "
+                       "%(memory_mb)s MB memory)") % locals())
+
+            # just record memory usage for the orphan
+            usage = {'memory_mb': memory_mb}
+            self._update_usage(resources, usage)
+
     def _verify_resources(self, resources):
         resource_keys = ["vcpus", "memory_mb", "local_gb", "cpu_info",
                          "vcpus_used", "memory_mb_used", "local_gb_used"]
diff --git a/nova/tests/compute/test_resource_tracker.py b/nova/tests/compute/test_resource_tracker.py
index 9bad14275a9b..9cc2355791af 100644
--- a/nova/tests/compute/test_resource_tracker.py
+++ b/nova/tests/compute/test_resource_tracker.py
@@ -193,17 +193,17 @@ class BaseTestCase(test.TestCase):
         # only used in the subsequent notification:
         return (instance, instance)
 
-    def _tracker(self, host=None, unsupported=False):
+    def _driver(self):
+        return FakeVirtDriver()
+
+    def _tracker(self, host=None):
 
         if host is None:
             host = self.host
 
         node = "fakenode"
 
-        if unsupported:
-            driver = UnsupportedVirtDriver()
-        else:
-            driver = FakeVirtDriver()
+        driver = self._driver()
 
         tracker = resource_tracker.ResourceTracker(host, driver, node)
         return tracker
@@ -215,10 +215,13 @@ class UnsupportedDriverTestCase(BaseTestCase):
     """
     def setUp(self):
         super(UnsupportedDriverTestCase, self).setUp()
-        self.tracker = self._tracker(unsupported=True)
+        self.tracker = self._tracker()
         # seed tracker with data:
         self.tracker.update_available_resource(self.context)
 
+    def _driver(self):
+        return UnsupportedVirtDriver()
+
     def test_disabled(self):
         # disabled = no compute node stats
         self.assertTrue(self.tracker.disabled)
@@ -248,7 +251,7 @@ class UnsupportedDriverTestCase(BaseTestCase):
                 root_gb=10)
         self.tracker.update_usage(self.context, instance)
 
-    def testDisabledResizeClaim(self):
+    def test_disabled_resize_claim(self):
         instance = self._fake_instance()
         instance_type = self._fake_instance_type_create()
         claim = self.tracker.resize_claim(self.context, instance,
@@ -258,7 +261,7 @@ class UnsupportedDriverTestCase(BaseTestCase):
         self.assertEqual(instance_type['id'],
                 claim.migration['new_instance_type_id'])
 
-    def testDisabledResizeContextClaim(self):
+    def test_disabled_resize_context_claim(self):
         instance = self._fake_instance()
         instance_type = self._fake_instance_type_create()
         with self.tracker.resize_claim(self.context, instance, instance_type) \
@@ -327,18 +330,6 @@ class BaseTrackerTestCase(BaseTestCase):
         self.tracker.update_available_resource(self.context)
         self.limits = self._limits()
 
-        self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb')
-        self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb')
-        self._assert(FAKE_VIRT_VCPUS, 'vcpus')
-        self._assert(0, 'memory_mb_used')
-        self._assert(0, 'local_gb_used')
-        self._assert(0, 'vcpus_used')
-        self._assert(0, 'running_vms')
-        self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb')
-        self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb')
-        self.assertFalse(self.tracker.disabled)
-        self.assertEqual(0, self.tracker.compute_node['current_workload'])
-
     def _fake_service_get_all_compute_by_host(self, ctx, host):
         self.compute = self._create_compute_node()
         self.service = self._create_service(host, compute=self.compute)
@@ -412,6 +403,19 @@ class TrackerTestCase(BaseTrackerTestCase):
         self.assertFalse(self.tracker.disabled)
         self.assertTrue(self.updated)
 
+    def test_init(self):
+        self._assert(FAKE_VIRT_MEMORY_MB, 'memory_mb')
+        self._assert(FAKE_VIRT_LOCAL_GB, 'local_gb')
+        self._assert(FAKE_VIRT_VCPUS, 'vcpus')
+        self._assert(0, 'memory_mb_used')
+        self._assert(0, 'local_gb_used')
+        self._assert(0, 'vcpus_used')
+        self._assert(0, 'running_vms')
+        self._assert(FAKE_VIRT_MEMORY_MB, 'free_ram_mb')
+        self._assert(FAKE_VIRT_LOCAL_GB, 'free_disk_gb')
+        self.assertFalse(self.tracker.disabled)
+        self.assertEqual(0, self.tracker.compute_node['current_workload'])
+
 
 class InstanceClaimTestCase(BaseTrackerTestCase):
 
@@ -817,3 +821,31 @@ class ResizeClaimTestCase(BaseTrackerTestCase):
         self.assertEqual('fakehost', instance['host'])
         self.assertEqual('fakehost', instance['launched_on'])
         self.assertEqual('fakenode', instance['node'])
+
+
+class OrphanTestCase(BaseTrackerTestCase):
+
+    def setUp(self):
+        super(OrphanTestCase, self).setUp()
+
+    def _driver(self):
+        class OrphanVirtDriver(FakeVirtDriver):
+            def get_per_instance_usage(self):
+                return {
+                    '1-2-3-4-5': {'memory_mb': 4, 'uuid': '1-2-3-4-5'},
+                    '2-3-4-5-6': {'memory_mb': 4, 'uuid': '2-3-4-5-6'},
+
+                }
+
+        return OrphanVirtDriver()
+
+    def test_usage(self):
+        # 2 instances, 4 mb each
+        self.assertEqual(8, self.tracker.compute_node['memory_mb_used'])
+
+    def test_find(self):
+        # create one legit instance and verify the 2 orphans remain
+        self._fake_instance()
+        orphans = self.tracker._find_orphaned_instances()
+
+        self.assertEqual(2, len(orphans))
diff --git a/nova/tests/test_xenapi.py b/nova/tests/test_xenapi.py
index c49664aa8dce..f2799b8f368c 100644
--- a/nova/tests/test_xenapi.py
+++ b/nova/tests/test_xenapi.py
@@ -1016,6 +1016,33 @@ class XenAPIVMTestCase(stubs.XenAPITestBase):
             pass
         self.assertTrue(was['called'])
 
+    def test_per_instance_usage_running(self):
+        instance = self._create_instance(spawn=True)
+        instance_type = instance_types.get_instance_type(3)
+
+        expected = {instance['uuid']: {'memory_mb': instance_type['memory_mb'],
+                                       'uuid': instance['uuid']}}
+        actual = self.conn.get_per_instance_usage()
+        self.assertEqual(expected, actual)
+
+        # Paused instances still consume resources:
+        self.conn.pause(instance)
+        actual = self.conn.get_per_instance_usage()
+        self.assertEqual(expected, actual)
+
+    def test_per_instance_usage_suspended(self):
+        # Suspended instances do not consume memory:
+        instance = self._create_instance(spawn=True)
+        self.conn.suspend(instance)
+        actual = self.conn.get_per_instance_usage()
+        self.assertEqual({}, actual)
+
+    def test_per_instance_usage_halted(self):
+        instance = self._create_instance(spawn=True)
+        self.conn.power_off(instance)
+        actual = self.conn.get_per_instance_usage()
+        self.assertEqual({}, actual)
+
     def _create_instance(self, instance_id=1, spawn=True):
         """Creates and spawns a test instance."""
         instance_values = {
diff --git a/nova/virt/driver.py b/nova/virt/driver.py
index 991a0f6cefc0..cb72aa2dc2a5 100644
--- a/nova/virt/driver.py
+++ b/nova/virt/driver.py
@@ -767,6 +767,13 @@ class ComputeDriver(object):
             stats = [stats]
         return [s['hypervisor_hostname'] for s in stats]
 
+    def get_per_instance_usage(self):
+        """Get information about instance resource usage.
+
+        :returns: dict of nova uuid => dict of usage info
+        """
+        return {}
+
 
 def load_compute_driver(virtapi, compute_driver=None):
     """Load a compute driver module.
diff --git a/nova/virt/xenapi/driver.py b/nova/virt/xenapi/driver.py
index 1649ffb479f3..8e9e74d027ce 100644
--- a/nova/virt/xenapi/driver.py
+++ b/nova/virt/xenapi/driver.py
@@ -607,6 +607,14 @@ class XenAPIDriver(driver.ComputeDriver):
         """resume guest state when a host is booted"""
         self._vmops.power_on(instance)
 
+    def get_per_instance_usage(self):
+        """Get information about instance resource usage.
+
+        :returns: dict of nova uuid => dict of usage
+        info
+        """
+        return self._vmops.get_per_instance_usage()
+
 
 class XenAPISession(object):
     """The session to invoke XenAPI SDK calls"""
diff --git a/nova/virt/xenapi/fake.py b/nova/virt/xenapi/fake.py
index db4f5d03e28f..9af8a9f411d4 100644
--- a/nova/virt/xenapi/fake.py
+++ b/nova/virt/xenapi/fake.py
@@ -635,6 +635,14 @@ class SessionBase(object):
         db_ref['power_state'] = 'Halted'
     VM_clean_shutdown = VM_hard_shutdown
 
+    def VM_suspend(self, session, vm_ref):
+        db_ref = _db_content['VM'][vm_ref]
+        db_ref['power_state'] = 'Suspended'
+
+    def VM_pause(self, session, vm_ref):
+        db_ref = _db_content['VM'][vm_ref]
+        db_ref['power_state'] = 'Paused'
+
     def pool_eject(self, session, host_ref):
         pass
 
diff --git a/nova/virt/xenapi/vmops.py b/nova/virt/xenapi/vmops.py
index ee466b998cc2..8d4687fe8165 100644
--- a/nova/virt/xenapi/vmops.py
+++ b/nova/virt/xenapi/vmops.py
@@ -1639,3 +1639,24 @@ class VMOps(object):
             with excutils.save_and_reraise_exception():
                 recover_method(context, instance, destination_hostname,
                                block_migration)
+
+    def get_per_instance_usage(self):
+        """Get usage info about each active instance."""
+        usage = {}
+
+        def _is_active(vm_rec):
+            power_state = vm_rec['power_state'].lower()
+            return power_state in ['running', 'paused']
+
+        def _get_uuid(vm_rec):
+            other_config = vm_rec['other_config']
+            return other_config.get('nova_uuid', None)
+
+        for vm_ref, vm_rec in vm_utils.list_vms(self._session):
+            uuid = _get_uuid(vm_rec)
+
+            if _is_active(vm_rec) and uuid is not None:
+                memory_mb = int(vm_rec['memory_static_max']) // 1024 // 1024
+                usage[uuid] = {'memory_mb': memory_mb, 'uuid': uuid}
+
+        return usage