From 79d1f06094599249e6e30ebba2488b8b7a10834e Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Tue, 13 Aug 2024 11:29:10 -0400 Subject: [PATCH] libvirt: call get_capabilities() with all CPUs online While we do cache the hosts's capabilities in self._caps in the libvirt Host object, if we happen to fist call get_capabilities() with some of our dedicated CPUs offline, libvirt erroneously reports them as being on socket 0 regardless of their real socket. We would then cache that topology, thus breaking pretty much all of our NUMA accounting. To fix this, this patch makes sure to call get_capabilities() immediately upon host init, and to power up all our dedicated CPUs before doing so. That way, we cache their real socket ID. For testing, because we don't really want to implement a libvirt bug in our Python libvirt fixture, we make due with a simple unit tests that asserts that init_host() has powered on the correct CPUs. Closes-bug: 2077228 Change-Id: I9a2a7614313297f11a55d99fb94916d3583a9504 --- nova/tests/unit/virt/libvirt/test_driver.py | 10 +++++++++ nova/virt/libvirt/cpu/api.py | 6 +++--- nova/virt/libvirt/driver.py | 23 +++++++++++++++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index 8046a4f7efda..7fc7c51ec6cd 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -895,6 +895,16 @@ class LibvirtConnTestCase(test.NoDBTestCase, ) mock_supports.assert_called_once_with() + @mock.patch.object(hardware, 'get_cpu_dedicated_set', + return_value=set([0, 42, 1337])) + @mock.patch.object(libvirt_driver.LibvirtDriver, + '_register_all_undefined_instance_details') + def test_init_host_topology(self, mock_get_cpu_dedicated_set, _): + driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False) + with mock.patch.object(driver.cpu_api, 'power_up') as mock_power_up: + driver.init_host('goat') + mock_power_up.assert_called_with(set([0, 42, 1337])) + @mock.patch.object( libvirt_driver.LibvirtDriver, '_register_all_undefined_instance_details', diff --git a/nova/virt/libvirt/cpu/api.py b/nova/virt/libvirt/cpu/api.py index 104a3b4d277f..5d7dccd47ed1 100644 --- a/nova/virt/libvirt/cpu/api.py +++ b/nova/virt/libvirt/cpu/api.py @@ -91,7 +91,7 @@ class API(object): """ return Core(i) - def _power_up(self, cpus: ty.Set[int]) -> None: + def power_up(self, cpus: ty.Set[int]) -> None: if not CONF.libvirt.cpu_power_management: return cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set() @@ -111,7 +111,7 @@ class API(object): return pcpus = instance.numa_topology.cpu_pinning.union( instance.numa_topology.cpuset_reserved) - self._power_up(pcpus) + self.power_up(pcpus) def power_up_for_migration( self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo @@ -121,7 +121,7 @@ class API(object): pcpus = dst_numa_info.emulator_pins for pins in dst_numa_info.cpu_pins.values(): pcpus = pcpus.union(pins) - self._power_up(pcpus) + self.power_up(pcpus) def _power_down(self, cpus: ty.Set[int]) -> None: if not CONF.libvirt.cpu_power_management: diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index e90c6284544e..4688d6848da7 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -751,12 +751,31 @@ class LibvirtDriver(driver.ComputeDriver): {'enabled': enabled, 'reason': reason}) self._set_host_enabled(enabled, reason) + def _init_host_topology(self): + """To work around a bug in libvirt that reports offline CPUs as always + being on socket 0 regardless of their real socket, power up all + dedicated CPUs (the only ones whose socket we actually care about), + then call get_capabilities() to initialize the topology with the + correct socket values. get_capabilities()'s implementation will reuse + these initial socket value, and avoid clobbering them with 0 for + offline CPUs. + """ + cpus = hardware.get_cpu_dedicated_set() + if cpus: + self.cpu_api.power_up(cpus) + self._host.get_capabilities() + def init_host(self, host): self._host.initialize() - self._update_host_specific_capabilities() - + # NOTE(artom) Do this first to make sure our first call to + # get_capabilities() happens with all dedicated CPUs online and caches + # their correct socket ID. Unused dedicated CPUs will be powered down + # further down in this method. self._check_cpu_set_configuration() + self._init_host_topology() + + self._update_host_specific_capabilities() self._do_quality_warnings()