libvirt: call get_capabilities() with all CPUs online

While we do cache the hosts's capabilities in self._caps in the
libvirt Host object, if we happen to fist call get_capabilities() with
some of our dedicated CPUs offline, libvirt erroneously reports them
as being on socket 0 regardless of their real socket. We would then
cache that topology, thus breaking pretty much all of our NUMA
accounting.

To fix this, this patch makes sure to call get_capabilities()
immediately upon host init, and to power up all our dedicated CPUs
before doing so. That way, we cache their real socket ID.

For testing, because we don't really want to implement a libvirt bug
in our Python libvirt fixture, we make due with a simple unit tests
that asserts that init_host() has powered on the correct CPUs.

Closes-bug: 2077228
Change-Id: I9a2a7614313297f11a55d99fb94916d3583a9504
This commit is contained in:
Artom Lifshitz 2024-08-13 11:29:10 -04:00
parent 7399728e89
commit 79d1f06094
3 changed files with 34 additions and 5 deletions

View File

@ -895,6 +895,16 @@ class LibvirtConnTestCase(test.NoDBTestCase,
)
mock_supports.assert_called_once_with()
@mock.patch.object(hardware, 'get_cpu_dedicated_set',
return_value=set([0, 42, 1337]))
@mock.patch.object(libvirt_driver.LibvirtDriver,
'_register_all_undefined_instance_details')
def test_init_host_topology(self, mock_get_cpu_dedicated_set, _):
driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
with mock.patch.object(driver.cpu_api, 'power_up') as mock_power_up:
driver.init_host('goat')
mock_power_up.assert_called_with(set([0, 42, 1337]))
@mock.patch.object(
libvirt_driver.LibvirtDriver,
'_register_all_undefined_instance_details',

View File

@ -91,7 +91,7 @@ class API(object):
"""
return Core(i)
def _power_up(self, cpus: ty.Set[int]) -> None:
def power_up(self, cpus: ty.Set[int]) -> None:
if not CONF.libvirt.cpu_power_management:
return
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
@ -111,7 +111,7 @@ class API(object):
return
pcpus = instance.numa_topology.cpu_pinning.union(
instance.numa_topology.cpuset_reserved)
self._power_up(pcpus)
self.power_up(pcpus)
def power_up_for_migration(
self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo
@ -121,7 +121,7 @@ class API(object):
pcpus = dst_numa_info.emulator_pins
for pins in dst_numa_info.cpu_pins.values():
pcpus = pcpus.union(pins)
self._power_up(pcpus)
self.power_up(pcpus)
def _power_down(self, cpus: ty.Set[int]) -> None:
if not CONF.libvirt.cpu_power_management:

View File

@ -751,12 +751,31 @@ class LibvirtDriver(driver.ComputeDriver):
{'enabled': enabled, 'reason': reason})
self._set_host_enabled(enabled, reason)
def _init_host_topology(self):
"""To work around a bug in libvirt that reports offline CPUs as always
being on socket 0 regardless of their real socket, power up all
dedicated CPUs (the only ones whose socket we actually care about),
then call get_capabilities() to initialize the topology with the
correct socket values. get_capabilities()'s implementation will reuse
these initial socket value, and avoid clobbering them with 0 for
offline CPUs.
"""
cpus = hardware.get_cpu_dedicated_set()
if cpus:
self.cpu_api.power_up(cpus)
self._host.get_capabilities()
def init_host(self, host):
self._host.initialize()
self._update_host_specific_capabilities()
# NOTE(artom) Do this first to make sure our first call to
# get_capabilities() happens with all dedicated CPUs online and caches
# their correct socket ID. Unused dedicated CPUs will be powered down
# further down in this method.
self._check_cpu_set_configuration()
self._init_host_topology()
self._update_host_specific_capabilities()
self._do_quality_warnings()