libvirt: call get_capabilities() with all CPUs online
While we do cache the hosts's capabilities in self._caps in the libvirt Host object, if we happen to fist call get_capabilities() with some of our dedicated CPUs offline, libvirt erroneously reports them as being on socket 0 regardless of their real socket. We would then cache that topology, thus breaking pretty much all of our NUMA accounting. To fix this, this patch makes sure to call get_capabilities() immediately upon host init, and to power up all our dedicated CPUs before doing so. That way, we cache their real socket ID. For testing, because we don't really want to implement a libvirt bug in our Python libvirt fixture, we make due with a simple unit tests that asserts that init_host() has powered on the correct CPUs. Closes-bug: 2077228 Change-Id: I9a2a7614313297f11a55d99fb94916d3583a9504
This commit is contained in:
parent
7399728e89
commit
79d1f06094
@ -895,6 +895,16 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
||||
)
|
||||
mock_supports.assert_called_once_with()
|
||||
|
||||
@mock.patch.object(hardware, 'get_cpu_dedicated_set',
|
||||
return_value=set([0, 42, 1337]))
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||
'_register_all_undefined_instance_details')
|
||||
def test_init_host_topology(self, mock_get_cpu_dedicated_set, _):
|
||||
driver = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
with mock.patch.object(driver.cpu_api, 'power_up') as mock_power_up:
|
||||
driver.init_host('goat')
|
||||
mock_power_up.assert_called_with(set([0, 42, 1337]))
|
||||
|
||||
@mock.patch.object(
|
||||
libvirt_driver.LibvirtDriver,
|
||||
'_register_all_undefined_instance_details',
|
||||
|
@ -91,7 +91,7 @@ class API(object):
|
||||
"""
|
||||
return Core(i)
|
||||
|
||||
def _power_up(self, cpus: ty.Set[int]) -> None:
|
||||
def power_up(self, cpus: ty.Set[int]) -> None:
|
||||
if not CONF.libvirt.cpu_power_management:
|
||||
return
|
||||
cpu_dedicated_set = hardware.get_cpu_dedicated_set_nozero() or set()
|
||||
@ -111,7 +111,7 @@ class API(object):
|
||||
return
|
||||
pcpus = instance.numa_topology.cpu_pinning.union(
|
||||
instance.numa_topology.cpuset_reserved)
|
||||
self._power_up(pcpus)
|
||||
self.power_up(pcpus)
|
||||
|
||||
def power_up_for_migration(
|
||||
self, dst_numa_info: objects.LibvirtLiveMigrateNUMAInfo
|
||||
@ -121,7 +121,7 @@ class API(object):
|
||||
pcpus = dst_numa_info.emulator_pins
|
||||
for pins in dst_numa_info.cpu_pins.values():
|
||||
pcpus = pcpus.union(pins)
|
||||
self._power_up(pcpus)
|
||||
self.power_up(pcpus)
|
||||
|
||||
def _power_down(self, cpus: ty.Set[int]) -> None:
|
||||
if not CONF.libvirt.cpu_power_management:
|
||||
|
@ -751,12 +751,31 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
{'enabled': enabled, 'reason': reason})
|
||||
self._set_host_enabled(enabled, reason)
|
||||
|
||||
def _init_host_topology(self):
|
||||
"""To work around a bug in libvirt that reports offline CPUs as always
|
||||
being on socket 0 regardless of their real socket, power up all
|
||||
dedicated CPUs (the only ones whose socket we actually care about),
|
||||
then call get_capabilities() to initialize the topology with the
|
||||
correct socket values. get_capabilities()'s implementation will reuse
|
||||
these initial socket value, and avoid clobbering them with 0 for
|
||||
offline CPUs.
|
||||
"""
|
||||
cpus = hardware.get_cpu_dedicated_set()
|
||||
if cpus:
|
||||
self.cpu_api.power_up(cpus)
|
||||
self._host.get_capabilities()
|
||||
|
||||
def init_host(self, host):
|
||||
self._host.initialize()
|
||||
|
||||
self._update_host_specific_capabilities()
|
||||
|
||||
# NOTE(artom) Do this first to make sure our first call to
|
||||
# get_capabilities() happens with all dedicated CPUs online and caches
|
||||
# their correct socket ID. Unused dedicated CPUs will be powered down
|
||||
# further down in this method.
|
||||
self._check_cpu_set_configuration()
|
||||
self._init_host_topology()
|
||||
|
||||
self._update_host_specific_capabilities()
|
||||
|
||||
self._do_quality_warnings()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user