Fix racy mdev init workaround

The original implementation is prone to racing with
the nvidia driver, with the result that mdevs may not
yet be registered by the time the script runs.

Change-Id: I0dd8002a91d4026a71b8176610efac28ea50c33b
Signed-off-by: Edward Hope-Morley <edward.hope-morley@canonical.com>
Author: Edward Hope-Morley
Date: 2025-09-26 22:14:18 +01:00
parent c87c8bf7b8
commit 1b992b26e6
3 changed files with 31 additions and 14 deletions


@@ -1,6 +1,16 @@
 #!/bin/bash -e
 # Ensure all SRIOV devices have been setup
-/usr/lib/nvidia/sriov-manage -e srvio-manage -e ALL
-sleep 20
+/usr/lib/nvidia/sriov-manage -e ALL
+# Ensure mdev devices are registered before continuing
+max=10
+while true; do
+    if ! $(/usr/lib/nvidia/sriov-manage -e $(nvidia-smi -q | grep ^GPU| cut -d ' ' -f2-)| grep -q "already has VFs enabled."); then
+        echo "Waiting for GPU nvidia mdev registration"
+        sleep 1
+        ((max--)) && continue
+    fi
+    break
+done
+
 # Now go through all domains and initialise any used mdevs
 /opt/remediate-nova-mdevs
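The readiness check above keys off the "already has VFs enabled" output of sriov-manage. An alternative signal, shown here only as a sketch, is to wait for the kernel mdev framework to publish supported types in sysfs; the /sys/class/mdev_bus path is the standard mdev layout, and the 60 second timeout is an illustrative value, not part of this change:

#!/usr/bin/env python3
# Sketch only: wait until at least one mediated device type is registered.
import glob
import sys
import time

TIMEOUT = 60  # seconds; illustrative value


def mdev_types_registered():
    # Each mdev-capable device exposes its types under
    # /sys/class/mdev_bus/<pci-address>/mdev_supported_types/<type>/
    return bool(glob.glob('/sys/class/mdev_bus/*/mdev_supported_types/*'))


def main():
    deadline = time.monotonic() + TIMEOUT
    while time.monotonic() < deadline:
        if mdev_types_registered():
            return 0
        print("Waiting for GPU nvidia mdev registration")
        time.sleep(1)
    return 1


if __name__ == '__main__':
    sys.exit(main())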


@@ -45,7 +45,6 @@ class PlacementHelper():
"""
Helper for Placement operations.
"""
DRIVER_TRAIT_MAPPING = {'nvidia-610': 'CUSTOM_VGPU_PLACEMENT'}
def __init__(self):
self.fqdn = socket.getfqdn()
@@ -53,6 +52,10 @@ class PlacementHelper():
         if self.client is None:
             raise PlacementError("failed to get placement client")
 
+    @property
+    def driver_trait_mapping(self):
+        return {mtype: 'CUSTOM_VGPU_PLACEMENT' for mtype in MDEV_TYPES}
+
     @staticmethod
     def _get_sdk_adapter_helper(service_type):
         count = 1
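Replacing the single-entry DRIVER_TRAIT_MAPPING class attribute with a property derived from MDEV_TYPES means every supported mdev type resolves to the custom trait. A minimal sketch of the resulting behaviour, using illustrative type names since MDEV_TYPES itself is defined elsewhere in the script and is not shown in this diff:

# Illustrative only: the real MDEV_TYPES list lives elsewhere in the script.
MDEV_TYPES = ['nvidia-610', 'nvidia-611']

mapping = {mtype: 'CUSTOM_VGPU_PLACEMENT' for mtype in MDEV_TYPES}
assert mapping == {
    'nvidia-610': 'CUSTOM_VGPU_PLACEMENT',
    'nvidia-611': 'CUSTOM_VGPU_PLACEMENT',
}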
@@ -99,7 +102,7 @@ class PlacementHelper():
         if not _traits:
             raise PlacementError("no traits identified from the placement api")
 
-        for trait in self.DRIVER_TRAIT_MAPPING.values():
+        for trait in self.driver_trait_mapping.values():
             if trait not in _traits['traits']:
                 raise PlacementError(f"trait {trait} not found in placement "
                                      "traits")
@@ -170,7 +173,14 @@ class PlacementHelper():
         pci_id_parts = addr.split('_')
         return get_pci_address(*pci_id_parts)
 
-    def update_gpu_traits(self, rpname, rpuuid, dry_run=False):
+    def update_gpu_traits(self, dry_run=False):
+        if not self.local_compute_rps:
+            return
+
+        for rp in self.local_compute_rps:
+            self.update_gpu_trait(rp['name'], rp['uuid'], dry_run)
+
+    def update_gpu_trait(self, rpname, rpuuid, dry_run=False):
         LOG.info("updating gpu traits for resource provider %s", rpuuid)
         traits = self.get_traits_for_rp(rpuuid)
         if traits is None:
@@ -187,12 +197,12 @@ class PlacementHelper():
             return
 
-        if driver not in self.DRIVER_TRAIT_MAPPING:
+        if driver not in self.driver_trait_mapping:
             LOG.error("failed to map driver '%s' to a trait for PCI "
                       "address %s", driver, pci_address)
             return
 
-        expected_traits = [self.DRIVER_TRAIT_MAPPING[driver]]
+        expected_traits = [self.driver_trait_mapping[driver]]
         if expected_traits != traits['traits']:
             if dry_run:
                 LOG.warning("rp %s for %s is mapped to driver %s but "
@@ -334,12 +344,7 @@ def main(dry_run=False):
         LOG.error(exc)
         failed = True
 
-    if not pm.local_compute_rps:
-        return
-
-    for rp in pm.local_compute_rps:
-        pm.update_gpu_traits(rp['name'], rp['uuid'], dry_run)
+    pm.update_gpu_traits(dry_run)
 
     if failed:
         raise PlacementError("failed to update one or more placement traits")
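When expected_traits differs from the provider's current traits, update_gpu_trait ultimately has to replace the traits on that resource provider; the body of that method is not shown in this diff. The following is only a sketch of the underlying placement API call, a generation-guarded PUT, reusing the illustrative client adapter from the sketch above:

# Sketch only (not the script's code): replace the traits on one resource
# provider; placement rejects the PUT if the generation is stale.
def set_rp_traits(client, rp_uuid, generation, traits):
    body = {'traits': traits,  # e.g. ['CUSTOM_VGPU_PLACEMENT']
            'resource_provider_generation': generation}
    resp = client.put(f'/resource_providers/{rp_uuid}/traits', json=body,
                      headers={'OpenStack-API-Version': 'placement 1.6'})
    return resp.json()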


@@ -1,9 +1,11 @@
 [Unit]
 Description=GPU MDev Initialisation Workaround for OpenStack Nova
 Before=nova-compute.service
-After=syslog.target network.target libvirtd.service nvidia-vgpu-mgr.service
+After=syslog.target network.target
+After=nvidia-vgpu-mgr.service nvidia-vgpud.service libvirtd.service
 
 [Service]
 User=root
 Environment="MDEV_INIT_DRY_RUN=False"
 Type=oneshot
 ExecStart=/bin/bash /opt/initialise_nova_mdevs.sh