Fix racy mdev init workaround
The original implementation is prone to racing with the NVIDIA driver, so mdevs may not yet be registered by the time the script runs.

Change-Id: I0dd8002a91d4026a71b8176610efac28ea50c33b
Signed-off-by: Edward Hope-Morley <edward.hope-morley@canonical.com>
This commit is contained in:
@@ -1,6 +1,16 @@
|
||||
#!/bin/bash -e
|
||||
# Ensure all SRIOV devices have been setup
|
||||
/usr/lib/nvidia/sriov-manage -e srvio-manage -e ALL
|
||||
sleep 20
|
||||
/usr/lib/nvidia/sriov-manage -e ALL
|
||||
# Ensure mdev devices are registered before continuing
|
||||
max=10
|
||||
while true; do
|
||||
if ! $(/usr/lib/nvidia/sriov-manage -e $(nvidia-smi -q | grep ^GPU| cut -d ' ' -f2-)| grep -q "already has VFs enabled."); then
|
||||
echo "Waiting for GPU nvidia mdev registration"
|
||||
sleep 1
|
||||
((max--)) && continue
|
||||
fi
|
||||
break
|
||||
done
|
||||
# Now go through all domains and initialise any used mdevs
|
||||
/opt/remediate-nova-mdevs
|
||||
|
||||
|
||||
@@ -45,7 +45,6 @@ class PlacementHelper():
|
||||
"""
|
||||
Helper for Placement operations.
|
||||
"""
|
||||
DRIVER_TRAIT_MAPPING = {'nvidia-610': 'CUSTOM_VGPU_PLACEMENT'}
|
||||
|
||||
def __init__(self):
|
||||
self.fqdn = socket.getfqdn()
|
||||
@@ -53,6 +52,10 @@ class PlacementHelper():
|
||||
if self.client is None:
|
||||
raise PlacementError("failed to get placement client")
|
||||
|
||||
@property
|
||||
def driver_trait_mapping(self):
|
||||
return {mtype: 'CUSTOM_VGPU_PLACEMENT' for mtype in MDEV_TYPES}
|
||||
|
||||
@staticmethod
|
||||
def _get_sdk_adapter_helper(service_type):
|
||||
count = 1
|
||||
@@ -99,7 +102,7 @@ class PlacementHelper():
|
||||
if not _traits:
|
||||
raise PlacementError("no traits identified from the placement api")
|
||||
|
||||
for trait in self.DRIVER_TRAIT_MAPPING.values():
|
||||
for trait in self.driver_trait_mapping.values():
|
||||
if trait not in _traits['traits']:
|
||||
raise PlacementError(f"trait {trait} not found in placement "
|
||||
"traits")
|
||||
@@ -170,7 +173,14 @@ class PlacementHelper():
|
||||
pci_id_parts = addr.split('_')
|
||||
return get_pci_address(*pci_id_parts)
|
||||
|
||||
def update_gpu_traits(self, rpname, rpuuid, dry_run=False):
|
||||
def update_gpu_traits(self, dry_run=False):
|
||||
if not self.local_compute_rps:
|
||||
return
|
||||
|
||||
for rp in self.local_compute_rps:
|
||||
self.update_gpu_trait(rp['name'], rp['uuid'], dry_run)
|
||||
|
||||
def update_gpu_trait(self, rpname, rpuuid, dry_run=False):
|
||||
LOG.info("updating gpu traits for resource provider %s", rpuuid)
|
||||
traits = self.get_traits_for_rp(rpuuid)
|
||||
if traits is None:
|
||||
@@ -187,12 +197,12 @@ class PlacementHelper():
|
||||
|
||||
return
|
||||
|
||||
if driver not in self.DRIVER_TRAIT_MAPPING:
|
||||
if driver not in self.driver_trait_mapping:
|
||||
LOG.error("failed to map driver '%s' to a trait for PCI "
|
||||
"address %s", driver, pci_address)
|
||||
return
|
||||
|
||||
expected_traits = [self.DRIVER_TRAIT_MAPPING[driver]]
|
||||
expected_traits = [self.driver_trait_mapping[driver]]
|
||||
if expected_traits != traits['traits']:
|
||||
if dry_run:
|
||||
LOG.warning("rp %s for %s is mapped to driver %s but "
|
||||
@@ -334,12 +344,7 @@ def main(dry_run=False):
|
||||
LOG.error(exc)
|
||||
failed = True
|
||||
|
||||
if not pm.local_compute_rps:
|
||||
return
|
||||
|
||||
for rp in pm.local_compute_rps:
|
||||
pm.update_gpu_traits(rp['name'], rp['uuid'], dry_run)
|
||||
|
||||
pm.update_gpu_traits(dry_run)
|
||||
if failed:
|
||||
raise PlacementError("failed to update one or more placement traits")
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
[Unit]
|
||||
Description=GPU MDev Initialisation Workaround for OpenStack Nova
|
||||
Before=nova-compute.service
|
||||
After=syslog.target network.target libvirtd.service nvidia-vgpu-mgr.service
|
||||
After=syslog.target network.target
|
||||
After=nvidia-vgpu-mgr.service nvidia-vgpud.service libvirtd.service
|
||||
|
||||
[Service]
|
||||
User=root
|
||||
Environment="MDEV_INIT_DRY_RUN=False"
|
||||
Type=oneshot
|
||||
ExecStart=/bin/bash /opt/initialise_nova_mdevs.sh
|
||||
|
||||
Reference in New Issue
Block a user