Fix racy mdev init workaround
The original implementation is prone to racing with the NVIDIA driver, so the mdevs may not yet be registered by the time the script runs.

Change-Id: I0dd8002a91d4026a71b8176610efac28ea50c33b
Signed-off-by: Edward Hope-Morley <edward.hope-morley@canonical.com>
This commit is contained in:
@@ -1,6 +1,16 @@
|
|||||||
#!/bin/bash -e
|
#!/bin/bash -e
|
||||||
# Ensure all SRIOV devices have been setup
|
# Ensure all SRIOV devices have been setup
|
||||||
/usr/lib/nvidia/sriov-manage -e srvio-manage -e ALL
|
sleep 20
|
||||||
|
/usr/lib/nvidia/sriov-manage -e ALL
|
||||||
|
# Ensure mdev devices are registered before continuing
|
||||||
|
max=10
|
||||||
|
while true; do
|
||||||
|
if ! $(/usr/lib/nvidia/sriov-manage -e $(nvidia-smi -q | grep ^GPU| cut -d ' ' -f2-)| grep -q "already has VFs enabled."); then
|
||||||
|
echo "Waiting for GPU nvidia mdev registration"
|
||||||
|
sleep 1
|
||||||
|
((max--)) && continue
|
||||||
|
fi
|
||||||
|
break
|
||||||
|
done
|
||||||
# Now go through all domains and initialise any used mdevs
|
# Now go through all domains and initialise any used mdevs
|
||||||
/opt/remediate-nova-mdevs
|
/opt/remediate-nova-mdevs
|
||||||
|
|
||||||
|
|||||||
@@ -45,7 +45,6 @@ class PlacementHelper():
|
|||||||
"""
|
"""
|
||||||
Helper for Placement operations.
|
Helper for Placement operations.
|
||||||
"""
|
"""
|
||||||
DRIVER_TRAIT_MAPPING = {'nvidia-610': 'CUSTOM_VGPU_PLACEMENT'}
|
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.fqdn = socket.getfqdn()
|
self.fqdn = socket.getfqdn()
|
||||||
@@ -53,6 +52,10 @@ class PlacementHelper():
|
|||||||
if self.client is None:
|
if self.client is None:
|
||||||
raise PlacementError("failed to get placement client")
|
raise PlacementError("failed to get placement client")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def driver_trait_mapping(self):
|
||||||
|
return {mtype: 'CUSTOM_VGPU_PLACEMENT' for mtype in MDEV_TYPES}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _get_sdk_adapter_helper(service_type):
|
def _get_sdk_adapter_helper(service_type):
|
||||||
count = 1
|
count = 1
|
||||||
@@ -99,7 +102,7 @@ class PlacementHelper():
|
|||||||
if not _traits:
|
if not _traits:
|
||||||
raise PlacementError("no traits identified from the placement api")
|
raise PlacementError("no traits identified from the placement api")
|
||||||
|
|
||||||
for trait in self.DRIVER_TRAIT_MAPPING.values():
|
for trait in self.driver_trait_mapping.values():
|
||||||
if trait not in _traits['traits']:
|
if trait not in _traits['traits']:
|
||||||
raise PlacementError(f"trait {trait} not found in placement "
|
raise PlacementError(f"trait {trait} not found in placement "
|
||||||
"traits")
|
"traits")
|
||||||
@@ -170,7 +173,14 @@ class PlacementHelper():
|
|||||||
pci_id_parts = addr.split('_')
|
pci_id_parts = addr.split('_')
|
||||||
return get_pci_address(*pci_id_parts)
|
return get_pci_address(*pci_id_parts)
|
||||||
|
|
||||||
def update_gpu_traits(self, rpname, rpuuid, dry_run=False):
|
def update_gpu_traits(self, dry_run=False):
|
||||||
|
if not self.local_compute_rps:
|
||||||
|
return
|
||||||
|
|
||||||
|
for rp in self.local_compute_rps:
|
||||||
|
self.update_gpu_trait(rp['name'], rp['uuid'], dry_run)
|
||||||
|
|
||||||
|
def update_gpu_trait(self, rpname, rpuuid, dry_run=False):
|
||||||
LOG.info("updating gpu traits for resource provider %s", rpuuid)
|
LOG.info("updating gpu traits for resource provider %s", rpuuid)
|
||||||
traits = self.get_traits_for_rp(rpuuid)
|
traits = self.get_traits_for_rp(rpuuid)
|
||||||
if traits is None:
|
if traits is None:
|
||||||
@@ -187,12 +197,12 @@ class PlacementHelper():
|
|||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if driver not in self.DRIVER_TRAIT_MAPPING:
|
if driver not in self.driver_trait_mapping:
|
||||||
LOG.error("failed to map driver '%s' to a trait for PCI "
|
LOG.error("failed to map driver '%s' to a trait for PCI "
|
||||||
"address %s", driver, pci_address)
|
"address %s", driver, pci_address)
|
||||||
return
|
return
|
||||||
|
|
||||||
expected_traits = [self.DRIVER_TRAIT_MAPPING[driver]]
|
expected_traits = [self.driver_trait_mapping[driver]]
|
||||||
if expected_traits != traits['traits']:
|
if expected_traits != traits['traits']:
|
||||||
if dry_run:
|
if dry_run:
|
||||||
LOG.warning("rp %s for %s is mapped to driver %s but "
|
LOG.warning("rp %s for %s is mapped to driver %s but "
|
||||||
@@ -334,12 +344,7 @@ def main(dry_run=False):
|
|||||||
LOG.error(exc)
|
LOG.error(exc)
|
||||||
failed = True
|
failed = True
|
||||||
|
|
||||||
if not pm.local_compute_rps:
|
pm.update_gpu_traits(dry_run)
|
||||||
return
|
|
||||||
|
|
||||||
for rp in pm.local_compute_rps:
|
|
||||||
pm.update_gpu_traits(rp['name'], rp['uuid'], dry_run)
|
|
||||||
|
|
||||||
if failed:
|
if failed:
|
||||||
raise PlacementError("failed to update one or more placement traits")
|
raise PlacementError("failed to update one or more placement traits")
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
[Unit]
|
[Unit]
|
||||||
Description=GPU MDev Initialisation Workaround for OpenStack Nova
|
Description=GPU MDev Initialisation Workaround for OpenStack Nova
|
||||||
Before=nova-compute.service
|
Before=nova-compute.service
|
||||||
After=syslog.target network.target libvirtd.service nvidia-vgpu-mgr.service
|
After=syslog.target network.target
|
||||||
|
After=nvidia-vgpu-mgr.service nvidia-vgpud.service libvirtd.service
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
|
User=root
|
||||||
Environment="MDEV_INIT_DRY_RUN=False"
|
Environment="MDEV_INIT_DRY_RUN=False"
|
||||||
Type=oneshot
|
Type=oneshot
|
||||||
ExecStart=/bin/bash /opt/initialise_nova_mdevs.sh
|
ExecStart=/bin/bash /opt/initialise_nova_mdevs.sh
|
||||||
|
|||||||
Reference in New Issue
Block a user