Update sriov interfaces to discover pci_addresses prior to host-unlock

In order to generate up-to-date sriov helm overrides
prior to host-unlock:
    trigger a ports inventory update when sriov is configured
    prevent host-unlock when the configured number of sriov vfs does not
        match the inventoried vf pci addresses
    allow the host-unlock force option to override this check

Tests Performed:
    Containers verify AIO lab with sriov interfaces
    Update sriov interface and verify pci addresses updated
    Update sriov interface and verify host-unlock blocked
        until pci addresses are updated
    Sanity Compute lab with sriov interfaces

Change-Id: Iba243a6389c2d7b6cd0b1b016c257534725e30eb
Story: 2003909
Task: 29731
Signed-off-by: John Kung <john.kung@windriver.com>
This commit is contained in:
John Kung 2019-02-27 15:30:01 -05:00
parent 281e1f1109
commit f8fc051a9b
9 changed files with 240 additions and 116 deletions

View File

@ -555,21 +555,10 @@ class AgentManager(service.PeriodicService):
fcntl.flock(lockfd, fcntl.LOCK_UN)
os.close(lockfd)
def ihost_inv_get_and_report(self, icontext):
"""Collect data for an ihost.
def _get_ports_inventory(self):
"""Collect ports inventory for this host"""
This method allows an ihost data to be collected.
:param: icontext: an admin context
:returns: updated ihost object, including all fields.
"""
rpcapi = conductor_rpcapi.ConductorAPI(
topic=conductor_rpcapi.MANAGER_TOPIC)
ihost = None
# find list of network related inics for this ihost
# find list of network related inics for this host
inics = self._ipci_operator.inics_get()
# create an array of ports for each net entry of the NIC device
@ -591,24 +580,132 @@ class AgentManager(service.PeriodicService):
# create an array of pci_devs for each net entry of the device
pci_devs = []
for pci_dev in pci_devices:
pci_dev_array = self._ipci_operator.pci_get_device_attrs(pci_dev.pciaddr)
pci_dev_array = self._ipci_operator.pci_get_device_attrs(
pci_dev.pciaddr)
for dev in pci_dev_array:
pci_devs.append(pci.PCIDevice(pci_dev, **dev))
# create a list of MAC addresses that will be used to identify the
# inventoried host (one of the MACs should be the management MAC)
ihost_macs = [port.mac for port in iports if port.mac]
host_macs = [port.mac for port in iports if port.mac]
port_list = []
for port in iports:
inic_dict = {'pciaddr': port.ipci.pciaddr,
'pclass': port.ipci.pclass,
'pvendor': port.ipci.pvendor,
'pdevice': port.ipci.pdevice,
'prevision': port.ipci.prevision,
'psvendor': port.ipci.psvendor,
'psdevice': port.ipci.psdevice,
'pname': port.name,
'numa_node': port.numa_node,
'sriov_totalvfs': port.sriov_totalvfs,
'sriov_numvfs': port.sriov_numvfs,
'sriov_vfs_pci_address': port.sriov_vfs_pci_address,
'driver': port.driver,
'mac': port.mac,
'mtu': port.mtu,
'speed': port.speed,
'link_mode': port.link_mode,
'dev_id': port.dev_id,
'dpdksupport': port.dpdksupport}
LOG.debug('Sysinv Agent inic {}'.format(inic_dict))
port_list.append(inic_dict)
pci_device_list = []
for dev in pci_devs:
pci_dev_dict = {'name': dev.name,
'pciaddr': dev.pci.pciaddr,
'pclass_id': dev.pclass_id,
'pvendor_id': dev.pvendor_id,
'pdevice_id': dev.pdevice_id,
'pclass': dev.pci.pclass,
'pvendor': dev.pci.pvendor,
'pdevice': dev.pci.pdevice,
'prevision': dev.pci.prevision,
'psvendor': dev.pci.psvendor,
'psdevice': dev.pci.psdevice,
'numa_node': dev.numa_node,
'sriov_totalvfs': dev.sriov_totalvfs,
'sriov_numvfs': dev.sriov_numvfs,
'sriov_vfs_pci_address': dev.sriov_vfs_pci_address,
'driver': dev.driver,
'enabled': dev.enabled,
'extra_info': dev.extra_info}
LOG.debug('Sysinv Agent dev {}'.format(pci_dev_dict))
pci_device_list.append(pci_dev_dict)
return port_list, pci_device_list, host_macs
def _retry_on_missing_host_uuid(ex):
    """Retry predicate: True only for LocalHostUUIDNotFound.

    Passed as ``retry_on_exception`` to the ``retrying.retry`` decorator on
    ``_report_port_inventory``.

    :param ex: the exception raised by the decorated call
    :returns: True when ``ex`` is a LocalHostUUIDNotFound, else False
    """
    should_retry = isinstance(ex, exception.LocalHostUUIDNotFound)
    # NOTE: this message is logged for every exception handed to the
    # predicate, including ones for which False is returned.
    LOG.info('Caught missing host_uuid exception. Retrying... '
             'Exception: {}'.format(ex))
    return should_retry
@retrying.retry(wait_fixed=15 * 1000, stop_max_delay=300 * 1000,
                retry_on_exception=_retry_on_missing_host_uuid)
def _report_port_inventory(self, context, rpcapi=None,
                           port_list=None, pci_device_list=None):
    """Send this host's port and PCI device inventory to the conductor.

    :param context: request context forwarded to the conductor RPC calls
    :param rpcapi: conductor RPC API object; when None a ConductorAPI on
                   MANAGER_TOPIC is created
    :param port_list: list of port dicts to report
    :param pci_device_list: list of PCI device dicts to report
    :raises LocalHostUUIDNotFound: when ``self._ihost_uuid`` is not set;
        the decorator retries on this exception only (see
        ``_retry_on_missing_host_uuid``)

    If either list is None, both are re-collected with
    ``_get_ports_inventory()``.
    """
    host_uuid = self._ihost_uuid
    if not host_uuid:
        raise exception.LocalHostUUIDNotFound()

    if rpcapi is None:
        rpcapi = conductor_rpcapi.ConductorAPI(
            topic=conductor_rpcapi.MANAGER_TOPIC)

    if port_list is None or pci_device_list is None:
        # the MAC list returned alongside the inventory is not needed here
        port_list, pci_device_list, _unused_macs = \
            self._get_ports_inventory()

    # Ports first, then PCI devices; a failure in the first update does
    # not prevent the second from being attempted.
    try:
        rpcapi.iport_update_by_ihost(context, host_uuid, port_list)
    except RemoteError as e:
        LOG.error("iport_update_by_ihost RemoteError exc_type=%s",
                  e.exc_type)
        self._report_to_conductor = False
    except exception.SysinvException:
        LOG.exception("Sysinv Agent exception updating port.")

    try:
        rpcapi.pci_device_update_by_host(context, host_uuid,
                                         pci_device_list)
    except exception.SysinvException:
        LOG.exception("Sysinv Agent exception updating pci_device.")
def ihost_inv_get_and_report(self, icontext):
"""Collect data for an ihost.
This method allows an ihost data to be collected.
:param: icontext: an admin context
:returns: updated ihost object, including all fields.
"""
ihost = None
rpcapi = conductor_rpcapi.ConductorAPI(
topic=conductor_rpcapi.MANAGER_TOPIC)
port_list, pci_device_list, host_macs = self._get_ports_inventory()
# get my ihost record which should be avail since booted
LOG.debug('Sysinv Agent iports={}, ihost_macs={}'.format(
iports, ihost_macs))
LOG.debug('Sysinv Agent host_macs={} '.format(
host_macs))
slept = 0
while slept < MAXSLEEP:
# wait for controller to come up first may be a DOR
try:
ihost = rpcapi.get_ihost_by_macs(icontext, ihost_macs)
ihost = rpcapi.get_ihost_by_macs(icontext, host_macs)
except Timeout:
LOG.info("get_ihost_by_macs rpc Timeout.")
return # wait for next audit cycle
@ -676,7 +773,6 @@ class AgentManager(service.PeriodicService):
pass
subfunctions = self.subfunctions_get()
try:
rpcapi.subfunctions_update_by_ihost(icontext,
ihost['uuid'],
@ -686,86 +782,8 @@ class AgentManager(service.PeriodicService):
"conductor.")
pass
# post to sysinv db by ihost['uuid']
iport_dict_array = []
for port in iports:
inic_dict = {'pciaddr': port.ipci.pciaddr,
'pclass': port.ipci.pclass,
'pvendor': port.ipci.pvendor,
'pdevice': port.ipci.pdevice,
'prevision': port.ipci.prevision,
'psvendor': port.ipci.psvendor,
'psdevice': port.ipci.psdevice,
'pname': port.name,
'numa_node': port.numa_node,
'sriov_totalvfs': port.sriov_totalvfs,
'sriov_numvfs': port.sriov_numvfs,
'sriov_vfs_pci_address': port.sriov_vfs_pci_address,
'driver': port.driver,
'mac': port.mac,
'mtu': port.mtu,
'speed': port.speed,
'link_mode': port.link_mode,
'dev_id': port.dev_id,
'dpdksupport': port.dpdksupport}
LOG.debug('Sysinv Agent inic {}'.format(inic_dict))
iport_dict_array.append(inic_dict)
try:
# may get duplicate key if already sent on earlier init
rpcapi.iport_update_by_ihost(icontext,
ihost['uuid'],
iport_dict_array)
except RemoteError as e:
LOG.error("iport_update_by_ihost RemoteError exc_type=%s" %
e.exc_type)
self._report_to_conductor = False
except exception.SysinvException:
LOG.exception("Sysinv Agent exception updating iport conductor.")
pass
try:
rpcapi.subfunctions_update_by_ihost(icontext,
ihost['uuid'],
subfunctions)
except exception.SysinvException:
LOG.exception("Sysinv Agent exception updating subfunctions "
"conductor.")
pass
# post to sysinv db by ihost['uuid']
pci_device_dict_array = []
for dev in pci_devs:
pci_dev_dict = {'name': dev.name,
'pciaddr': dev.pci.pciaddr,
'pclass_id': dev.pclass_id,
'pvendor_id': dev.pvendor_id,
'pdevice_id': dev.pdevice_id,
'pclass': dev.pci.pclass,
'pvendor': dev.pci.pvendor,
'pdevice': dev.pci.pdevice,
'prevision': dev.pci.prevision,
'psvendor': dev.pci.psvendor,
'psdevice': dev.pci.psdevice,
'numa_node': dev.numa_node,
'sriov_totalvfs': dev.sriov_totalvfs,
'sriov_numvfs': dev.sriov_numvfs,
'sriov_vfs_pci_address': dev.sriov_vfs_pci_address,
'driver': dev.driver,
'enabled': dev.enabled,
'extra_info': dev.extra_info}
LOG.debug('Sysinv Agent dev {}'.format(pci_dev_dict))
pci_device_dict_array.append(pci_dev_dict)
try:
# may get duplicate key if already sent on earlier init
rpcapi.pci_device_update_by_host(icontext,
ihost['uuid'],
pci_device_dict_array)
except exception.SysinvException:
LOG.exception("Sysinv Agent exception updating iport conductor.")
pass
self._report_port_inventory(icontext, rpcapi,
port_list, pci_device_list)
# Find list of numa_nodes and cpus for this ihost
inumas, icpus = self._inode_operator.inodes_get_inumas_icpus()
@ -1365,6 +1383,14 @@ class AgentManager(service.PeriodicService):
self._update_config_applied(iconfig_uuid)
self._report_config_applied(context)
def _report_inventory(self, context, config_dict):
    """Dispatch the inventory report requested in ``config_dict``.

    :param context: request context forwarded to the report method
    :param config_dict: runtime-manifest config; the value stored under
        ``puppet.REPORT_INVENTORY_UPDATE`` selects the report to send

    Only ``puppet.REPORT_PCI_SRIOV_CONFIG`` is handled (port / PCI device
    inventory); any other value is logged as an error.
    """
    requested = config_dict.get(puppet.REPORT_INVENTORY_UPDATE)
    LOG.info("report_inventory request=%s" % requested)

    if requested != puppet.REPORT_PCI_SRIOV_CONFIG:
        LOG.error("report_inventory unknown request=%s" % requested)
        return

    self._report_port_inventory(context)
def _retry_on_missing_mgmt_ip(ex):
    """Retry predicate: True only for LocalManagementIpNotFound.

    :param ex: the exception raised by the decorated call
    :returns: True when ``ex`` is a LocalManagementIpNotFound, else False
    """
    is_missing_mgmt_ip = isinstance(ex, exception.LocalManagementIpNotFound)
    # logged for every exception seen, whether or not a retry follows
    LOG.info('Caught exception. Retrying... Exception: {}'.format(ex))
    return is_missing_mgmt_ip
@ -1466,6 +1492,9 @@ class AgentManager(service.PeriodicService):
status=puppet.REPORT_SUCCESS,
error=None)
if config_dict.get(puppet.REPORT_INVENTORY_UPDATE):
self._report_inventory(context, config_dict)
self._report_config_applied(context)
def _apply_runtime_manifest(self, config_dict, hieradata_path=PUPPET_HIERADATA_PATH):

View File

@ -46,8 +46,6 @@ class AgentAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
if topic is None:
topic = MANAGER_TOPIC
# if host is None: ? JKUNG
super(AgentAPI, self).__init__(
topic=topic,
serializer=objects_base.SysinvObjectSerializer(),

View File

@ -3172,6 +3172,40 @@ class HostController(rest.RestController):
'count': count})
raise wsme.exc.ClientSideError(msg)
@staticmethod
def _semantic_check_sriov_interface(host, interface, force_unlock=False):
    """Perform semantic checks on an SRIOV interface.

    Blocks host-unlock until the inventoried VF PCI addresses of at least
    one port of the interface match the configured number of VFs.

    :param host: host record; ``host['id']`` and ``host['uuid']`` are read
    :param interface: interface object; ``ifclass``, ``sriov_numvfs`` and
        ``id`` are read
    :param force_unlock: when True the check is skipped entirely
    :raises wsme.exc.ClientSideError: when no port of the interface has a
        ``sriov_vfs_pci_address`` list whose length equals the configured
        ``sriov_numvfs``; an sriov config update is requested from the
        conductor before raising so that a later retry can succeed
    """
    if (force_unlock or
            interface.ifclass != constants.INTERFACE_CLASS_PCI_SRIOV):
        return

    if_configured_sriov_numvfs = interface.sriov_numvfs
    if not if_configured_sriov_numvfs:
        # nothing configured, so there is nothing to compare against
        return

    ports = pecan.request.dbapi.port_get_by_host_interface(
        host['id'], interface.id)
    for p in ports:
        # sriov_vfs_pci_address is a comma-separated string of addresses
        if (p.sriov_vfs_pci_address and
                if_configured_sriov_numvfs ==
                len(p.sriov_vfs_pci_address.split(','))):
            LOG.info("check sriov_numvfs=%s sriov_vfs_pci_address=%s",
                     if_configured_sriov_numvfs, p.sriov_vfs_pci_address)
            break
    else:
        # for/else: reached only when no port matched (or there are no
        # ports).  Interpolate AFTER the translation lookup so that the
        # untranslated template, not the formatted text, is the msgid.
        msg = _("Expecting number of interface sriov_numvfs=%s. "
                "Please wait a few minutes for inventory update and "
                "retry host-unlock.") % if_configured_sriov_numvfs
        LOG.info(msg)
        # trigger a fresh inventory report so the retry can pass
        pecan.request.rpcapi.update_sriov_config(
            pecan.request.context,
            host['uuid'])
        raise wsme.exc.ClientSideError(msg)
def _semantic_check_unlock_upgrade(self, ihost, force_unlock=False):
"""
Perform semantic checks related to upgrades prior to unlocking host.
@ -3288,7 +3322,8 @@ class HostController(rest.RestController):
"is not supported by current vswitch" % p.name)
raise wsme.exc.ClientSideError(msg)
def _semantic_check_data_interfaces(self, ihost):
def _semantic_check_data_interfaces(
self, ihost, kubernetes_config, force_unlock=False):
"""
Perform semantic checks against data interfaces to ensure validity of
the node configuration prior to unlocking it.
@ -3305,10 +3340,11 @@ class HostController(rest.RestController):
self._semantic_check_interface_addresses(ihost, iif)
if not iif.ifclass:
continue
self._semantic_check_sriov_interface(ihost, iif, force_unlock)
if iif.ifclass == constants.NETWORK_TYPE_DATA:
data_interface_configured = True
if not data_interface_configured:
if not data_interface_configured and not kubernetes_config:
msg = _("Can not unlock a worker host without data interfaces. "
"Add at least one data interface before re-attempting "
"this command.")
@ -4885,7 +4921,7 @@ class HostController(rest.RestController):
self.check_unlock_controller(hostupdate, force_unlock)
if cutils.host_has_function(hostupdate.ihost_patch, constants.WORKER):
self.check_unlock_worker(hostupdate)
self.check_unlock_worker(hostupdate, force_unlock)
elif personality == constants.STORAGE:
self.check_unlock_storage(hostupdate)
@ -5098,7 +5134,7 @@ class HostController(rest.RestController):
if utils.get_https_enabled():
self._semantic_check_tpm_config(hostupdate.ihost_orig)
def check_unlock_worker(self, hostupdate):
def check_unlock_worker(self, hostupdate, force_unlock=False):
"""Check semantics on host-unlock of a worker."""
LOG.info("%s ihost check_unlock_worker" % hostupdate.displayid)
ihost = hostupdate.ihost_orig
@ -5110,8 +5146,13 @@ class HostController(rest.RestController):
# Check whether a restore was properly completed
self._semantic_check_restore_complete(ihost)
# Disable worker unlock checks in a kubernetes config
if not utils.is_kubernetes_config():
# Disable certain worker unlock checks in a kubernetes config
kubernetes_config = utils.is_kubernetes_config()
if kubernetes_config:
self._semantic_check_data_interfaces(ihost,
kubernetes_config,
force_unlock)
else:
# sdn configuration check
self._semantic_check_sdn_attributes(ihost)
@ -5119,7 +5160,9 @@ class HostController(rest.RestController):
self._semantic_check_data_routes(ihost)
# check whether data interfaces have been configured
self._semantic_check_data_interfaces(ihost)
self._semantic_check_data_interfaces(ihost,
kubernetes_config,
force_unlock)
self._semantic_check_data_addresses(ihost)
self._semantic_check_data_vrs_attributes(ihost)

View File

@ -537,7 +537,8 @@ class InterfaceController(rest.RestController):
temp_interface[
'ifclass'] != constants.INTERFACE_CLASS_PCI_SRIOV):
temp_interface['sriov_numvfs'] = None
_check_interface_sriov(temp_interface.as_dict(), ihost)
sriov_update = _check_interface_sriov(temp_interface.as_dict(), ihost)
# Get the ethernet port associated with the interface if network type
# is changed
@ -785,6 +786,10 @@ class InterfaceController(rest.RestController):
# Update shared data interface bindings, if required
_update_shared_interface_neutron_bindings(ihost, new_interface)
if sriov_update:
pecan.request.rpcapi.update_sriov_config(
pecan.request.context,
ihost['uuid'])
return Interface.convert_with_links(new_interface)
except Exception as e:
@ -1021,8 +1026,10 @@ def _check_interface_mtu(interface, ihost, from_profile=False):
def _check_interface_sriov(interface, ihost, from_profile=False):
sriov_update = False
if 'ifclass' in interface.keys() and not interface['ifclass']:
return interface
return sriov_update
if (interface['ifclass'] == constants.INTERFACE_CLASS_PCI_SRIOV and
'sriov_numvfs' not in interface.keys()):
@ -1069,8 +1076,8 @@ def _check_interface_sriov(interface, ihost, from_profile=False):
driver = port_list[0][2]
if driver is None or not driver:
raise wsme.exc.ClientSideError(_("Corresponding port has invalid driver"))
return interface
sriov_update = True
return sriov_update
def _check_host(ihost):

View File

@ -1310,6 +1310,10 @@ class LocalManagementIpNotFound(NotFound):
"host_personality=%(host_personality)s")
class LocalHostUUIDNotFound(NotFound):
    """Raised when the local host's UUID is not available.

    The agent's ``_report_port_inventory`` raises this when its
    ``_ihost_uuid`` is unset and retries on it.
    """

    message = _("Local Host UUID not found")
class InvalidHelmDockerImageSource(Invalid):
message = _("Invalid docker image source: %(source)s. Must be one of %(valid_srcs)s")

View File

@ -5395,6 +5395,30 @@ class ConductorManager(service.PeriodicService):
self._config_apply_runtime_manifest(context, config_uuid, config_dict)
def update_sriov_config(self, context, host_uuid):
    """Update the sriov configuration for a single host.

    Marks the host's config as updated, then applies the
    ``platform::network::runtime`` manifest with a request for the agent
    to report its PCI/SR-IOV inventory.

    :param context: an admin context
    :param host_uuid: the host uuid
    """
    target_personalities = [constants.CONTROLLER, constants.WORKER]

    # update manifest files and notify agent to apply them
    config_uuid = self._config_update_hosts(
        context, target_personalities, host_uuids=[host_uuid])

    runtime_config = {
        "personalities": target_personalities,
        'host_uuids': host_uuid,
        "classes": 'platform::network::runtime',
    }
    # ask for a port / PCI device inventory report along with the apply
    runtime_config[puppet_common.REPORT_INVENTORY_UPDATE] = \
        puppet_common.REPORT_PCI_SRIOV_CONFIG

    self._config_apply_runtime_manifest(
        context, config_uuid, runtime_config, force=True)
def configure_system_https(self, context):
"""Update the system https configuration.

View File

@ -516,6 +516,22 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
" update_route_config to conductor")
return self.call(context, self.make_msg('update_route_config'))
def update_sriov_config(self, context, host_uuid):
    """Synchronously ask the conductor to update a host's sriov config.

    Builds an ``update_sriov_config`` message carrying ``host_uuid`` and
    issues it with ``self.call``.

    :param context: request context.
    :param host_uuid: the host unique uuid
    :returns: whatever ``self.call`` returns for the request
    """
    LOG.debug("ConductorApi.update_sriov_config: sending "
              "update_sriov_config to conductor")
    request = self.make_msg('update_sriov_config', host_uuid=host_uuid)
    return self.call(context, request)
def update_distributed_cloud_role(self, context):
"""Synchronously, have a conductor configure the distributed cloud
role of the system.

View File

@ -554,9 +554,9 @@ def add_port_filter_by_host_interface(query, hostid, interfaceid):
elif utils.is_uuid_like(hostid) and utils.is_uuid_like(interfaceid):
query = query.join(models.ihost,
models.Interface)
models.Interfaces)
return query.filter(models.ihost.uuid == hostid,
models.Interface.uuid == interfaceid)
models.Interfaces.uuid == interfaceid)
LOG.debug("port_filter_by_host_iinterface: "
"No match for supplied filter ids (%s, %s)"

View File

@ -25,6 +25,8 @@ REPORT_STATUS_CFG = 'report_status'
REPORT_SUCCESS = 'report_success'
REPORT_FAILURE = 'report_failure'
# key in a runtime-manifest config_dict; its value selects which inventory
# report the agent sends to the conductor
REPORT_INVENTORY_UPDATE = 'inventory_update'
# name of manifest config operations to report back to sysinv conductor
REPORT_AIO_CINDER_CONFIG = 'aio_cinder_config'
REPORT_DISK_PARTITON_CONFIG = 'manage_disk_partitions'
@ -34,6 +36,7 @@ REPORT_CEPH_BACKEND_CONFIG = 'ceph_config'
REPORT_CEPH_EXTERNAL_BACKEND_CONFIG = 'ceph_external_config'
REPORT_CEPH_SERVICES_CONFIG = 'ceph_services'
REPORT_CEPH_MONITOR_CONFIG = 'ceph_monitor'
# used as a REPORT_INVENTORY_UPDATE value: requests a port / PCI device
# inventory report from the agent
REPORT_PCI_SRIOV_CONFIG = 'pci_sriov_config'
def puppet_apply_manifest(ip_address, personality,