Merge "N3000 FEC device config does not recover on host re-install"

This commit is contained in:
Zuul 2021-06-29 20:19:01 +00:00 committed by Gerrit Code Review
commit 159a7c05e9
10 changed files with 257 additions and 67 deletions

View File

@ -61,6 +61,7 @@ from sysinv.common import constants
from sysinv.common import exception
from sysinv.common import service
from sysinv.common import utils
from sysinv.fpga_agent import constants as fpga_constants
from sysinv.objects import base as objects_base
from sysinv.puppet import common as puppet
from sysinv.conductor import rpcapi as conductor_rpcapi
@ -666,6 +667,9 @@ class AgentManager(service.PeriodicService):
port_list.append(inic_dict)
is_fpga_n3000_reset = \
os.path.exists(fpga_constants.N3000_RESET_FLAG)
for dev in pci_devs:
pci_dev_dict = {'name': dev.name,
'pciaddr': dev.pci.pciaddr,
@ -686,7 +690,8 @@ class AgentManager(service.PeriodicService):
'sriov_vf_pdevice_id': dev.sriov_vf_pdevice_id,
'driver': dev.driver,
'enabled': dev.enabled,
'extra_info': dev.extra_info}
'extra_info': dev.extra_info,
'fpga_n3000_reset': is_fpga_n3000_reset}
LOG.debug('Sysinv Agent dev {}'.format(pci_dev_dict))
pci_device_list.append(pci_dev_dict)

View File

@ -3378,35 +3378,47 @@ class HostController(rest.RestController):
if not if_configured_sriov_numvfs:
return
ports = pecan.request.dbapi.port_get_by_host_interface(
host['id'], interface.id)
for p in ports:
if (p.sriov_vfs_pci_address and
if_configured_sriov_numvfs ==
len(p.sriov_vfs_pci_address.split(','))):
LOG.info("check sriov_numvfs=%s sriov_vfs_pci_address=%s" %
(if_configured_sriov_numvfs, p.sriov_vfs_pci_address))
break
else:
msg = (_("Expecting number of interface sriov_numvfs=%s. "
"Please wait a few minutes for inventory update and "
"retry host-unlock." %
if_configured_sriov_numvfs))
LOG.info(msg)
pecan.request.rpcapi.update_sriov_config(
pecan.request.context,
host['uuid'])
raise wsme.exc.ClientSideError(msg)
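# Enforcing the SR-IOV port data check here could block the unlock that
# applies the very configuration needed to produce valid SR-IOV port data
# for this interface, so decide below whether the check must be skipped.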
check_sriov_port_data = True
if (not utils.is_host_active_controller(host)
and host['config_status'] == constants.CONFIG_STATUS_OUT_OF_DATE
and host['administrative'] == constants.ADMIN_LOCKED):
flip = utils.config_flip_reboot_required(host['config_target'])
if(utils.config_is_reboot_required(host['config_target'])
and flip == host['config_applied']):
check_sriov_port_data = False
for p in ports:
if (interface.sriov_vf_driver == constants.SRIOV_DRIVER_TYPE_NETDEVICE and
p.sriov_vf_driver is None):
msg = (_("Value for SR-IOV VF driver is %s, but "
"corresponding port has an invalid driver" %
constants.SRIOV_DRIVER_TYPE_NETDEVICE))
if check_sriov_port_data:
ports = pecan.request.dbapi.port_get_by_host_interface(
host['id'], interface.id)
for p in ports:
if (p.sriov_vfs_pci_address and
if_configured_sriov_numvfs ==
len(p.sriov_vfs_pci_address.split(','))):
LOG.info("check sriov_numvfs=%s sriov_vfs_pci_address=%s" %
(if_configured_sriov_numvfs, p.sriov_vfs_pci_address))
break
else:
msg = (_("Expecting number of interface sriov_numvfs=%s. "
"Please wait a few minutes for inventory update and "
"retry host-unlock." %
if_configured_sriov_numvfs))
LOG.info(msg)
pecan.request.rpcapi.update_sriov_config(
pecan.request.context,
host['uuid'])
raise wsme.exc.ClientSideError(msg)
for p in ports:
if (interface.sriov_vf_driver == constants.SRIOV_DRIVER_TYPE_NETDEVICE and
p.sriov_vf_driver is None):
msg = (_("Value for SR-IOV VF driver is %s, but "
"corresponding port has an invalid driver" %
constants.SRIOV_DRIVER_TYPE_NETDEVICE))
LOG.info(msg)
raise wsme.exc.ClientSideError(msg)
self._check_sriovdp_interface_datanets(interface)
def _semantic_check_pcipt_interface(self, host, interface, force_unlock=False):

View File

@ -26,6 +26,7 @@ import os
import pecan
import re
import socket
import uuid
import wsme
from fm_api import constants as fm_constants
@ -375,6 +376,27 @@ def update_address_mode(interface, family, mode, pool):
pecan.request.dbapi.address_mode_update(interface_id, updates)
def config_is_reboot_required(config_uuid):
    """Check if the supplied config_uuid has the reboot required flag

    :param config_uuid UUID object or UUID string
    :return True if reboot is required, False otherwise
    :rtype bool
    """
    # uuid.UUID() only parses its string form; passing an existing UUID
    # object straight through would raise, so only parse when needed.
    if not isinstance(config_uuid, uuid.UUID):
        config_uuid = uuid.UUID(config_uuid)
    # Collapse the masked bit to a real boolean instead of handing the raw
    # flag integer back to the caller.
    return bool(config_uuid.int & constants.CONFIG_REBOOT_REQUIRED)
def config_flip_reboot_required(config_uuid):
    """Toggle the reboot required flag of the supplied UUID

    The flag bit is inverted: set if it was clear, cleared if it was set.

    :param config_uuid UUID object or UUID string
    :return The modified UUID as a string
    :rtype str
    """
    # Go through str() so that both UUID objects and strings are accepted.
    parsed = uuid.UUID(str(config_uuid))
    toggled = uuid.UUID(int=parsed.int ^ constants.CONFIG_REBOOT_REQUIRED)
    return str(toggled)
class SystemHelper(object):
@staticmethod
def get_product_build():

View File

@ -1913,3 +1913,6 @@ KUBE_CERT_UPDATE_TRUSTNEWCA = "trust-new-ca"
# kubernetes components secrets on rootCA update procedure
KUBE_ROOTCA_SECRET = 'system-kube-rootca-certificate'
KUBE_ROOTCA_ISSUER = 'system-kube-rootca-issuer'
# configuration UUID reboot required flag (bit)
CONFIG_REBOOT_REQUIRED = (1 << 127)

View File

@ -170,9 +170,6 @@ CONFIG_FAIL_FLAG = os.path.join(tsc.VOLATILE_PATH, ".config_fail")
ACTIVE_CONFIG_REBOOT_REQUIRED = os.path.join(
constants.SYSINV_VOLATILE_PATH, ".reboot_required")
# configuration UUID reboot required flag (bit)
CONFIG_REBOOT_REQUIRED = (1 << 127)
# Types of runtime configuration applies
CONFIG_APPLY_RUNTIME_MANIFEST = 'config_apply_runtime_manifest'
CONFIG_UPDATE_FILE = 'config_update_file'
@ -1542,7 +1539,7 @@ class ConductorManager(service.PeriodicService):
# flag after they have been applied.
config_uuid = self._config_update_hosts(context, personalities,
host_uuids=[host.uuid])
if self._config_is_reboot_required(host.config_target):
if utils.config_is_reboot_required(host.config_target):
config_uuid = self._config_set_reboot_required(config_uuid)
config_dict = {
@ -1556,7 +1553,7 @@ class ConductorManager(service.PeriodicService):
# Regenerate config target uuid, node is going for reboot!
config_uuid = self._config_update_hosts(context, personalities)
if self._config_is_reboot_required(host.config_target):
if utils.config_is_reboot_required(host.config_target):
config_uuid = self._config_set_reboot_required(config_uuid)
self._puppet.update_host_config(host, config_uuid)
@ -2734,6 +2731,12 @@ class ConductorManager(service.PeriodicService):
return
for pci_dev in pci_device_dict_array:
LOG.debug("Processing dev %s" % pci_dev)
is_n3000_dev_not_reset = False
if 'fpga_n3000_reset' in pci_dev.keys():
is_n3000_dev_not_reset = (pci_dev['pdevice_id'] in fpga_constants.N3000_DEVICES
and pci_dev['pvendor_id'] == fpga_constants.N3000_VENDOR
and not pci_dev['fpga_n3000_reset'])
del pci_dev['fpga_n3000_reset']
try:
pci_dev_dict = {'host_id': host['id']}
pci_dev_dict.update(pci_dev)
@ -2743,11 +2746,19 @@ class ConductorManager(service.PeriodicService):
hostid=host['id'])
dev_found = dev
if not dev:
if is_n3000_dev_not_reset:
LOG.info("N3000 reset not executed, skip for dev="
"%s on host %s" % (pci_dev_dict, host['id']))
continue
LOG.info("Attempting to create new device "
"%s on host %s" % (pci_dev_dict, host['id']))
dev = self.dbapi.pci_device_create(host['id'],
pci_dev_dict)
except Exception:
if is_n3000_dev_not_reset:
LOG.info("N3000 reset not executed, skip for dev="
"%s on host %s" % (pci_dev_dict, host['id']))
continue
LOG.info("Attempting to create new device "
"%s on host %s" % (pci_dev_dict, host['id']))
dev = self.dbapi.pci_device_create(host['id'],
@ -2788,6 +2799,10 @@ class ConductorManager(service.PeriodicService):
# binding of the intended driver has not had a
# chance to be applied.
del attr['sriov_vf_driver']
if is_n3000_dev_not_reset:
LOG.info("N3000 reset not executed, skip for dev="
"%s on host %s" % (pci_dev_dict, host['id']))
continue
dev = self.dbapi.pci_device_update(dev['uuid'], attr)
except Exception:
LOG.exception("Failed to update port %s" %
@ -5130,7 +5145,7 @@ class ConductorManager(service.PeriodicService):
# apply filesystem config changes if all controllers at target
standby_config_target_flipped = None
if standby_host and standby_host.config_target:
standby_config_target_flipped = self._config_flip_reboot_required(standby_host.config_target)
standby_config_target_flipped = utils.config_flip_reboot_required(standby_host.config_target)
if not standby_host or (standby_host and
(standby_host.config_applied == standby_host.config_target or
standby_host.config_applied == standby_config_target_flipped)):
@ -5154,10 +5169,10 @@ class ConductorManager(service.PeriodicService):
# Ignore the reboot required bit for active controller when doing the comparison
active_config_target_flipped = None
if active_host and active_host.config_target:
active_config_target_flipped = self._config_flip_reboot_required(active_host.config_target)
active_config_target_flipped = utils.config_flip_reboot_required(active_host.config_target)
standby_config_target_flipped = None
if standby_host and standby_host.config_target:
standby_config_target_flipped = self._config_flip_reboot_required(standby_host.config_target)
standby_config_target_flipped = utils.config_flip_reboot_required(standby_host.config_target)
if active_host and active_config_target_flipped and \
active_host.config_applied == active_config_target_flipped:
# apply filesystem config changes if all controllers at target
@ -6498,7 +6513,7 @@ class ConductorManager(service.PeriodicService):
host.administrative == constants.ADMIN_UNLOCKED and
host.operational == constants.OPERATIONAL_ENABLED and
not (self._config_out_of_date(context, host) and
self._config_is_reboot_required(host.config_target))):
utils.config_is_reboot_required(host.config_target))):
runtime_hosts.append(host.uuid)
if runtime_hosts:
@ -9562,15 +9577,6 @@ class ConductorManager(service.PeriodicService):
ihost_obj.config_status = None
ihost_obj.save(context)
@staticmethod
def _config_is_reboot_required(config_uuid):
"""Check if the supplied config_uuid has the reboot required flag
:param config_uuid UUID object or UUID string
:return True if reboot is required, False otherwise
"""
return int(uuid.UUID(config_uuid)) & CONFIG_REBOOT_REQUIRED
@staticmethod
def _config_set_reboot_required(config_uuid):
"""Set the reboot required flag for the supplied UUID
@ -9580,7 +9586,7 @@ class ConductorManager(service.PeriodicService):
:rtype str
"""
uuid_str = str(config_uuid)
uuid_int = int(uuid.UUID(uuid_str)) | CONFIG_REBOOT_REQUIRED
uuid_int = int(uuid.UUID(uuid_str)) | constants.CONFIG_REBOOT_REQUIRED
return str(uuid.UUID(int=uuid_int))
@staticmethod
@ -9592,19 +9598,7 @@ class ConductorManager(service.PeriodicService):
:rtype str
"""
uuid_str = str(config_uuid)
uuid_int = int(uuid.UUID(uuid_str)) & ~CONFIG_REBOOT_REQUIRED
return str(uuid.UUID(int=uuid_int))
@staticmethod
def _config_flip_reboot_required(config_uuid):
"""flip the reboot required flag for the supplied UUID
:param config_uuid UUID object or UUID string
:return The modified UUID as a string
:rtype str
"""
uuid_str = str(config_uuid)
uuid_int = int(uuid.UUID(uuid_str)) ^ CONFIG_REBOOT_REQUIRED
uuid_int = int(uuid.UUID(uuid_str)) & ~constants.CONFIG_REBOOT_REQUIRED
return str(uuid.UUID(int=uuid_int))
def _update_host_config_reinstall(self, context, ihost_obj):
@ -9629,7 +9623,7 @@ class ConductorManager(service.PeriodicService):
# reboot required is still present
if (ihost_obj.config_target and
ihost_obj.config_applied != ihost_obj.config_target):
if self._config_is_reboot_required(ihost_obj.config_target):
if utils.config_is_reboot_required(ihost_obj.config_target):
config_uuid = self._config_set_reboot_required(config_uuid)
LOG.info("Setting config target of "
"host '%s' to '%s'." % (ihost_obj.hostname, config_uuid))

View File

@ -4,6 +4,9 @@
# SPDX-License-Identifier: Apache-2.0
#
import os
import tsconfig.tsconfig as tsc
# Currently we only support the following FPGA. In the future we may need to
# expand this to a list of devices, each with their own special set of
# device-specific information.
@ -30,3 +33,5 @@ OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
# This is a flag file created by puppet after doing a "docker login".
# We need to wait for it to exist before trying to run docker images.
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")

View File

@ -339,6 +339,9 @@ def get_n3000_pci_info():
for dev in pci_dev_array:
pci_devs.append(pci.PCIDevice(pci_dev, **dev))
is_fpga_n3000_reset = \
os.path.exists(constants.N3000_RESET_FLAG)
for dev in pci_devs:
pci_dev_dict = {'name': dev.name,
'pciaddr': dev.pci.pciaddr,
@ -359,7 +362,8 @@ def get_n3000_pci_info():
'sriov_vf_pdevice_id': dev.sriov_vf_pdevice_id,
'driver': dev.driver,
'enabled': dev.enabled,
'extra_info': dev.extra_info}
'extra_info': dev.extra_info,
'fpga_n3000_reset': is_fpga_n3000_reset}
LOG.debug('Sysinv FPGA Agent dev {}'.format(pci_dev_dict))
pci_device_list.append(pci_dev_dict)
except Exception:

View File

@ -26,10 +26,8 @@ from sysinv.common import utils
from sysinv.common import exception
from sysinv.fpga_agent.manager import get_n3000_devices
from sysinv.fpga_agent import constants
import tsconfig.tsconfig as tsc
# Volatile flag file so we only reset the N3000s once after bootup.
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")
LOG = log.getLogger(__name__)
@ -75,7 +73,7 @@ def reset_device_n3000(pci_addr):
def reset_n3000_fpgas():
if not os.path.exists(N3000_RESET_FLAG):
if not os.path.exists(constants.N3000_RESET_FLAG):
# Reset all N3000 FPGAs on the system.
# TODO: make this run in parallel if there are multiple devices.
LOG.info("Resetting N3000 FPGAs.")
@ -94,7 +92,7 @@ def reset_n3000_fpgas():
LOG.info("Done resetting N3000 FPGAs.")
if not got_exception:
utils.touch(N3000_RESET_FLAG)
utils.touch(constants.N3000_RESET_FLAG)
return True
else:
return False

View File

@ -27,7 +27,7 @@ from sysinv.agent.pci import PCIOperator
from sysinv.agent.pci import PCI
from sysinv.agent.manager import AgentManager
from sysinv.tests import base
from sysinv.fpga_agent import constants as fpga_constants
import tsconfig.tsconfig as tsc
FAKE_LSPCI_OUTPUT = {
@ -234,6 +234,26 @@ class TestAgentOperator(base.TestCase):
mock_exists.side_effect = file_exists_side_effect
ports, devices, macs = self._get_ports_inventory()
for dev in devices:
assert dev['fpga_n3000_reset'] is False
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_n3000_reset_flag(self, mock_exists):
    """Inventory reports fpga_n3000_reset=True when the reset flag exists."""

    def file_exists_side_effect(filename):
        # Only the worker-config-complete markers and the N3000 reset flag
        # are reported as present; every other path is treated as missing.
        return filename in [tsc.INITIAL_WORKER_CONFIG_COMPLETE,
                            tsc.VOLATILE_WORKER_CONFIG_COMPLETE,
                            fpga_constants.N3000_RESET_FLAG]

    mock_exists.side_effect = file_exists_side_effect

    ports, devices, macs = self._get_ports_inventory()

    for device in devices:
        assert device['fpga_n3000_reset'] is True
    assert len(ports) == 1
    assert len(devices) == 1
    assert len(macs) == 1

View File

@ -1761,6 +1761,131 @@ class ManagerTestCase(base.DbTestCase):
dev = self.dbapi.pci_device_get(PCI_DEV_2['pciaddr'], host_id)
self.assertEqual(dev['uuid'], PCI_DEV_2['uuid'])
def test_pci_device_update_n3000_by_host(self):
    """Exercise pci_device_update_by_host with the 'fpga_n3000_reset' flag.

    Scenarios covered, in order:
      * devices reported with the flag set are created;
      * with the flag cleared, a non-N3000 device is still created while
        the N3000 devices are not stored;
      * an existing N3000 FEC device is updated when the flag is set;
      * the same device is left untouched when the flag is cleared.
    """
    # Create compute-0 node
    config_uuid = str(uuid.uuid4())
    ihost = self._create_test_ihost(
        personality=constants.WORKER,
        hostname='compute-0',
        uuid=str(uuid.uuid4()),
        config_status=None,
        config_applied=config_uuid,
        config_target=config_uuid,
        invprovision=constants.PROVISIONED,
        administrative=constants.ADMIN_UNLOCKED,
        operational=constants.OPERATIONAL_ENABLED,
        availability=constants.AVAILABILITY_ONLINE,
    )
    host_uuid = ihost['uuid']
    host_id = ihost['id']

    # NOTE: the conductor deletes 'fpga_n3000_reset' from every dict it is
    # handed, so after a call the key-by-key comparisons below only cover
    # the remaining (stored) fields.
    def assert_stored(expected):
        dev = self.dbapi.pci_device_get(expected['pciaddr'], host_id)
        for key in expected:
            self.assertEqual(dev[key], expected[key])

    def assert_not_stored(unexpected):
        self.assertRaises(exception.ServerNotFound,
                          self.dbapi.pci_device_get,
                          unexpected['pciaddr'], host_id)

    PCI_DEV_1 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_dev_1',
                 'pciaddr': '0000:0b:01.0',
                 'pclass_id': '060100',
                 'pvendor_id': '8086',
                 'pdevice_id': '0443',
                 'enabled': True,
                 'fpga_n3000_reset': True}  # is the FPGA reset
    PCI_DEV_2 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b4_00_0',
                 'pciaddr': '0000:b4:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0d8f',  # N3000 FEC
                 'enabled': True,
                 'fpga_n3000_reset': True}  # is the FPGA reset
    pci_device_dict_array = [PCI_DEV_1, PCI_DEV_2]

    # create new dev
    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_device_dict_array)
    assert_stored(PCI_DEV_1)
    assert_stored(PCI_DEV_2)

    # test with fpga_n3000_reset as False
    PCI_DEV_3 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_dev_3',
                 'pciaddr': '0000:0c:01.0',
                 'pclass_id': '060100',
                 'pvendor_id': '8086',
                 'pdevice_id': '0443',
                 'enabled': True,
                 'fpga_n3000_reset': False}  # is the FPGA reset
    PCI_DEV_4 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b8_00_0',
                 'pciaddr': '0000:b8:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0d8f',  # N3000_FEC_PF_DEVICE
                 'enabled': True,
                 'fpga_n3000_reset': False}  # is the FPGA reset
    PCI_DEV_5 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b9_00_0',
                 'pciaddr': '0000:b9:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0b30',  # N3000_DEVICE
                 'enabled': True,
                 'fpga_n3000_reset': False}  # is the FPGA reset
    PCI_DEV_6 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b0_00_0',
                 'pciaddr': '0000:b0:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0b32',  # N3000_DEFAULT_DEVICE
                 'enabled': True,
                 'fpga_n3000_reset': False}  # is the FPGA reset
    pci_device_dict_array2 = [PCI_DEV_3, PCI_DEV_4, PCI_DEV_5, PCI_DEV_6]

    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_device_dict_array2)
    # The non-N3000 device is stored regardless of the reset flag ...
    assert_stored(PCI_DEV_3)
    # ... while N3000 devices reported before the reset must not be.
    assert_not_stored(PCI_DEV_4)
    assert_not_stored(PCI_DEV_5)
    assert_not_stored(PCI_DEV_6)

    # update existing dev
    pci_dev_dict_update = [{
        'pciaddr': PCI_DEV_2['pciaddr'],
        'pclass_id': '060500',
        'pvendor_id': '8086',
        'pdevice_id': '0d8f',
        'pclass': '0600',
        'pvendor': '',
        'psvendor': '',
        'psdevice': 'qat',
        'sriov_totalvfs': 32,
        'sriov_numvfs': 4,
        'sriov_vf_driver': 'vfio-pci',
        'sriov_vf_pdevice_id': '0d90',
        'sriov_vfs_pci_address':
            '0000:b4:00.1,0000:b4:00.2,0000:b4:00.3',
        'driver': 'igb_uio',
        'fpga_n3000_reset': True}]
    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_dev_dict_update)
    assert_stored(pci_dev_dict_update[0])

    # With the reset flag cleared the update must be ignored, so the VF
    # address list stored by the previous update has to survive.
    pci_dev_dict_update[0]['sriov_vfs_pci_address'] = ''
    pci_dev_dict_update[0]['fpga_n3000_reset'] = False
    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_dev_dict_update)
    dev = self.dbapi.pci_device_get(PCI_DEV_2['pciaddr'], host_id)
    self.assertNotEqual(dev['sriov_vfs_pci_address'],
                        pci_dev_dict_update[0]['sriov_vfs_pci_address'])
def test_inumas_update_by_ihost(self):
# Create compute-0 node
config_uuid = str(uuid.uuid4())
@ -1813,14 +1938,16 @@ class ManagerTestCase(base.DbTestCase):
'pclass_id': '060100',
'pvendor_id': '8086',
'pdevice_id': '0443',
'enabled': True}
'enabled': True,
'fpga_n3000_reset': True}
PCI_DEV_2 = {'uuid': str(uuid.uuid4()),
'name': 'pci_dev_2',
'pciaddr': '0000:0c:01.0',
'pclass_id': '012000',
'pvendor_id': '8086',
'pdevice_id': '0b30',
'enabled': True}
'enabled': True,
'fpga_n3000_reset': True}
pci_device_dict_array = [PCI_DEV_1, PCI_DEV_2]
# create new PCI dev