N3000 FEC device config does not recover on host re-install

The problem occurs during the port update: the inventory reports
generated before the worker node configuration erase the SRIOV
parameters obtained from the previous installation. Since the
reinstall is done in a locked state, the validation between the
number of VFs and the number of PCI addresses fails, because this
update is made from an unconfigured state, and the node cannot finish
its configuration. The correction consists of transporting the value
of worker_config_completed back to the conductor and using it to avoid
SRIOV parameter updates from an unconfigured SRIOV port.

During the standby controller reinstallation, the active controller
might collect inventory data prior to the FPGA reset. This reset
might relocate the device's PCI addresses, leaving invalid entries
in the active database due to the initial inventory report. The
correction consists of transporting the N3000 reset state back to the
conductor and using this information to decide whether the entry will
be incorporated into the database (true if the reset was executed).

Closes-bug: 1929301

Signed-off-by: Andre Fernando Zanella Kantek <AndreFernandoZanella.Kantek@windriver.com>
Change-Id: Ie3db6f4b13abc905ff533660196e7935239fc6fb
This commit is contained in:
Andre Fernando Zanella Kantek 2021-06-15 14:36:50 -04:00
parent 9ef262a5a7
commit 4cf6aa9344
5 changed files with 182 additions and 12 deletions

View File

@ -69,6 +69,7 @@ from sysinv.openstack.common import periodic_task
from sysinv.openstack.common.rpc.common import Timeout
from sysinv.openstack.common.rpc.common import serialize_remote_exception
from sysinv.openstack.common.rpc.common import RemoteError
from sysinv.fpga_agent import constants as fpga_constants
import tsconfig.tsconfig as tsc
@ -660,12 +661,16 @@ class AgentManager(service.PeriodicService):
'speed': port.speed,
'link_mode': port.link_mode,
'dev_id': port.dev_id,
'dpdksupport': port.dpdksupport}
'dpdksupport': port.dpdksupport,
'worker_config_completed': worker_config_completed}
LOG.debug('Sysinv Agent inic {}'.format(inic_dict))
port_list.append(inic_dict)
is_fpga_n3000_reset = \
os.path.exists(fpga_constants.N3000_RESET_FLAG)
for dev in pci_devs:
pci_dev_dict = {'name': dev.name,
'pciaddr': dev.pci.pciaddr,
@ -686,7 +691,8 @@ class AgentManager(service.PeriodicService):
'sriov_vf_pdevice_id': dev.sriov_vf_pdevice_id,
'driver': dev.driver,
'enabled': dev.enabled,
'extra_info': dev.extra_info}
'extra_info': dev.extra_info,
'fpga_n3000_reset': is_fpga_n3000_reset}
LOG.debug('Sysinv Agent dev {}'.format(pci_dev_dict))
pci_device_list.append(pci_dev_dict)

View File

@ -2309,18 +2309,19 @@ class ConductorManager(service.PeriodicService):
try:
port_dict = {
'sriov_totalvfs': inic['sriov_totalvfs'],
'sriov_numvfs': inic['sriov_numvfs'],
'sriov_vfs_pci_address':
inic['sriov_vfs_pci_address'],
'sriov_vf_driver':
inic['sriov_vf_driver'],
'sriov_vf_pdevice_id':
inic['sriov_vf_pdevice_id'],
'driver': inic['driver'],
'dpdksupport': inic['dpdksupport'],
'speed': inic['speed'],
}
# this data can only be updated if the node have its initial
# configuration done
if inic['worker_config_completed'] is True:
port_dict['sriov_numvfs'] = inic['sriov_numvfs']
port_dict['sriov_vfs_pci_address'] = inic['sriov_vfs_pci_address']
port_dict['sriov_vf_driver'] = inic['sriov_vf_driver']
port_dict['sriov_vf_pdevice_id'] = inic['sriov_vf_pdevice_id']
LOG.info("port %s update attr: %s" %
(port.uuid, port_dict))
self.dbapi.ethernet_port_update(port.uuid, port_dict)
@ -2763,6 +2764,12 @@ class ConductorManager(service.PeriodicService):
return
for pci_dev in pci_device_dict_array:
LOG.debug("Processing dev %s" % pci_dev)
is_n3000_dev_not_reset = False
if 'fpga_n3000_reset' in pci_dev.keys():
is_n3000_dev_not_reset = (pci_dev['pdevice_id'] in fpga_constants.N3000_DEVICES
and pci_dev['pvendor_id'] == fpga_constants.N3000_VENDOR
and not pci_dev['fpga_n3000_reset'])
del pci_dev['fpga_n3000_reset']
try:
pci_dev_dict = {'host_id': host['id']}
pci_dev_dict.update(pci_dev)
@ -2772,11 +2779,19 @@ class ConductorManager(service.PeriodicService):
hostid=host['id'])
dev_found = dev
if not dev:
if is_n3000_dev_not_reset:
LOG.info("N3000 reset not executed, skip for dev="
"%s on host %s" % (pci_dev_dict, host['id']))
continue
LOG.info("Attempting to create new device "
"%s on host %s" % (pci_dev_dict, host['id']))
dev = self.dbapi.pci_device_create(host['id'],
pci_dev_dict)
except Exception:
if is_n3000_dev_not_reset:
LOG.info("N3000 reset not executed, skip for dev="
"%s on host %s" % (pci_dev_dict, host['id']))
continue
LOG.info("Attempting to create new device "
"%s on host %s" % (pci_dev_dict, host['id']))
dev = self.dbapi.pci_device_create(host['id'],
@ -2817,6 +2832,10 @@ class ConductorManager(service.PeriodicService):
# binding of the intended driver has not had a
# chance to be applied.
del attr['sriov_vf_driver']
if is_n3000_dev_not_reset:
LOG.info("N3000 reset not executed, skip for dev="
"%s on host %s" % (pci_dev_dict, host['id']))
continue
dev = self.dbapi.pci_device_update(dev['uuid'], attr)
except Exception:
LOG.exception("Failed to update port %s" %

View File

@ -30,3 +30,5 @@ OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
# This is a flag file created by puppet after doing a "docker login".
# We need to wait for it to exist before trying to run docker images.
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"
# Flag file whose presence the sysinv agent reports to the conductor as
# 'fpga_n3000_reset' for every PCI device in its inventory.
# NOTE(review): presumably created once the N3000 FPGA reset has been
# executed on the host -- confirm against the code that writes it.
N3000_RESET_FLAG = "/var/run/.sysinv_n3000_reset"

View File

@ -27,7 +27,7 @@ from sysinv.agent.pci import PCIOperator
from sysinv.agent.pci import PCI
from sysinv.agent.manager import AgentManager
from sysinv.tests import base
from sysinv.fpga_agent import constants as fpga_constants
import tsconfig.tsconfig as tsc
FAKE_LSPCI_OUTPUT = {
@ -234,6 +234,26 @@ class TestAgentOperator(base.TestCase):
mock_exists.side_effect = file_exists_side_effect
ports, devices, macs = self._get_ports_inventory()
for dev in devices:
assert dev['fpga_n3000_reset'] is False
assert len(ports) == 1
assert len(devices) == 1
assert len(macs) == 1
@mock.patch('os.path.exists')
def test_get_pci_inventory_n3000_reset_flag(self, mock_exists):
    """Inventory devices carry fpga_n3000_reset=True when the flag exists.

    os.path.exists is patched so that only the worker-config-complete
    flags and the N3000 reset flag appear to be present on disk.
    """
    def file_exists_side_effect(filename):
        # Only these three paths are reported as existing.
        return filename in [tsc.INITIAL_WORKER_CONFIG_COMPLETE,
                            tsc.VOLATILE_WORKER_CONFIG_COMPLETE,
                            fpga_constants.N3000_RESET_FLAG]

    mock_exists.side_effect = file_exists_side_effect

    ports, devices, macs = self._get_ports_inventory()

    # Every reported device must reflect the presence of the reset flag.
    for device in devices:
        assert device['fpga_n3000_reset'] is True

    assert len(ports) == 1
    assert len(devices) == 1
    assert len(macs) == 1

View File

@ -1732,6 +1732,127 @@ class ManagerTestCase(base.DbTestCase):
dev = self.dbapi.pci_device_get(PCI_DEV_2['pciaddr'], host_id)
self.assertEqual(dev['uuid'], PCI_DEV_2['uuid'])
def test_pci_device_update_n3000_by_host(self):
    """Check how the N3000 reset flag gates PCI device create/update.

    Scenarios covered, in order:
      * devices reported with 'fpga_n3000_reset' True are created;
      * with the flag False, the device ids annotated below as N3000
        ('0d8f', '0b30', '0b32') are not created, while the '0443'
        device is still created;
      * an update reported with the flag True is stored;
      * an update reported with the flag False does not overwrite the
        stored 'sriov_vfs_pci_address'.
    """
    # Create compute-0 node
    config_uuid = str(uuid.uuid4())
    ihost = self._create_test_ihost(
        personality=constants.WORKER,
        hostname='compute-0',
        uuid=str(uuid.uuid4()),
        config_status=None,
        config_applied=config_uuid,
        config_target=config_uuid,
        invprovision=constants.PROVISIONED,
        administrative=constants.ADMIN_UNLOCKED,
        operational=constants.OPERATIONAL_ENABLED,
        availability=constants.AVAILABILITY_ONLINE,
    )
    host_uuid = ihost['uuid']
    host_id = ihost['id']

    # NOTE: the conductor strips the 'fpga_n3000_reset' key from each
    # reported dict while processing it, so the "for key in ..." loops
    # below only compare fields that are stored in the database.
    PCI_DEV_1 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_dev_1',
                 'pciaddr': '0000:0b:01.0',
                 'pclass_id': '060100',
                 'pvendor_id': '8086',
                 'pdevice_id': '0443',
                 'enabled': True,
                 'fpga_n3000_reset': True}  # reset flag as reported
    PCI_DEV_2 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b4_00_0',
                 'pciaddr': '0000:b4:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0d8f',  # N3000 FEC
                 'enabled': True,
                 'fpga_n3000_reset': True}  # reset flag as reported
    pci_device_dict_array = [PCI_DEV_1, PCI_DEV_2]

    # create new dev
    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_device_dict_array)

    dev = self.dbapi.pci_device_get(PCI_DEV_1['pciaddr'], host_id)
    for key in PCI_DEV_1:
        self.assertEqual(dev[key], PCI_DEV_1[key])

    dev = self.dbapi.pci_device_get(PCI_DEV_2['pciaddr'], host_id)
    for key in PCI_DEV_2:
        self.assertEqual(dev[key], PCI_DEV_2[key])

    # test with fpga_n3000_reset as False
    PCI_DEV_3 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_dev_3',
                 'pciaddr': '0000:0c:01.0',
                 'pclass_id': '060100',
                 'pvendor_id': '8086',
                 'pdevice_id': '0443',
                 'enabled': True,
                 'fpga_n3000_reset': False}  # reset flag as reported
    PCI_DEV_4 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b8_00_0',
                 'pciaddr': '0000:b8:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0d8f',  # N3000_FEC_PF_DEVICE
                 'enabled': True,
                 'fpga_n3000_reset': False}  # reset flag as reported
    PCI_DEV_5 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b9_00_0',
                 'pciaddr': '0000:b9:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0b30',  # N3000_DEVICE
                 'enabled': True,
                 'fpga_n3000_reset': False}  # reset flag as reported
    PCI_DEV_6 = {'uuid': str(uuid.uuid4()),
                 'name': 'pci_0000_b0_00_0',
                 'pciaddr': '0000:b0:00.0',
                 'pclass_id': '120000',
                 'pvendor_id': '8086',
                 'pdevice_id': '0b32',  # N3000_DEFAULT_DEVICE
                 'enabled': True,
                 'fpga_n3000_reset': False}  # reset flag as reported
    pci_device_dict_array2 = [PCI_DEV_3, PCI_DEV_4, PCI_DEV_5, PCI_DEV_6]

    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_device_dict_array2)

    # The '0443' device is still created even though the flag is False.
    dev = self.dbapi.pci_device_get(PCI_DEV_3['pciaddr'], host_id)
    for key in PCI_DEV_3:
        self.assertEqual(dev[key], PCI_DEV_3[key])

    # The three N3000 device ids must not have been created.
    self.assertRaises(exception.ServerNotFound,
                      self.dbapi.pci_device_get,
                      PCI_DEV_4['pciaddr'], host_id)
    self.assertRaises(exception.ServerNotFound,
                      self.dbapi.pci_device_get,
                      PCI_DEV_5['pciaddr'], host_id)
    self.assertRaises(exception.ServerNotFound,
                      self.dbapi.pci_device_get,
                      PCI_DEV_6['pciaddr'], host_id)

    # update existing dev
    pci_dev_dict_update = [{
        'pciaddr': PCI_DEV_2['pciaddr'],
        'pclass_id': '060500',
        'pvendor_id': '8086',
        'pdevice_id': '0d8f',
        'pclass': '0600',
        'pvendor': '',
        'psvendor': '',
        'psdevice': 'qat',
        'sriov_totalvfs': 32,
        'sriov_numvfs': 4,
        'sriov_vf_driver': 'vfio-pci',
        'sriov_vf_pdevice_id': '0d90',
        # Four VF addresses, matching sriov_numvfs above; every entry
        # uses the four-digit PCI domain form.
        'sriov_vfs_pci_address':
            '0000:b4:00.1,0000:b4:00.2,0000:b4:00.3,0000:b4:00.4',
        'driver': 'igb_uio',
        'fpga_n3000_reset': True}]
    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_dev_dict_update)

    dev = self.dbapi.pci_device_get(PCI_DEV_2['pciaddr'], host_id)
    for key in pci_dev_dict_update[0]:
        self.assertEqual(dev[key], pci_dev_dict_update[0][key])

    # Report the same device again with the flag set back to False and
    # an empty VF address list: the stored addresses must not be
    # replaced by the empty value.
    pci_dev_dict_update[0]['sriov_vfs_pci_address'] = ''
    pci_dev_dict_update[0]['fpga_n3000_reset'] = False
    self.service.pci_device_update_by_host(self.context, host_uuid,
                                           pci_dev_dict_update)
    dev = self.dbapi.pci_device_get(PCI_DEV_2['pciaddr'], host_id)
    self.assertNotEqual(dev['sriov_vfs_pci_address'],
                        pci_dev_dict_update[0]['sriov_vfs_pci_address'])
def test_inumas_update_by_ihost(self):
# Create compute-0 node
config_uuid = str(uuid.uuid4())
@ -1784,14 +1905,16 @@ class ManagerTestCase(base.DbTestCase):
'pclass_id': '060100',
'pvendor_id': '8086',
'pdevice_id': '0443',
'enabled': True}
'enabled': True,
'fpga_n3000_reset': True}
PCI_DEV_2 = {'uuid': str(uuid.uuid4()),
'name': 'pci_dev_2',
'pciaddr': '0000:0c:01.0',
'pclass_id': '012000',
'pvendor_id': '8086',
'pdevice_id': '0b30',
'enabled': True}
'enabled': True,
'fpga_n3000_reset': True}
pci_device_dict_array = [PCI_DEV_1, PCI_DEV_2]
# create new PCI dev