Merge "Add sysinv-reset-n3000-fpgas cmd"

This commit is contained in:
Zuul 2021-04-22 20:53:02 +00:00 committed by Gerrit Code Review
commit 209d0c3d7a
5 changed files with 153 additions and 58 deletions

View File

@ -39,6 +39,7 @@ console_scripts =
sysinv-helm = sysinv.cmd.helm:main sysinv-helm = sysinv.cmd.helm:main
sysinv-utils = sysinv.cmd.utils:main sysinv-utils = sysinv.cmd.utils:main
cert-mon = sysinv.cmd.cert_mon:main cert-mon = sysinv.cmd.cert_mon:main
sysinv-reset-n3000-fpgas = sysinv.cmd.reset_n3000_fpgas:main
systemconfig.puppet_plugins = systemconfig.puppet_plugins =
001_platform = sysinv.puppet.platform:PlatformPuppet 001_platform = sysinv.puppet.platform:PlatformPuppet

View File

@ -0,0 +1,41 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2021 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
from oslo_config import cfg
from oslo_log import log as logging
from sysinv.fpga_agent.reset_n3000_fpgas import reset_n3000_fpgas
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
def main():
    """Entry point for the ``sysinv-reset-n3000-fpgas`` console script.

    Registers and sets up oslo logging, loads the sysinv configuration,
    then runs reset_n3000_fpgas() and turns its boolean result into the
    process exit status.

    :raises SystemExit: always; code 0 when reset_n3000_fpgas() returns a
                        truthy value, code 1 otherwise.
    """
    logging.register_options(CONF)
    CONF(project='sysinv', prog='reset-n3000-fpgas')
    logging.set_defaults()
    logging.setup(CONF, 'reset-n3000-fpgas')

    # Raise SystemExit directly instead of calling the builtin exit():
    # that helper is injected by the "site" module for interactive use and
    # is not guaranteed to exist (e.g. "python -S" or frozen interpreters).
    raise SystemExit(0 if reset_n3000_fpgas() else 1)


if __name__ == '__main__':
    main()

View File

@ -22,3 +22,11 @@ N3000_DEVICES = [
N3000_FEC_PF_DEVICE, N3000_FEC_PF_DEVICE,
N3000_DEFAULT_DEVICE, N3000_DEFAULT_DEVICE,
] ]
# TODO: Make this specified in the config file.
# This is the docker image containing the OPAE tools to access the FPGA device.
OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
# This is a flag file created by puppet after doing a "docker login".
# We need to wait for it to exist before trying to run docker images.
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"

View File

@ -81,17 +81,10 @@ CONF.register_opts(agent_opts, 'fpga_agent')
# This is the docker image containing the OPAE tools to access the FPGA device. # This is the docker image containing the OPAE tools to access the FPGA device.
OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0" OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
# This is a flag file created by puppet after doing a "docker login".
# We need to wait for it to exist before trying to run docker images.
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"
# This is the location where we cache the device image file while # This is the location where we cache the device image file while
# writing it to the hardware. # writing it to the hardware.
DEVICE_IMAGE_CACHE_DIR = "/usr/local/share/applications/sysinv" DEVICE_IMAGE_CACHE_DIR = "/usr/local/share/applications/sysinv"
# Volatile flag file so we only reset the N3000s once after bootup.
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")
SYSFS_DEVICE_PATH = "/sys/bus/pci/devices/" SYSFS_DEVICE_PATH = "/sys/bus/pci/devices/"
FME_PATH = "/fpga/intel-fpga-dev.*/intel-fpga-fme.*/" FME_PATH = "/fpga/intel-fpga-dev.*/intel-fpga-fme.*/"
SPI_PATH = "spi-altera.*.auto/spi_master/spi*/spi*.*/" SPI_PATH = "spi-altera.*.auto/spi_master/spi*/spi*.*/"
@ -110,7 +103,7 @@ BMC_BUILD_VER_PATH = "max10_version"
def wait_for_docker_login(): def wait_for_docker_login():
# TODO: add a timeout # TODO: add a timeout
LOG.info("Waiting for docker login flag.") LOG.info("Waiting for docker login flag.")
while not os.path.exists(DOCKER_LOGIN_FLAG): while not os.path.exists(constants.DOCKER_LOGIN_FLAG):
time.sleep(1) time.sleep(1)
LOG.info("Found docker login flag, continuing.") LOG.info("Found docker login flag, continuing.")
@ -195,33 +188,6 @@ def write_device_image_n3000(filename, pci_addr):
raise exception.SysinvException(msg) raise exception.SysinvException(msg)
def reset_device_n3000(pci_addr):
# Reset the N3000 FPGA at the specified PCI address.
try:
# Build up the command to perform the reset.
# Note the hack to work around OPAE tool locale issues
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
"-e LANG=en_US.UTF-8 " + OPAE_IMG +
" rsu bmcimg " + pci_addr)
# Issue the command to perform the firmware update.
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exc:
# "docker run" return code will be:
# 125 if the error is with Docker daemon itself
# 126 if the contained command cannot be invoked
# 127 if the contained command cannot be found
# Exit code of contained command otherwise
msg = ("Failed to reset device %s, "
"return code is %d, command output: %s." %
(pci_addr, exc.returncode,
exc.output.decode('utf-8')))
LOG.error(msg)
LOG.error("Check for intel-max10 kernel logs.")
raise exception.SysinvException(msg)
def read_n3000_sysfs_file(pattern): def read_n3000_sysfs_file(pattern):
# Read a sysfs file related to the N3000. # Read a sysfs file related to the N3000.
# The result should be an empty string if the file doesn't exist, # The result should be an empty string if the file doesn't exist,
@ -347,24 +313,6 @@ def get_n3000_devices():
return fpga_addrs return fpga_addrs
def reset_n3000_fpgas():
# We only want to do this once after host startup.
if not os.path.exists(N3000_RESET_FLAG):
# Reset all N3000 FPGAs on the system.
# TODO: make this run in parallel if there are multiple devices.
LOG.info("Resetting N3000 FPGAs.")
got_exception = False
fpga_addrs = get_n3000_devices()
for fpga_addr in fpga_addrs:
try:
reset_device_n3000(fpga_addr)
except Exception:
got_exception = True
LOG.info("Done resetting N3000 FPGAs.")
if not got_exception:
utils.touch(N3000_RESET_FLAG)
def get_n3000_pci_info(): def get_n3000_pci_info():
""" Query PCI information about N3000 PCI devices. """ Query PCI information about N3000 PCI devices.
@ -465,11 +413,6 @@ class FpgaAgentManager(service.PeriodicService):
# Wait for puppet to log in to the local docker registry # Wait for puppet to log in to the local docker registry
wait_for_docker_login() wait_for_docker_login()
# Trigger reset of N3000 FPGAs. This is needed because the PCI address
# changes on the first reset after boot.
reset_n3000_fpgas()
# Wait around until someone else updates the platform.conf file # Wait around until someone else updates the platform.conf file
# with our host UUID. # with our host UUID.
self.wait_for_host_uuid() self.wait_for_host_uuid()

View File

@ -0,0 +1,102 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2021 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
import os
import shlex
from eventlet.green import subprocess
from oslo_log import log
from sysinv.common import utils
from sysinv.common import exception
from sysinv.fpga_agent.manager import get_n3000_devices
from sysinv.fpga_agent import constants
import tsconfig.tsconfig as tsc
# Volatile flag file so we only reset the N3000s once after bootup.
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")
LOG = log.getLogger(__name__)
def n3000_img_accessible():
    """Check whether docker lists the OPAE tools image.

    Runs ``docker image list`` filtered on constants.OPAE_IMG and looks
    for an exact "repository:tag" match in the output.

    :returns: True if the image is listed; False if it is not listed or
              if the docker command itself could not be run.
    """
    cmd = 'docker image list "%s" --format "{{.Repository}}:{{.Tag}}"' % \
        constants.OPAE_IMG
    try:
        items = subprocess.check_output(shlex.split(cmd),  # pylint: disable=not-callable
                                        stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError) as exc:
        # If docker cannot be queried we cannot claim the image is usable.
        LOG.warning("Unable to query docker for %s image: %s",
                    constants.OPAE_IMG, exc)
        return False

    # check_output() returns bytes on Python 3; without decoding, the
    # comparison against the str constant below could never succeed.
    if isinstance(items, bytes):
        items = items.decode('utf-8', 'replace')

    for line in items.splitlines():
        if line.strip() == constants.OPAE_IMG:
            LOG.info('%s image found', constants.OPAE_IMG)
            return True

    LOG.info("%s image not found.", constants.OPAE_IMG)
    return False
def reset_device_n3000(pci_addr):
    """Reset the N3000 FPGA at the specified PCI address.

    Runs ``rsu bmcimg <pci_addr>`` inside the OPAE tools docker image.

    :param pci_addr: PCI address of the FPGA, passed verbatim to ``rsu``.
    :raises SysinvException: if the docker command exits non-zero; the
                             message carries the return code and the
                             captured command output.
    """
    # Build up the command to perform the reset.
    # Note the hack to work around OPAE tool locale issues
    cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
           "-e LANG=en_US.UTF-8 " + constants.OPAE_IMG +
           " rsu bmcimg " + pci_addr)

    try:
        # Issue the command to perform the reset.
        subprocess.check_output(shlex.split(cmd),  # pylint: disable=not-callable
                                stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as exc:
        # "docker run" return code will be:
        #    125 if the error is with Docker daemon itself
        #    126 if the contained command cannot be invoked
        #    127 if the contained command cannot be found
        #    Exit code of contained command otherwise
        #
        # Decode leniently: a stray non-UTF-8 byte in the tool output must
        # not raise UnicodeDecodeError here and hide the real failure.
        output = exc.output.decode('utf-8', 'replace')
        msg = ("Failed to reset device %s, "
               "return code is %d, command output: %s." %
               (pci_addr, exc.returncode, output))
        LOG.error(msg)
        LOG.error("Check for intel-max10 kernel logs.")
        raise exception.SysinvException(msg)
def reset_n3000_fpgas():
    """Reset every N3000 FPGA on this host, at most once per boot.

    N3000_RESET_FLAG is a volatile flag file; once it exists the reset is
    considered done and this function returns immediately. The flag is
    only created when every device was reset without error.

    :returns: True if the reset was already done or has just completed
              for all devices; False if docker is not usable yet or if
              resetting at least one device failed.
    """
    if os.path.exists(N3000_RESET_FLAG):
        return True

    # Reset all N3000 FPGAs on the system.
    # TODO: make this run in parallel if there are multiple devices.
    LOG.info("Resetting N3000 FPGAs.")
    fpga_addrs = get_n3000_devices()

    # Bail out only when the OPAE image is not listed by docker AND the
    # docker login flag file does not exist yet.
    if not n3000_img_accessible() and \
            not os.path.exists(constants.DOCKER_LOGIN_FLAG):
        LOG.info("Neither docker image nor docker login is ready, exit...")
        return False

    got_exception = False
    for fpga_addr in fpga_addrs:
        try:
            reset_device_n3000(fpga_addr)
        except Exception:
            # Keep going so one bad device does not block the others, but
            # leave a trace; the failure is reported via the return value.
            LOG.exception("Failed to reset N3000 FPGA %s.", fpga_addr)
            got_exception = True

    LOG.info("Done resetting N3000 FPGAs.")
    if got_exception:
        return False

    utils.touch(N3000_RESET_FLAG)
    return True