diff --git a/sysinv/sysinv-fpga-agent/centos/sysinv-fpga-agent.spec b/sysinv/sysinv-fpga-agent/centos/sysinv-fpga-agent.spec index 048a816061..c141650b0e 100644 --- a/sysinv/sysinv-fpga-agent/centos/sysinv-fpga-agent.spec +++ b/sysinv/sysinv-fpga-agent/centos/sysinv-fpga-agent.spec @@ -34,10 +34,6 @@ install -p -D -m 644 sysinv-fpga-agent.service %{buildroot}%{_unitdir}/sysinv-fp install -p -D -m 644 sysinv-conf-watcher.service %{buildroot}%{_unitdir}/sysinv-conf-watcher.service install -p -D -m 644 sysinv-conf-watcher.path %{buildroot}%{_unitdir}/sysinv-conf-watcher.path -# Workaround to call "docker login" during startup. Called by puppet. -install -d -m 755 %{buildroot}%{_exec_prefix}/local/sbin -install -p -D -m 755 run_docker_login %{buildroot}%{_exec_prefix}/local/sbin/run_docker_login - %post /usr/bin/systemctl enable sysinv-fpga-agent.service >/dev/null 2>&1 /usr/bin/systemctl enable sysinv-conf-watcher.service >/dev/null 2>&1 @@ -54,4 +50,3 @@ rm -rf $RPM_BUILD_ROOT %{_unitdir}/sysinv-fpga-agent.service %{_unitdir}/sysinv-conf-watcher.service %{_unitdir}/sysinv-conf-watcher.path -%{_exec_prefix}/local/sbin/run_docker_login diff --git a/sysinv/sysinv-fpga-agent/run_docker_login b/sysinv/sysinv-fpga-agent/run_docker_login deleted file mode 100644 index 3e35d9bfa6..0000000000 --- a/sysinv/sysinv-fpga-agent/run_docker_login +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash -# -# Copyright (c) 2020 Wind River Systems, Inc. -# -# SPDX-License-Identifier: Apache-2.0 -# - -DOCKER_USERNAME=$1 -DOCKER_PASSWORD=$2 - -LNAME=$(readlink -n -f $0) -NAME=$(basename $LNAME) - -# Log info message to /var/log/daemon.log -function LOG { - logger -p daemon.info -t "${NAME}($$): " "$@" -} - - -# Wait around for the "registry.local" name to resolve -LOG "Waiting for registry.local to resolve" -while true -do - # We can't easily ask for both A and AAAA records in the same request - # because if the customer mis-configures things with a "good" nameserver - # and a non-existant nameserver dig will return "9" even though it finds - # an AAAA record on the "good" server. So we need to ask for A and AAAA - # records separately. Once we have either type of record we can proceed. - - # First check for A records for IPv4 - ADDR=`dig +short registry.local A` - if [ $? -eq 0 ] - then - # We got a response back from the server, but we need to check - # if we got an address or not. If there is no address, ADDR will - # be an empty string. - if [ -n "$ADDR" ] - then - LOG "registry.local resolved IPv4, continuing with docker login" - break - fi - fi - - # Then check for AAAA records for IPv6 - ADDR=`dig +short registry.local AAAA` - if [ $? -eq 0 ] - then - # We got a response back from the server, but we need to check - # if we got an address or not. If there is no address, ADDR will - # be an empty string. - if [ -n "$ADDR" ] - then - LOG "registry.local resolved IPv6, continuing with docker login" - break - fi - fi - - sleep 1 -done - -while true -do - res=$(docker login --password-stdin -u ${DOCKER_USERNAME} registry.local:9001 2>&1 <<< ${DOCKER_PASSWORD}) - rc=$? - if [ ${rc} -eq 0 ] - then - LOG "docker login to registry.local completed successfully" - touch /var/run/docker_login_done - break - else - LOG "docker login error ${rc} ${res}" - sleep 3 - fi -done diff --git a/sysinv/sysinv/sysinv/sysinv/fpga_agent/constants.py b/sysinv/sysinv/sysinv/sysinv/fpga_agent/constants.py index 0fdf408381..4cd1a194fd 100644 --- a/sysinv/sysinv/sysinv/sysinv/fpga_agent/constants.py +++ b/sysinv/sysinv/sysinv/sysinv/fpga_agent/constants.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2020 Wind River Systems, Inc. +# Copyright (c) 2020-2021 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # @@ -28,13 +28,11 @@ N3000_DEVICES = [ # TODO: Make this specified in the config file. # This is the docker image containing the OPAE tools to access the FPGA device. +OPAE_IMG_PREV = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0" OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.6.0-v1.0.1" -# This is a flag file created by puppet after doing a "docker login". -# We need to wait for it to exist before trying to run docker images. -DOCKER_LOGIN_FLAG = "/var/run/docker_login_done" - N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset") +N3000_RESET_TIMEOUT = 600 # This flag is set if the N3000 requires a second reset N3000_RETIMER_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".sysinv_n3000_retimer") diff --git a/sysinv/sysinv/sysinv/sysinv/fpga_agent/manager.py b/sysinv/sysinv/sysinv/sysinv/fpga_agent/manager.py index 190cef9e8f..27da85e55f 100644 --- a/sysinv/sysinv/sysinv/sysinv/fpga_agent/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/fpga_agent/manager.py @@ -17,7 +17,7 @@ # License for the specific language governing permissions and limitations # under the License. # -# Copyright (c) 2020 Wind River Systems, Inc. +# Copyright (c) 2020-2021 Wind River Systems, Inc. # @@ -97,12 +97,17 @@ BMC_FW_VER_PATH = "bmcfw_flash_ctrl/bmcfw_version" BMC_BUILD_VER_PATH = "max10_version" -def wait_for_docker_login(): - # TODO: add a timeout - LOG.info("Waiting for docker login flag.") - while not os.path.exists(constants.DOCKER_LOGIN_FLAG): +def wait_for_n3000_reset(): + LOG.info("Waiting for n3000 reset flag.") + timeout = 0 + while not os.path.exists(constants.N3000_RESET_FLAG): + if timeout > constants.N3000_RESET_TIMEOUT: + msg = ("Timeout waiting for n3000 reset flag") + LOG.info(msg) + return time.sleep(1) - LOG.info("Found docker login flag, continuing.") + timeout += 1 + LOG.info("Found n3000 reset flag, continuing.") def ensure_device_image_cache_exists(): @@ -151,31 +156,69 @@ def fetch_device_image(filename): return local_path +def cleanup_container(): + # Delete container if exists + cmd = 'ctr -n=k8s.io container list image=="%s"' % constants.OPAE_IMG + items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable + stderr=subprocess.STDOUT, + universal_newlines=True) + for line in items.splitlines(): + if constants.OPAE_IMG in line: + cmd = 'ctr -n=k8s.io container rm n3000-opae' + subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable + stderr=subprocess.STDOUT, + universal_newlines=True) + LOG.info('Deleted stale container n3000-opae') + break + + +def set_cgroup_cpuset(): + # Set CPU affinity by updating the cpuset.cpus + platform_cpulist = '0' + cpuset_path = '/sys/fs/cgroup/cpuset/platform/' + cpuset_file = os.path.join(cpuset_path, 'cpuset.cpus') + if not os.path.exists(cpuset_path): + os.makedirs(cpuset_path) + with open('/etc/platform/worker_reserved.conf', 'r') as infile: + for line in infile: + if "PLATFORM_CPU_LIST" in line: + val = line.split("=") + platform_cpulist = val[1].strip('\n')[1:-1].strip('"') + with open(cpuset_file, 'w') as fd: + LOG.info("Writing %s to file %s" % (platform_cpulist, cpuset_file)) + fd.write(platform_cpulist) + + def write_device_image_n3000(filename, pci_addr): # Write the firmware image to the FPGA at the specified PCI address. # We're assuming that the image update tools will catch the scenario # where the image is not compatible with the device. + + # If the container exists, the host probably rebooted during + # a device update. Delete the container. + cleanup_container() + + # Set cpu affinity for the container + set_cgroup_cpuset() + try: # Build up the command to perform the firmware update. # Note the hack to work around OPAE tool locale issues - cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 " - "-e LANG=en_US.UTF-8 -v " + DEVICE_IMAGE_CACHE_DIR + - ":" + "/mnt/images " + constants.OPAE_IMG + - " fpgasupdate -y --log-level debug /mnt/images/" + + cmd = ("ctr -n=k8s.io run --rm --privileged " + + "--env LC_ALL=en_US.UTF-8 --env LANG=en_US.UTF-8 " + + "--cgroup platform " + + "--mount type=bind,src=" + DEVICE_IMAGE_CACHE_DIR + + ",dst=/mnt/images,options=rbind:ro " + constants.OPAE_IMG + + " n3000-opae fpgasupdate -y --log-level debug /mnt/images/" + filename + " " + pci_addr) # Issue the command to perform the firmware update. subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable - stderr=subprocess.STDOUT) + stderr=subprocess.STDOUT) # TODO: switch to subprocess.Popen, parse the output and send # progress updates. except subprocess.CalledProcessError as exc: # Check the return code, send completion info to sysinv-conductor. - # "docker run" return code will be: - # 125 if the error is with Docker daemon itself - # 126 if the contained command cannot be invoked - # 127 if the contained command cannot be found - # Exit code of contained command otherwise msg = ("Failed to update device image %s for device %s, " "return code is %d, command output: %s." % (filename, pci_addr, exc.returncode, @@ -417,8 +460,8 @@ class FpgaAgentManager(service.PeriodicService): LOG.info('No config file for sysinv-fpga-agent found.') raise exception.ConfigNotFound(message="Unable to find sysinv config file!") - # Wait for puppet to log in to the local docker registry - wait_for_docker_login() + # Wait for puppet to finish resetting n3000 devices + wait_for_n3000_reset() # Wait around until someone else updates the platform.conf file # with our host UUID. self.wait_for_host_uuid() diff --git a/sysinv/sysinv/sysinv/sysinv/fpga_agent/reset_n3000_fpgas.py b/sysinv/sysinv/sysinv/sysinv/fpga_agent/reset_n3000_fpgas.py index 1430383d4c..68bf41178f 100644 --- a/sysinv/sysinv/sysinv/sysinv/fpga_agent/reset_n3000_fpgas.py +++ b/sysinv/sysinv/sysinv/sysinv/fpga_agent/reset_n3000_fpgas.py @@ -45,38 +45,63 @@ EEPROM_UPDATE_SUCCESS = '0x1111' def n3000_img_accessible(): - cmd = 'docker image list "%s" --format "{{.Repository}}:{{.Tag}}"' % \ - constants.OPAE_IMG + cmd = 'ctr -n=k8s.io image list name=="%s"' % constants.OPAE_IMG items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable - stderr=subprocess.STDOUT, - universal_newlines=True) + stderr=subprocess.STDOUT, + universal_newlines=True) for line in items.splitlines(): - if line == constants.OPAE_IMG: + if constants.OPAE_IMG in line: LOG.info('%s image found' % constants.OPAE_IMG) - return True + return constants.OPAE_IMG + LOG.info('%s image not found, check older image' % + constants.OPAE_IMG) + # During upgrade. check if previous version is available + cmd = 'ctr -n=k8s.io image list name=="%s"' % constants.OPAE_IMG_PREV + items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable + stderr=subprocess.STDOUT, + universal_newlines=True) + for line in items.splitlines(): + if constants.OPAE_IMG_PREV in line: + LOG.info('%s image found' % constants.OPAE_IMG_PREV) + return constants.OPAE_IMG_PREV + LOG.info('%s image not found, try image pull from controller' % + constants.OPAE_IMG_PREV) - LOG.info("%s image not found." % constants.OPAE_IMG) - return False + # n3000 image not found in containerd, get it from the controller + try: + subprocess.check_output(["crictl", "pull", constants.OPAE_IMG]) # pylint: disable=not-callable + LOG.info("Image %s imported by containerd" % constants.OPAE_IMG) + return constants.OPAE_IMG + except subprocess.CalledProcessError as exc: + msg = ("Failed to pull image %s, " + "return code is %d, command output: %s." % + (constants.OPAE_IMG, exc.returncode, exc.output)) + LOG.info(msg) + # During upgrade the current version is not available, + # try pulling the previous version + try: + subprocess.check_output(["crictl", "pull", constants.OPAE_IMG_PREV]) # pylint: disable=not-callable + LOG.info("Image %s imported by containerd" % constants.OPAE_IMG_PREV) + return constants.OPAE_IMG_PREV + except subprocess.CalledProcessError as exc: + msg = ("Failed to pull image %s, " + "return code is %d, command output: %s." % + (constants.OPAE_IMG_PREV, exc.returncode, exc.output)) + LOG.info(msg) + return None -def reset_device_n3000(pci_addr): +def reset_device_n3000(pci_addr, opae_img): # Reset the N3000 FPGA at the specified PCI address. try: # Build up the command to perform the reset. # Note the hack to work around OPAE tool locale issues - cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 " - "-e LANG=en_US.UTF-8 " + constants.OPAE_IMG + - " rsu bmcimg " + pci_addr) - + cmd = ("ctr -n=k8s.io run --rm --privileged " + opae_img + + " v1 rsu bmcimg " + pci_addr) # Issue the command to perform the firmware update. subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable - stderr=subprocess.STDOUT) + stderr=subprocess.STDOUT) except subprocess.CalledProcessError as exc: - # "docker run" return code will be: - # 125 if the error is with Docker daemon itself - # 126 if the contained command cannot be invoked - # 127 if the contained command cannot be found - # Exit code of contained command otherwise msg = ("Failed to reset device %s, " "return code is %d, command output: %s." % (pci_addr, exc.returncode, @@ -139,14 +164,14 @@ def reset_n3000_fpgas(): LOG.info("Resetting N3000 FPGAs.") got_exception = False fpga_addrs = get_n3000_devices() - if not n3000_img_accessible() and \ - not os.path.exists(constants.DOCKER_LOGIN_FLAG): - LOG.info("Either docker image or docker login is ready, exit...") + opae_img = n3000_img_accessible() + if opae_img is None: + LOG.info("n3000 opae image is not ready, exit...") return False for fpga_addr in fpga_addrs: try: - reset_device_n3000(fpga_addr) + reset_device_n3000(fpga_addr, opae_img) except Exception: got_exception = True @@ -158,7 +183,7 @@ def reset_n3000_fpgas(): LOG.info("Updating retimer") update_device_n3000_retimer(fpga_addr) LOG.info("Resetting N3000 second time") - reset_device_n3000(fpga_addr) + reset_device_n3000(fpga_addr, opae_img) except Exception: got_exception = True