Remove the use of docker for FPGA tools

This update replaces the docker container runtime with the containerd
ctr client for resetting the N3000 FPGA on node startup and for
updating N3000 device images.

Test Plan:
- Tested that non-controller-0 nodes can pull the n3000-opae image and
  reset n3000 devices at startup
- Tested device updates in AIO-SX, AIO-DX, standard and DC environments
- Tested upgrade

Story: 2008972
Task: 43420
Depends-On: https://review.opendev.org/c/starlingx/containers/+/810688

Change-Id: I6012189aa716f76a7fe4f8a51f0fbfa27234b8bb
Signed-off-by: Teresa Ho <teresa.ho@windriver.com>
This commit is contained in:
Teresa Ho 2021-09-22 10:14:56 -04:00
parent ce5d15b6f4
commit f4971bb365
5 changed files with 113 additions and 126 deletions

View File

@ -34,10 +34,6 @@ install -p -D -m 644 sysinv-fpga-agent.service %{buildroot}%{_unitdir}/sysinv-fp
install -p -D -m 644 sysinv-conf-watcher.service %{buildroot}%{_unitdir}/sysinv-conf-watcher.service
install -p -D -m 644 sysinv-conf-watcher.path %{buildroot}%{_unitdir}/sysinv-conf-watcher.path
# Workaround to call "docker login" during startup. Called by puppet.
install -d -m 755 %{buildroot}%{_exec_prefix}/local/sbin
install -p -D -m 755 run_docker_login %{buildroot}%{_exec_prefix}/local/sbin/run_docker_login
%post
/usr/bin/systemctl enable sysinv-fpga-agent.service >/dev/null 2>&1
/usr/bin/systemctl enable sysinv-conf-watcher.service >/dev/null 2>&1
@ -54,4 +50,3 @@ rm -rf $RPM_BUILD_ROOT
%{_unitdir}/sysinv-fpga-agent.service
%{_unitdir}/sysinv-conf-watcher.service
%{_unitdir}/sysinv-conf-watcher.path
%{_exec_prefix}/local/sbin/run_docker_login

View File

@ -1,74 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Usage: run_docker_login <username> <password>
#
# Waits until "registry.local" resolves to an IPv4 or IPv6 address, then
# retries "docker login" against registry.local:9001 every 3 seconds until
# it succeeds. On success the flag file /var/run/docker_login_done is
# created. The script never gives up; both loops run until they succeed.
#

DOCKER_USERNAME=$1
DOCKER_PASSWORD=$2

# Quote the paths so a script location containing whitespace is handled.
LNAME=$(readlink -n -f "$0")
NAME=$(basename "$LNAME")

# Log info message to /var/log/daemon.log
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}

# Wait around for the "registry.local" name to resolve
LOG "Waiting for registry.local to resolve"

while true
do
    # We can't easily ask for both A and AAAA records in the same request
    # because if the customer mis-configures things with a "good" nameserver
    # and a non-existent nameserver dig will return "9" even though it finds
    # an AAAA record on the "good" server. So we need to ask for A and AAAA
    # records separately. Once we have either type of record we can proceed.

    # First check for A records for IPv4.
    # A zero exit status only means the server answered; we still need to
    # check that an address came back. If there is no address, ADDR will
    # be an empty string.
    if ADDR=$(dig +short registry.local A) && [ -n "$ADDR" ]
    then
        LOG "registry.local resolved IPv4, continuing with docker login"
        break
    fi

    # Then check for AAAA records for IPv6, with the same two-step check.
    if ADDR=$(dig +short registry.local AAAA) && [ -n "$ADDR" ]
    then
        LOG "registry.local resolved IPv6, continuing with docker login"
        break
    fi

    sleep 1
done

while true
do
    # The username and password are quoted so that credentials containing
    # whitespace or glob characters reach docker unmodified. The password is
    # fed on stdin so it does not appear on the command line.
    res=$(docker login --password-stdin -u "${DOCKER_USERNAME}" registry.local:9001 2>&1 <<< "${DOCKER_PASSWORD}")
    rc=$?
    if [ ${rc} -eq 0 ]
    then
        LOG "docker login to registry.local completed successfully"
        touch /var/run/docker_login_done
        break
    else
        LOG "docker login error ${rc} ${res}"
        sleep 3
    fi
done

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -28,13 +28,11 @@ N3000_DEVICES = [
# TODO: Make this specified in the config file.
# This is the docker image containing the OPAE tools to access the FPGA device.
OPAE_IMG_PREV = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.6.0-v1.0.1"
# This is a flag file created by puppet after doing a "docker login".
# We need to wait for it to exist before trying to run docker images.
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")
N3000_RESET_TIMEOUT = 600
# This flag is set if the N3000 requires a second reset
N3000_RETIMER_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".sysinv_n3000_retimer")

View File

@ -17,7 +17,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
@ -97,12 +97,17 @@ BMC_FW_VER_PATH = "bmcfw_flash_ctrl/bmcfw_version"
BMC_BUILD_VER_PATH = "max10_version"
def wait_for_docker_login():
# TODO: add a timeout
LOG.info("Waiting for docker login flag.")
while not os.path.exists(constants.DOCKER_LOGIN_FLAG):
def wait_for_n3000_reset():
LOG.info("Waiting for n3000 reset flag.")
timeout = 0
while not os.path.exists(constants.N3000_RESET_FLAG):
if timeout > constants.N3000_RESET_TIMEOUT:
msg = ("Timeout waiting for n3000 reset flag")
LOG.info(msg)
return
time.sleep(1)
LOG.info("Found docker login flag, continuing.")
timeout += 1
LOG.info("Found n3000 reset flag, continuing.")
def ensure_device_image_cache_exists():
@ -151,31 +156,69 @@ def fetch_device_image(filename):
return local_path
def cleanup_container():
    """Remove the ``n3000-opae`` container if one is still listed.

    Asks ``ctr`` (namespace ``k8s.io``) for containers created from
    ``constants.OPAE_IMG``. If any line of the listing mentions that image,
    the container named ``n3000-opae`` is removed and the removal is logged.
    Nothing happens when the listing does not mention the image.

    Raises:
        subprocess.CalledProcessError: if either ``ctr`` invocation exits
            with a non-zero status.
    """
    list_cmd = 'ctr -n=k8s.io container list image=="%s"' % constants.OPAE_IMG
    listing = subprocess.check_output(shlex.split(list_cmd),  # pylint: disable=not-callable
                                      stderr=subprocess.STDOUT,
                                      universal_newlines=True)

    # Guard clause: no listed line references the image, nothing to delete.
    if not any(constants.OPAE_IMG in entry for entry in listing.splitlines()):
        return

    remove_cmd = 'ctr -n=k8s.io container rm n3000-opae'
    subprocess.check_output(shlex.split(remove_cmd),  # pylint: disable=not-callable
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    LOG.info('Deleted stale container n3000-opae')
def set_cgroup_cpuset():
    """Write the platform CPU list into the ``platform`` cpuset cgroup.

    Creates ``/sys/fs/cgroup/cpuset/platform/`` when it does not exist, reads
    ``PLATFORM_CPU_LIST`` from ``/etc/platform/worker_reserved.conf`` and
    writes the resulting value to ``cpuset.cpus`` in that directory. The
    value defaults to ``'0'`` when no line of the config file contains
    ``PLATFORM_CPU_LIST``; if several lines match, the last one wins.
    """
    cgroup_dir = '/sys/fs/cgroup/cpuset/platform/'
    cpus_file = os.path.join(cgroup_dir, 'cpuset.cpus')
    if not os.path.exists(cgroup_dir):
        os.makedirs(cgroup_dir)

    cpulist = '0'
    with open('/etc/platform/worker_reserved.conf', 'r') as conf:
        for entry in conf:
            if "PLATFORM_CPU_LIST" not in entry:
                continue
            # Take the text after the first '=', drop the trailing newline,
            # drop the first and last characters of what remains, then
            # strip any double quotes still left at either end.
            raw_value = entry.split("=")[1]
            cpulist = raw_value.strip('\n')[1:-1].strip('"')

    with open(cpus_file, 'w') as out:
        LOG.info("Writing %s to file %s" % (cpulist, cpus_file))
        out.write(cpulist)
def write_device_image_n3000(filename, pci_addr):
# Write the firmware image to the FPGA at the specified PCI address.
# We're assuming that the image update tools will catch the scenario
# where the image is not compatible with the device.
# If the container exists, the host probably rebooted during
# a device update. Delete the container.
cleanup_container()
# Set cpu affinity for the container
set_cgroup_cpuset()
try:
# Build up the command to perform the firmware update.
# Note the hack to work around OPAE tool locale issues
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
"-e LANG=en_US.UTF-8 -v " + DEVICE_IMAGE_CACHE_DIR +
":" + "/mnt/images " + constants.OPAE_IMG +
" fpgasupdate -y --log-level debug /mnt/images/" +
cmd = ("ctr -n=k8s.io run --rm --privileged " +
"--env LC_ALL=en_US.UTF-8 --env LANG=en_US.UTF-8 " +
"--cgroup platform " +
"--mount type=bind,src=" + DEVICE_IMAGE_CACHE_DIR +
",dst=/mnt/images,options=rbind:ro " + constants.OPAE_IMG +
" n3000-opae fpgasupdate -y --log-level debug /mnt/images/" +
filename + " " + pci_addr)
# Issue the command to perform the firmware update.
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT)
stderr=subprocess.STDOUT)
# TODO: switch to subprocess.Popen, parse the output and send
# progress updates.
except subprocess.CalledProcessError as exc:
# Check the return code, send completion info to sysinv-conductor.
# "docker run" return code will be:
# 125 if the error is with Docker daemon itself
# 126 if the contained command cannot be invoked
# 127 if the contained command cannot be found
# Exit code of contained command otherwise
msg = ("Failed to update device image %s for device %s, "
"return code is %d, command output: %s." %
(filename, pci_addr, exc.returncode,
@ -417,8 +460,8 @@ class FpgaAgentManager(service.PeriodicService):
LOG.info('No config file for sysinv-fpga-agent found.')
raise exception.ConfigNotFound(message="Unable to find sysinv config file!")
# Wait for puppet to log in to the local docker registry
wait_for_docker_login()
# Wait for puppet to finish resetting n3000 devices
wait_for_n3000_reset()
# Wait around until someone else updates the platform.conf file
# with our host UUID.
self.wait_for_host_uuid()

View File

@ -45,38 +45,63 @@ EEPROM_UPDATE_SUCCESS = '0x1111'
def n3000_img_accessible():
cmd = 'docker image list "%s" --format "{{.Repository}}:{{.Tag}}"' % \
constants.OPAE_IMG
cmd = 'ctr -n=k8s.io image list name=="%s"' % constants.OPAE_IMG
items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT,
universal_newlines=True)
stderr=subprocess.STDOUT,
universal_newlines=True)
for line in items.splitlines():
if line == constants.OPAE_IMG:
if constants.OPAE_IMG in line:
LOG.info('%s image found' % constants.OPAE_IMG)
return True
return constants.OPAE_IMG
LOG.info('%s image not found, check older image' %
constants.OPAE_IMG)
# During upgrade. check if previous version is available
cmd = 'ctr -n=k8s.io image list name=="%s"' % constants.OPAE_IMG_PREV
items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT,
universal_newlines=True)
for line in items.splitlines():
if constants.OPAE_IMG_PREV in line:
LOG.info('%s image found' % constants.OPAE_IMG_PREV)
return constants.OPAE_IMG_PREV
LOG.info('%s image not found, try image pull from controller' %
constants.OPAE_IMG_PREV)
LOG.info("%s image not found." % constants.OPAE_IMG)
return False
# n3000 image not found in containerd, get it from the controller
try:
subprocess.check_output(["crictl", "pull", constants.OPAE_IMG]) # pylint: disable=not-callable
LOG.info("Image %s imported by containerd" % constants.OPAE_IMG)
return constants.OPAE_IMG
except subprocess.CalledProcessError as exc:
msg = ("Failed to pull image %s, "
"return code is %d, command output: %s." %
(constants.OPAE_IMG, exc.returncode, exc.output))
LOG.info(msg)
# During upgrade the current version is not available,
# try pulling the previous version
try:
subprocess.check_output(["crictl", "pull", constants.OPAE_IMG_PREV]) # pylint: disable=not-callable
LOG.info("Image %s imported by containerd" % constants.OPAE_IMG_PREV)
return constants.OPAE_IMG_PREV
except subprocess.CalledProcessError as exc:
msg = ("Failed to pull image %s, "
"return code is %d, command output: %s." %
(constants.OPAE_IMG_PREV, exc.returncode, exc.output))
LOG.info(msg)
return None
def reset_device_n3000(pci_addr):
def reset_device_n3000(pci_addr, opae_img):
# Reset the N3000 FPGA at the specified PCI address.
try:
# Build up the command to perform the reset.
# Note the hack to work around OPAE tool locale issues
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
"-e LANG=en_US.UTF-8 " + constants.OPAE_IMG +
" rsu bmcimg " + pci_addr)
cmd = ("ctr -n=k8s.io run --rm --privileged " + opae_img +
" v1 rsu bmcimg " + pci_addr)
# Issue the command to perform the firmware update.
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT)
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exc:
# "docker run" return code will be:
# 125 if the error is with Docker daemon itself
# 126 if the contained command cannot be invoked
# 127 if the contained command cannot be found
# Exit code of contained command otherwise
msg = ("Failed to reset device %s, "
"return code is %d, command output: %s." %
(pci_addr, exc.returncode,
@ -139,14 +164,14 @@ def reset_n3000_fpgas():
LOG.info("Resetting N3000 FPGAs.")
got_exception = False
fpga_addrs = get_n3000_devices()
if not n3000_img_accessible() and \
not os.path.exists(constants.DOCKER_LOGIN_FLAG):
LOG.info("Either docker image or docker login is ready, exit...")
opae_img = n3000_img_accessible()
if opae_img is None:
LOG.info("n3000 opae image is not ready, exit...")
return False
for fpga_addr in fpga_addrs:
try:
reset_device_n3000(fpga_addr)
reset_device_n3000(fpga_addr, opae_img)
except Exception:
got_exception = True
@ -158,7 +183,7 @@ def reset_n3000_fpgas():
LOG.info("Updating retimer")
update_device_n3000_retimer(fpga_addr)
LOG.info("Resetting N3000 second time")
reset_device_n3000(fpga_addr)
reset_device_n3000(fpga_addr, opae_img)
except Exception:
got_exception = True