Merge "Remove the use of docker for FPGA tools"

This commit is contained in:
Zuul 2021-10-07 14:21:32 +00:00 committed by Gerrit Code Review
commit 1eebb444ff
5 changed files with 113 additions and 126 deletions

View File

@ -34,10 +34,6 @@ install -p -D -m 644 sysinv-fpga-agent.service %{buildroot}%{_unitdir}/sysinv-fp
install -p -D -m 644 sysinv-conf-watcher.service %{buildroot}%{_unitdir}/sysinv-conf-watcher.service
install -p -D -m 644 sysinv-conf-watcher.path %{buildroot}%{_unitdir}/sysinv-conf-watcher.path
# Workaround to call "docker login" during startup. Called by puppet.
install -d -m 755 %{buildroot}%{_exec_prefix}/local/sbin
install -p -D -m 755 run_docker_login %{buildroot}%{_exec_prefix}/local/sbin/run_docker_login
%post
/usr/bin/systemctl enable sysinv-fpga-agent.service >/dev/null 2>&1
/usr/bin/systemctl enable sysinv-conf-watcher.service >/dev/null 2>&1
@ -54,4 +50,3 @@ rm -rf $RPM_BUILD_ROOT
%{_unitdir}/sysinv-fpga-agent.service
%{_unitdir}/sysinv-conf-watcher.service
%{_unitdir}/sysinv-conf-watcher.path
%{_exec_prefix}/local/sbin/run_docker_login

View File

@ -1,74 +0,0 @@
#!/bin/bash
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Wait for registry.local to resolve, then retry "docker login" against
# registry.local:9001 until it succeeds. On success the flag file
# /var/run/docker_login_done is created.
#
# Usage: run_docker_login <username> <password>
#

DOCKER_USERNAME=$1
DOCKER_PASSWORD=$2

# Quote the paths so an install location containing whitespace still works.
LNAME=$(readlink -n -f "$0")
NAME=$(basename "$LNAME")

# Log info message to /var/log/daemon.log
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}

# Succeed when dig exits cleanly AND returns at least one record of the
# requested type ($1: A or AAAA) for registry.local.
function registry_resolves {
    local record_type=$1
    local addr

    # Declared separately from the assignment so that the exit status
    # tested here is dig's and not that of "local".
    addr=$(dig +short registry.local "${record_type}") || return 1

    # We got a response back from the server, but we need to check
    # if we got an address or not. If there is no address, addr will
    # be an empty string.
    [ -n "${addr}" ]
}

# Wait around for the "registry.local" name to resolve
LOG "Waiting for registry.local to resolve"
while true
do
    # We can't easily ask for both A and AAAA records in the same request
    # because if the customer mis-configures things with a "good" nameserver
    # and a non-existent nameserver dig will return "9" even though it finds
    # an AAAA record on the "good" server. So we need to ask for A and AAAA
    # records separately. Once we have either type of record we can proceed.

    # First check for A records for IPv4
    if registry_resolves A
    then
        LOG "registry.local resolved IPv4, continuing with docker login"
        break
    fi

    # Then check for AAAA records for IPv6
    if registry_resolves AAAA
    then
        LOG "registry.local resolved IPv6, continuing with docker login"
        break
    fi

    sleep 1
done

# Retry the login every 3 seconds until it succeeds.
while true
do
    # Quote the credentials so whitespace or glob characters in them cannot
    # be split or expanded by the shell. The password is fed on stdin so it
    # never appears on the docker command line.
    res=$(docker login --password-stdin -u "${DOCKER_USERNAME}" registry.local:9001 2>&1 <<< "${DOCKER_PASSWORD}")
    rc=$?
    if [ ${rc} -eq 0 ]
    then
        LOG "docker login to registry.local completed successfully"
        touch /var/run/docker_login_done
        break
    else
        LOG "docker login error ${rc} ${res}"
        sleep 3
    fi
done

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -28,13 +28,11 @@ N3000_DEVICES = [
# TODO: Make this specified in the config file.
# This is the docker image containing the OPAE tools to access the FPGA device.
OPAE_IMG_PREV = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.6.0-v1.0.1"
# This is a flag file created by puppet after doing a "docker login".
# We need to wait for it to exist before trying to run docker images.
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")
N3000_RESET_TIMEOUT = 600
# This flag is set if the N3000 requires a second reset
N3000_RETIMER_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".sysinv_n3000_retimer")

View File

@ -17,7 +17,7 @@
# License for the specific language governing permissions and limitations
# under the License.
#
# Copyright (c) 2020 Wind River Systems, Inc.
# Copyright (c) 2020-2021 Wind River Systems, Inc.
#
@ -97,12 +97,17 @@ BMC_FW_VER_PATH = "bmcfw_flash_ctrl/bmcfw_version"
BMC_BUILD_VER_PATH = "max10_version"
def wait_for_docker_login():
    """Block until the docker login flag file exists.

    Polls once per second for constants.DOCKER_LOGIN_FLAG and returns as
    soon as the path exists. There is no upper bound on the wait.
    """
    # TODO: add a timeout
    LOG.info("Waiting for docker login flag.")
    while not os.path.exists(constants.DOCKER_LOGIN_FLAG):
        time.sleep(1)
    LOG.info("Found docker login flag, continuing.")


def wait_for_n3000_reset():
    """Wait for the n3000 reset flag file, giving up after a timeout.

    Polls once per second for constants.N3000_RESET_FLAG. The wait is
    bounded by counting one-second sleeps against
    constants.N3000_RESET_TIMEOUT; on expiry a message is logged and the
    function returns without raising, so the caller continues either way.
    """
    LOG.info("Waiting for n3000 reset flag.")
    timeout = 0
    while not os.path.exists(constants.N3000_RESET_FLAG):
        if timeout > constants.N3000_RESET_TIMEOUT:
            msg = ("Timeout waiting for n3000 reset flag")
            LOG.info(msg)
            return
        time.sleep(1)
        timeout += 1
    LOG.info("Found n3000 reset flag, continuing.")
def ensure_device_image_cache_exists():
@ -151,31 +156,69 @@ def fetch_device_image(filename):
return local_path
def cleanup_container():
    """Delete the n3000-opae container if containerd still lists one.

    Lists the containers in the k8s.io namespace filtered on
    constants.OPAE_IMG; when any line of the listing mentions that image,
    the container named n3000-opae is removed and the removal is logged.

    Raises:
        subprocess.CalledProcessError: if either ctr command exits non-zero.
    """
    list_cmd = 'ctr -n=k8s.io container list image=="%s"' % constants.OPAE_IMG
    listing = subprocess.check_output(shlex.split(list_cmd),  # pylint: disable=not-callable
                                      stderr=subprocess.STDOUT,
                                      universal_newlines=True)

    # Nothing to clean up unless the listing references the OPAE image.
    if not any(constants.OPAE_IMG in entry for entry in listing.splitlines()):
        return

    rm_cmd = 'ctr -n=k8s.io container rm n3000-opae'
    subprocess.check_output(shlex.split(rm_cmd),  # pylint: disable=not-callable
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    LOG.info('Deleted stale container n3000-opae')
def _parse_platform_cpulist(lines, default='0'):
    """Return the PLATFORM_CPU_LIST value found in an iterable of lines.

    Only lines of the form ``PLATFORM_CPU_LIST=<value>`` are considered, so
    comments or prose that merely mention the name are ignored. Surrounding
    whitespace and quote characters are stripped from the value. When the
    key appears more than once the last non-empty value wins; when it never
    appears (or is always empty) ``default`` is returned.
    """
    cpulist = default
    for line in lines:
        key, sep, value = line.partition('=')
        if not sep or key.strip() != 'PLATFORM_CPU_LIST':
            continue
        value = value.strip().strip('"\'')
        if value:
            cpulist = value
    return cpulist


def set_cgroup_cpuset():
    """Set CPU affinity by updating cpuset.cpus of the platform cgroup.

    Reads PLATFORM_CPU_LIST from /etc/platform/worker_reserved.conf
    (falling back to '0' when no usable value is present) and writes it to
    /sys/fs/cgroup/cpuset/platform/cpuset.cpus, creating the cgroup
    directory first if it does not exist.
    """
    cpuset_path = '/sys/fs/cgroup/cpuset/platform/'
    cpuset_file = os.path.join(cpuset_path, 'cpuset.cpus')

    if not os.path.exists(cpuset_path):
        os.makedirs(cpuset_path)

    with open('/etc/platform/worker_reserved.conf', 'r') as infile:
        platform_cpulist = _parse_platform_cpulist(infile)

    with open(cpuset_file, 'w') as fd:
        LOG.info("Writing %s to file %s" % (platform_cpulist, cpuset_file))
        fd.write(platform_cpulist)
def write_device_image_n3000(filename, pci_addr):
# Write the firmware image to the FPGA at the specified PCI address.
# We're assuming that the image update tools will catch the scenario
# where the image is not compatible with the device.
# If the container exists, the host probably rebooted during
# a device update. Delete the container.
cleanup_container()
# Set cpu affinity for the container
set_cgroup_cpuset()
try:
# Build up the command to perform the firmware update.
# Note the hack to work around OPAE tool locale issues
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
"-e LANG=en_US.UTF-8 -v " + DEVICE_IMAGE_CACHE_DIR +
":" + "/mnt/images " + constants.OPAE_IMG +
" fpgasupdate -y --log-level debug /mnt/images/" +
cmd = ("ctr -n=k8s.io run --rm --privileged " +
"--env LC_ALL=en_US.UTF-8 --env LANG=en_US.UTF-8 " +
"--cgroup platform " +
"--mount type=bind,src=" + DEVICE_IMAGE_CACHE_DIR +
",dst=/mnt/images,options=rbind:ro " + constants.OPAE_IMG +
" n3000-opae fpgasupdate -y --log-level debug /mnt/images/" +
filename + " " + pci_addr)
# Issue the command to perform the firmware update.
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT)
stderr=subprocess.STDOUT)
# TODO: switch to subprocess.Popen, parse the output and send
# progress updates.
except subprocess.CalledProcessError as exc:
# Check the return code, send completion info to sysinv-conductor.
# "docker run" return code will be:
# 125 if the error is with Docker daemon itself
# 126 if the contained command cannot be invoked
# 127 if the contained command cannot be found
# Exit code of contained command otherwise
msg = ("Failed to update device image %s for device %s, "
"return code is %d, command output: %s." %
(filename, pci_addr, exc.returncode,
@ -417,8 +460,8 @@ class FpgaAgentManager(service.PeriodicService):
LOG.info('No config file for sysinv-fpga-agent found.')
raise exception.ConfigNotFound(message="Unable to find sysinv config file!")
# Wait for puppet to log in to the local docker registry
wait_for_docker_login()
# Wait for puppet to finish resetting n3000 devices
wait_for_n3000_reset()
# Wait around until someone else updates the platform.conf file
# with our host UUID.
self.wait_for_host_uuid()

View File

@ -45,38 +45,63 @@ EEPROM_UPDATE_SUCCESS = '0x1111'
def _n3000_img_in_containerd(image):
    """Return True when `ctr image list` (k8s.io namespace) lists image.

    Raises:
        subprocess.CalledProcessError: if the ctr command exits non-zero.
    """
    cmd = 'ctr -n=k8s.io image list name=="%s"' % image
    items = subprocess.check_output(shlex.split(cmd),  # pylint: disable=not-callable
                                    stderr=subprocess.STDOUT,
                                    universal_newlines=True)
    return any(image in line for line in items.splitlines())


def _n3000_img_pull(image):
    """Try to pull image with crictl; return True on success, else False.

    A failed pull is logged (return code and command output) and reported
    through the return value rather than raised.
    """
    try:
        subprocess.check_output(["crictl", "pull", image])  # pylint: disable=not-callable
    except subprocess.CalledProcessError as exc:
        msg = ("Failed to pull image %s, "
               "return code is %d, command output: %s." %
               (image, exc.returncode, exc.output))
        LOG.info(msg)
        return False
    LOG.info("Image %s imported by containerd" % image)
    return True


def n3000_img_accessible():
    """Return the name of a usable n3000 OPAE image, or None.

    The lookup order is:
      1. constants.OPAE_IMG already present in containerd
      2. constants.OPAE_IMG_PREV already present in containerd
         (during upgrade, check if the previous version is available)
      3. pull constants.OPAE_IMG
      4. pull constants.OPAE_IMG_PREV
         (during upgrade the current version is not available)

    Returns:
        The image reference to run (the caller passes it on to
        reset_device_n3000), or None when no image could be found or pulled.

    Raises:
        subprocess.CalledProcessError: if listing images with ctr fails.
    """
    if _n3000_img_in_containerd(constants.OPAE_IMG):
        LOG.info('%s image found' % constants.OPAE_IMG)
        return constants.OPAE_IMG
    LOG.info('%s image not found, check older image' %
             constants.OPAE_IMG)

    if _n3000_img_in_containerd(constants.OPAE_IMG_PREV):
        LOG.info('%s image found' % constants.OPAE_IMG_PREV)
        return constants.OPAE_IMG_PREV
    LOG.info('%s image not found, try image pull from controller' %
             constants.OPAE_IMG_PREV)

    # n3000 image not found in containerd, get it from the controller:
    # current version first, then the previous one.
    for image in (constants.OPAE_IMG, constants.OPAE_IMG_PREV):
        if _n3000_img_pull(image):
            return image

    return None
def reset_device_n3000(pci_addr):
def reset_device_n3000(pci_addr, opae_img):
# Reset the N3000 FPGA at the specified PCI address.
try:
# Build up the command to perform the reset.
# Note the hack to work around OPAE tool locale issues
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
"-e LANG=en_US.UTF-8 " + constants.OPAE_IMG +
" rsu bmcimg " + pci_addr)
cmd = ("ctr -n=k8s.io run --rm --privileged " + opae_img +
" v1 rsu bmcimg " + pci_addr)
# Issue the command to perform the firmware update.
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
stderr=subprocess.STDOUT)
stderr=subprocess.STDOUT)
except subprocess.CalledProcessError as exc:
# "docker run" return code will be:
# 125 if the error is with Docker daemon itself
# 126 if the contained command cannot be invoked
# 127 if the contained command cannot be found
# Exit code of contained command otherwise
msg = ("Failed to reset device %s, "
"return code is %d, command output: %s." %
(pci_addr, exc.returncode,
@ -139,14 +164,14 @@ def reset_n3000_fpgas():
LOG.info("Resetting N3000 FPGAs.")
got_exception = False
fpga_addrs = get_n3000_devices()
if not n3000_img_accessible() and \
not os.path.exists(constants.DOCKER_LOGIN_FLAG):
LOG.info("Either docker image or docker login is ready, exit...")
opae_img = n3000_img_accessible()
if opae_img is None:
LOG.info("n3000 opae image is not ready, exit...")
return False
for fpga_addr in fpga_addrs:
try:
reset_device_n3000(fpga_addr)
reset_device_n3000(fpga_addr, opae_img)
except Exception:
got_exception = True
@ -158,7 +183,7 @@ def reset_n3000_fpgas():
LOG.info("Updating retimer")
update_device_n3000_retimer(fpga_addr)
LOG.info("Resetting N3000 second time")
reset_device_n3000(fpga_addr)
reset_device_n3000(fpga_addr, opae_img)
except Exception:
got_exception = True