Merge "Remove the use of docker for FPGA tools"
This commit is contained in:
commit
1eebb444ff
|
@ -34,10 +34,6 @@ install -p -D -m 644 sysinv-fpga-agent.service %{buildroot}%{_unitdir}/sysinv-fp
|
|||
install -p -D -m 644 sysinv-conf-watcher.service %{buildroot}%{_unitdir}/sysinv-conf-watcher.service
|
||||
install -p -D -m 644 sysinv-conf-watcher.path %{buildroot}%{_unitdir}/sysinv-conf-watcher.path
|
||||
|
||||
# Workaround to call "docker login" during startup. Called by puppet.
|
||||
install -d -m 755 %{buildroot}%{_exec_prefix}/local/sbin
|
||||
install -p -D -m 755 run_docker_login %{buildroot}%{_exec_prefix}/local/sbin/run_docker_login
|
||||
|
||||
%post
|
||||
/usr/bin/systemctl enable sysinv-fpga-agent.service >/dev/null 2>&1
|
||||
/usr/bin/systemctl enable sysinv-conf-watcher.service >/dev/null 2>&1
|
||||
|
@ -54,4 +50,3 @@ rm -rf $RPM_BUILD_ROOT
|
|||
%{_unitdir}/sysinv-fpga-agent.service
|
||||
%{_unitdir}/sysinv-conf-watcher.service
|
||||
%{_unitdir}/sysinv-conf-watcher.path
|
||||
%{_exec_prefix}/local/sbin/run_docker_login
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
#!/bin/bash
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# Log in to the local docker registry at registry.local:9001.
#
# Arguments:
#   $1  docker user name
#   $2  docker password
#
# The script first waits until registry.local resolves to an IPv4 or IPv6
# address, then retries "docker login" until it succeeds and finally creates
# the flag file /var/run/docker_login_done.
#
# NOTE(review): the password is received as a command-line argument, so it is
# visible in the process list for as long as this script runs. It is passed on
# to docker itself via stdin.

DOCKER_USERNAME=$1
DOCKER_PASSWORD=$2

# Quote "$0" so that a path containing whitespace is not split.
LNAME=$(readlink -n -f "$0")
NAME=$(basename "$LNAME")

# Log info message to /var/log/daemon.log
function LOG {
    logger -p daemon.info -t "${NAME}($$): " "$@"
}

# Wait around for the "registry.local" name to resolve
LOG "Waiting for registry.local to resolve"
while true
do
    # We can't easily ask for both A and AAAA records in the same request
    # because if the customer mis-configures things with a "good" nameserver
    # and a non-existent nameserver dig will return "9" even though it finds
    # an AAAA record on the "good" server. So we need to ask for A and AAAA
    # records separately. Once we have either type of record we can proceed.
    # A records (IPv4) are checked first, then AAAA records (IPv6).
    for rtype in A AAAA
    do
        # A zero exit status only means we got a response back from the
        # server; we still need to check whether we got an address. If there
        # is no address, ADDR will be an empty string.
        if ADDR=$(dig +short registry.local "${rtype}") && [ -n "$ADDR" ]
        then
            if [ "${rtype}" = "A" ]
            then
                family="IPv4"
            else
                family="IPv6"
            fi
            LOG "registry.local resolved ${family}, continuing with docker login"
            # Leave both the record-type loop and the wait loop.
            break 2
        fi
    done

    sleep 1
done

# Retry the login until it succeeds.
while true
do
    # The user name and password are quoted so that whitespace or glob
    # characters in either value reach docker unmodified.
    res=$(docker login --password-stdin -u "${DOCKER_USERNAME}" registry.local:9001 2>&1 <<< "${DOCKER_PASSWORD}")
    rc=$?
    if [ "${rc}" -eq 0 ]
    then
        LOG "docker login to registry.local completed successfully"
        touch /var/run/docker_login_done
        break
    else
        LOG "docker login error ${rc} ${res}"
        sleep 3
    fi
done
|
|
@ -1,5 +1,5 @@
|
|||
#
|
||||
# Copyright (c) 2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
@ -28,13 +28,11 @@ N3000_DEVICES = [
|
|||
|
||||
# TODO: Make this specified in the config file.
|
||||
# This is the docker image containing the OPAE tools to access the FPGA device.
|
||||
OPAE_IMG_PREV = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.4.0-v1.0.0"
|
||||
OPAE_IMG = "registry.local:9001/docker.io/starlingx/n3000-opae:stx.6.0-v1.0.1"
|
||||
|
||||
# This is a flag file created by puppet after doing a "docker login".
|
||||
# We need to wait for it to exist before trying to run docker images.
|
||||
DOCKER_LOGIN_FLAG = "/var/run/docker_login_done"
|
||||
|
||||
N3000_RESET_FLAG = os.path.join(tsc.VOLATILE_PATH, ".sysinv_n3000_reset")
|
||||
N3000_RESET_TIMEOUT = 600
|
||||
|
||||
# This flag is set if the N3000 requires a second reset
|
||||
N3000_RETIMER_FLAG = os.path.join(tsc.PLATFORM_CONF_PATH, ".sysinv_n3000_retimer")
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
#
|
||||
# Copyright (c) 2020 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2021 Wind River Systems, Inc.
|
||||
#
|
||||
|
||||
|
||||
|
@ -97,12 +97,17 @@ BMC_FW_VER_PATH = "bmcfw_flash_ctrl/bmcfw_version"
|
|||
BMC_BUILD_VER_PATH = "max10_version"
|
||||
|
||||
|
||||
def wait_for_docker_login():
|
||||
# TODO: add a timeout
|
||||
LOG.info("Waiting for docker login flag.")
|
||||
while not os.path.exists(constants.DOCKER_LOGIN_FLAG):
|
||||
def wait_for_n3000_reset():
|
||||
LOG.info("Waiting for n3000 reset flag.")
|
||||
timeout = 0
|
||||
while not os.path.exists(constants.N3000_RESET_FLAG):
|
||||
if timeout > constants.N3000_RESET_TIMEOUT:
|
||||
msg = ("Timeout waiting for n3000 reset flag")
|
||||
LOG.info(msg)
|
||||
return
|
||||
time.sleep(1)
|
||||
LOG.info("Found docker login flag, continuing.")
|
||||
timeout += 1
|
||||
LOG.info("Found n3000 reset flag, continuing.")
|
||||
|
||||
|
||||
def ensure_device_image_cache_exists():
|
||||
|
@ -151,31 +156,69 @@ def fetch_device_image(filename):
|
|||
return local_path
|
||||
|
||||
|
||||
def cleanup_container():
    """Remove the ``n3000-opae`` container if one is still registered.

    Lists the containers in the ``k8s.io`` namespace that were created from
    ``constants.OPAE_IMG``. When any line of that listing mentions the image,
    the container named ``n3000-opae`` is removed and the removal is logged.

    Raises:
        subprocess.CalledProcessError: if either ``ctr`` command exits with
            a non-zero status.
    """
    list_cmd = 'ctr -n=k8s.io container list image=="%s"' % constants.OPAE_IMG
    listing = subprocess.check_output(shlex.split(list_cmd),  # pylint: disable=not-callable
                                      stderr=subprocess.STDOUT,
                                      universal_newlines=True)

    # Nothing to do unless the listing references the OPAE image.
    if not any(constants.OPAE_IMG in entry for entry in listing.splitlines()):
        return

    rm_cmd = 'ctr -n=k8s.io container rm n3000-opae'
    subprocess.check_output(shlex.split(rm_cmd),  # pylint: disable=not-callable
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    LOG.info('Deleted stale container n3000-opae')
|
||||
|
||||
|
||||
def set_cgroup_cpuset():
    """Write the platform CPU list to the ``platform`` cpuset cgroup.

    The CPU list is taken from ``/etc/platform/worker_reserved.conf``: for
    each line containing ``PLATFORM_CPU_LIST`` the text after the first
    ``=`` is used, with the newline removed, the first and last characters
    dropped and any surrounding double quotes stripped. The last such line
    wins; when there is none the list defaults to ``'0'``.

    The cgroup directory ``/sys/fs/cgroup/cpuset/platform/`` is created if
    it does not exist, and the list is written to its ``cpuset.cpus`` file.
    """
    cgroup_dir = '/sys/fs/cgroup/cpuset/platform/'
    cpus_file = os.path.join(cgroup_dir, 'cpuset.cpus')
    if not os.path.exists(cgroup_dir):
        os.makedirs(cgroup_dir)

    cpulist = '0'
    with open('/etc/platform/worker_reserved.conf', 'r') as conf:
        for entry in conf:
            if "PLATFORM_CPU_LIST" not in entry:
                continue
            value = entry.split("=")[1]
            # Drop the newline, then the enclosing delimiter characters.
            cpulist = value.strip('\n')[1:-1].strip('"')

    with open(cpus_file, 'w') as out:
        LOG.info("Writing %s to file %s" % (cpulist, cpus_file))
        out.write(cpulist)
|
||||
|
||||
|
||||
def write_device_image_n3000(filename, pci_addr):
|
||||
# Write the firmware image to the FPGA at the specified PCI address.
|
||||
# We're assuming that the image update tools will catch the scenario
|
||||
# where the image is not compatible with the device.
|
||||
|
||||
# If the container exists, the host probably rebooted during
|
||||
# a device update. Delete the container.
|
||||
cleanup_container()
|
||||
|
||||
# Set cpu affinity for the container
|
||||
set_cgroup_cpuset()
|
||||
|
||||
try:
|
||||
# Build up the command to perform the firmware update.
|
||||
# Note the hack to work around OPAE tool locale issues
|
||||
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
|
||||
"-e LANG=en_US.UTF-8 -v " + DEVICE_IMAGE_CACHE_DIR +
|
||||
":" + "/mnt/images " + constants.OPAE_IMG +
|
||||
" fpgasupdate -y --log-level debug /mnt/images/" +
|
||||
cmd = ("ctr -n=k8s.io run --rm --privileged " +
|
||||
"--env LC_ALL=en_US.UTF-8 --env LANG=en_US.UTF-8 " +
|
||||
"--cgroup platform " +
|
||||
"--mount type=bind,src=" + DEVICE_IMAGE_CACHE_DIR +
|
||||
",dst=/mnt/images,options=rbind:ro " + constants.OPAE_IMG +
|
||||
" n3000-opae fpgasupdate -y --log-level debug /mnt/images/" +
|
||||
filename + " " + pci_addr)
|
||||
|
||||
# Issue the command to perform the firmware update.
|
||||
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
|
||||
stderr=subprocess.STDOUT)
|
||||
stderr=subprocess.STDOUT)
|
||||
# TODO: switch to subprocess.Popen, parse the output and send
|
||||
# progress updates.
|
||||
except subprocess.CalledProcessError as exc:
|
||||
# Check the return code, send completion info to sysinv-conductor.
|
||||
# "docker run" return code will be:
|
||||
# 125 if the error is with Docker daemon itself
|
||||
# 126 if the contained command cannot be invoked
|
||||
# 127 if the contained command cannot be found
|
||||
# Exit code of contained command otherwise
|
||||
msg = ("Failed to update device image %s for device %s, "
|
||||
"return code is %d, command output: %s." %
|
||||
(filename, pci_addr, exc.returncode,
|
||||
|
@ -417,8 +460,8 @@ class FpgaAgentManager(service.PeriodicService):
|
|||
LOG.info('No config file for sysinv-fpga-agent found.')
|
||||
raise exception.ConfigNotFound(message="Unable to find sysinv config file!")
|
||||
|
||||
# Wait for puppet to log in to the local docker registry
|
||||
wait_for_docker_login()
|
||||
# Wait for puppet to finish resetting n3000 devices
|
||||
wait_for_n3000_reset()
|
||||
# Wait around until someone else updates the platform.conf file
|
||||
# with our host UUID.
|
||||
self.wait_for_host_uuid()
|
||||
|
|
|
@ -45,38 +45,63 @@ EEPROM_UPDATE_SUCCESS = '0x1111'
|
|||
|
||||
|
||||
def n3000_img_accessible():
|
||||
cmd = 'docker image list "%s" --format "{{.Repository}}:{{.Tag}}"' % \
|
||||
constants.OPAE_IMG
|
||||
cmd = 'ctr -n=k8s.io image list name=="%s"' % constants.OPAE_IMG
|
||||
items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True)
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True)
|
||||
for line in items.splitlines():
|
||||
if line == constants.OPAE_IMG:
|
||||
if constants.OPAE_IMG in line:
|
||||
LOG.info('%s image found' % constants.OPAE_IMG)
|
||||
return True
|
||||
return constants.OPAE_IMG
|
||||
LOG.info('%s image not found, check older image' %
|
||||
constants.OPAE_IMG)
|
||||
# During an upgrade, check whether the previous image version is available
|
||||
cmd = 'ctr -n=k8s.io image list name=="%s"' % constants.OPAE_IMG_PREV
|
||||
items = subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
|
||||
stderr=subprocess.STDOUT,
|
||||
universal_newlines=True)
|
||||
for line in items.splitlines():
|
||||
if constants.OPAE_IMG_PREV in line:
|
||||
LOG.info('%s image found' % constants.OPAE_IMG_PREV)
|
||||
return constants.OPAE_IMG_PREV
|
||||
LOG.info('%s image not found, try image pull from controller' %
|
||||
constants.OPAE_IMG_PREV)
|
||||
|
||||
LOG.info("%s image not found." % constants.OPAE_IMG)
|
||||
return False
|
||||
# n3000 image not found in containerd, get it from the controller
|
||||
try:
|
||||
subprocess.check_output(["crictl", "pull", constants.OPAE_IMG]) # pylint: disable=not-callable
|
||||
LOG.info("Image %s imported by containerd" % constants.OPAE_IMG)
|
||||
return constants.OPAE_IMG
|
||||
except subprocess.CalledProcessError as exc:
|
||||
msg = ("Failed to pull image %s, "
|
||||
"return code is %d, command output: %s." %
|
||||
(constants.OPAE_IMG, exc.returncode, exc.output))
|
||||
LOG.info(msg)
|
||||
# During upgrade the current version is not available,
|
||||
# try pulling the previous version
|
||||
try:
|
||||
subprocess.check_output(["crictl", "pull", constants.OPAE_IMG_PREV]) # pylint: disable=not-callable
|
||||
LOG.info("Image %s imported by containerd" % constants.OPAE_IMG_PREV)
|
||||
return constants.OPAE_IMG_PREV
|
||||
except subprocess.CalledProcessError as exc:
|
||||
msg = ("Failed to pull image %s, "
|
||||
"return code is %d, command output: %s." %
|
||||
(constants.OPAE_IMG_PREV, exc.returncode, exc.output))
|
||||
LOG.info(msg)
|
||||
return None
|
||||
|
||||
|
||||
def reset_device_n3000(pci_addr):
|
||||
def reset_device_n3000(pci_addr, opae_img):
|
||||
# Reset the N3000 FPGA at the specified PCI address.
|
||||
try:
|
||||
# Build up the command to perform the reset.
|
||||
# Note the hack to work around OPAE tool locale issues
|
||||
cmd = ("docker run -t --privileged -e LC_ALL=en_US.UTF-8 "
|
||||
"-e LANG=en_US.UTF-8 " + constants.OPAE_IMG +
|
||||
" rsu bmcimg " + pci_addr)
|
||||
|
||||
cmd = ("ctr -n=k8s.io run --rm --privileged " + opae_img +
|
||||
" v1 rsu bmcimg " + pci_addr)
|
||||
# Issue the command to perform the reset.
|
||||
subprocess.check_output(shlex.split(cmd), # pylint: disable=not-callable
|
||||
stderr=subprocess.STDOUT)
|
||||
stderr=subprocess.STDOUT)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
# "docker run" return code will be:
|
||||
# 125 if the error is with Docker daemon itself
|
||||
# 126 if the contained command cannot be invoked
|
||||
# 127 if the contained command cannot be found
|
||||
# Exit code of contained command otherwise
|
||||
msg = ("Failed to reset device %s, "
|
||||
"return code is %d, command output: %s." %
|
||||
(pci_addr, exc.returncode,
|
||||
|
@ -139,14 +164,14 @@ def reset_n3000_fpgas():
|
|||
LOG.info("Resetting N3000 FPGAs.")
|
||||
got_exception = False
|
||||
fpga_addrs = get_n3000_devices()
|
||||
if not n3000_img_accessible() and \
|
||||
not os.path.exists(constants.DOCKER_LOGIN_FLAG):
|
||||
LOG.info("Either docker image or docker login is ready, exit...")
|
||||
opae_img = n3000_img_accessible()
|
||||
if opae_img is None:
|
||||
LOG.info("n3000 opae image is not ready, exit...")
|
||||
return False
|
||||
|
||||
for fpga_addr in fpga_addrs:
|
||||
try:
|
||||
reset_device_n3000(fpga_addr)
|
||||
reset_device_n3000(fpga_addr, opae_img)
|
||||
except Exception:
|
||||
got_exception = True
|
||||
|
||||
|
@ -158,7 +183,7 @@ def reset_n3000_fpgas():
|
|||
LOG.info("Updating retimer")
|
||||
update_device_n3000_retimer(fpga_addr)
|
||||
LOG.info("Resetting N3000 second time")
|
||||
reset_device_n3000(fpga_addr)
|
||||
reset_device_n3000(fpga_addr, opae_img)
|
||||
except Exception:
|
||||
got_exception = True
|
||||
|
||||
|
|
Loading…
Reference in New Issue