Add nvidia firstboot yaml to tripleo-heat-template
This's a temporary workaround for upgrading Nvidia mellanox NICs FW and configuring them using mstflint tool Change-Id: I40166051c10876ee9b865135f219cf0dff6d4c18
This commit is contained in:
parent
4403f2182b
commit
f78ba117a8
743
firstboot/nvidia_firstboot.yaml
Normal file
743
firstboot/nvidia_firstboot.yaml
Normal file
@ -0,0 +1,743 @@
|
||||
heat_template_version: wallaby
|
||||
|
||||
description: >
|
||||
This's a temporary workaround for upgrading Nvidia mellanox NICs
|
||||
FW and configuring them using mstflint tool
|
||||
|
||||
parameters:
|
||||
BIN_DIR_URL:
|
||||
type: string
|
||||
default: ''
|
||||
description: 'URL of a directory containing Mellanox NIC Firmware'
|
||||
FORCE_UPDATE:
|
||||
type: boolean
|
||||
default: False
|
||||
description: "Force update the fw even if it's an older version"
|
||||
DEV_WHITE_LIST:
|
||||
type: comma_delimited_list
|
||||
default: []
|
||||
description: list of MLNX devices PCIs to be processed.
|
||||
If the value is empty, all MLNX devices will be processed.
|
||||
Example, ['0000:04:00.0', '0000:81:00.0']
|
||||
Make sure to choose only the PCI ends with 0
|
||||
NUM_OF_VFS:
|
||||
type: number
|
||||
default: 32
|
||||
description: 'Max number of vfs'
|
||||
SRIOV_EN:
|
||||
type: boolean
|
||||
default: True
|
||||
description: 'Enable/Disable Sriov'
|
||||
LINK_TYPE:
|
||||
type: string
|
||||
default: 'eth'
|
||||
description: 'Link type ["eth", "ib"]'
|
||||
ESWITCH_IPV4_TTL_MODIFY_ENABLE:
|
||||
type: boolean
|
||||
default: False
|
||||
description: 'Enable TTL modification by E-Switch'
|
||||
PRIO_TAG_REQUIRED_EN:
|
||||
type: boolean
|
||||
default: False
|
||||
description: 'Priority tag required'
|
||||
ESWITCH_HAIRPIN_TOT_BUFFER_SIZE:
|
||||
type: json
|
||||
default: {"*": "17"}
|
||||
description: >
|
||||
If a single key of "*" is provided, then its value will set to all indexes.
|
||||
If you need to set configuration for a set of specific indexes, you can pass the
|
||||
value as below for index 2 to be 17 and index 3 to be 16
|
||||
Example, {"2": "17", "3": "16"}
|
||||
Make sure to choose only the PCI ends with 0
|
||||
ESWITCH_HAIRPIN_DESCRIPTORS:
|
||||
type: json
|
||||
default: {"*": "11"}
|
||||
description: >
|
||||
If a single key of "*" is provided, then its value will set to all indexes.
|
||||
If you need to set configuration for a set of specific indexes, you can pass the
|
||||
value as below for index 2 to be 17 and index 3 to be 16
|
||||
Example, {"2": "17", "3": "16"}
|
||||
Make sure to choose only the PCI ends with 0
|
||||
BF_NIC_MODE:
|
||||
type: boolean
|
||||
default: False
|
||||
description: >
|
||||
A special parameter for BlueField SmartNICs, if the value is False, that means the
|
||||
NIC in smart nic mode and it's going to set the following config:
|
||||
"INTERNAL_CPU_PAGE_SUPPLIER": "ECPF",
|
||||
"INTERNAL_CPU_ESWITCH_MANAGER": "ECPF",
|
||||
"INTERNAL_CPU_IB_VPORT0": "ECPF",
|
||||
"INTERNAL_CPU_OFFLOAD_ENGINE": "ENABLED",
|
||||
If the value is True, that means the NIC in nic mode and it's going
|
||||
to set the following config:
|
||||
"INTERNAL_CPU_PAGE_SUPPLIER", "EXT_HOST_PF"
|
||||
"INTERNAL_CPU_ESWITCH_MANAGER", "EXT_HOST_PF"
|
||||
"INTERNAL_CPU_IB_VPORT0", "EXT_HOST_PF"
|
||||
"INTERNAL_CPU_OFFLOAD_ENGINE", "DISABLED"
|
||||
RESET_SYNC:
|
||||
type: number
|
||||
default: 0
|
||||
description: >
|
||||
Run mstfwreset with the specified reset-sync [0,1],
|
||||
The default and Current supported option now is 0
|
||||
|
||||
|
||||
resources:
|
||||
userdata:
|
||||
type: OS::Heat::MultipartMime
|
||||
properties:
|
||||
parts:
|
||||
- config: {get_resource: nvidia_nic_fw_update}
|
||||
|
||||
nvidia_nic_fw_update:
|
||||
type: OS::Heat::SoftwareConfig
|
||||
properties:
|
||||
config:
|
||||
str_replace:
|
||||
template: |
|
||||
#!/usr/bin/env python
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import tempfile
|
||||
import threading
|
||||
|
||||
import six
|
||||
from six.moves import html_parser
|
||||
from six.moves.urllib import error as urlError
|
||||
from six.moves.urllib import request as urlRequest
|
||||
|
||||
from oslo_concurrency import processutils
|
||||
|
||||
FW_VERSION_REGEX = r'FW Version:\s*\t*(?P<fw_ver>\d+\.\d+\.\d+)'
|
||||
RUNNING_FW_VERSION_REGEX = r'FW Version\(Running\):\s*\t*(?P<fw_ver>\d+\.\d+\.\d+)'
|
||||
PSID_REGEX = r'PSID:\s*\t*(?P<psid>\w+)'
|
||||
ARRAY_VALUE_REGEX = r'Array\[(?P<first_index>\d+)\.\.(?P<last_index>\d+)\]'
|
||||
ARRAY_PARAM_REGEX = r'(?P<param_name>\w+)\[\d+\]'
|
||||
|
||||
_DEV_WHITE_LIST = $DEV_WHITE_LIST
|
||||
_FORCE_UPDATE = $FORCE_UPDATE
|
||||
_BIN_DIR_URL = "$BIN_DIR_URL"
|
||||
_BF_NIC_MODE = "$BF_NIC_MODE"
|
||||
# TODO(adrianc): add configurable parameter for logging
|
||||
logging.basicConfig(
|
||||
filename='/var/log/nvidia_nic_fw_update.log',
|
||||
filemode='w',
|
||||
level=logging.DEBUG)
|
||||
LOG = logging.getLogger("nvidia_nic_fw_update")
|
||||
|
||||
_MLX_CONFIG = {
|
||||
"SRIOV_EN": "$SRIOV_EN",
|
||||
"NUM_OF_VFS": "$NUM_OF_VFS",
|
||||
"LINK_TYPE_P1": "$LINK_TYPE",
|
||||
"LINK_TYPE_P2": "$LINK_TYPE",
|
||||
"ESWITCH_IPV4_TTL_MODIFY_ENABLE": "$ESWITCH_IPV4_TTL_MODIFY_ENABLE",
|
||||
"PRIO_TAG_REQUIRED_EN": "$PRIO_TAG_REQUIRED_EN",
|
||||
"ESWITCH_HAIRPIN_TOT_BUFFER_SIZE": $ESWITCH_HAIRPIN_TOT_BUFFER_SIZE,
|
||||
"ESWITCH_HAIRPIN_DESCRIPTORS": $ESWITCH_HAIRPIN_DESCRIPTORS
|
||||
}
|
||||
if _BF_NIC_MODE.lower() == "false":
|
||||
# It means we are in smart nic mode for BlueField device
|
||||
_MLX_CONFIG["INTERNAL_CPU_PAGE_SUPPLIER"] = "ECPF"
|
||||
_MLX_CONFIG["INTERNAL_CPU_ESWITCH_MANAGER"] = "ECPF"
|
||||
_MLX_CONFIG["INTERNAL_CPU_IB_VPORT0"] = "ECPF"
|
||||
_MLX_CONFIG["INTERNAL_CPU_OFFLOAD_ENGINE"] = "ENABLED"
|
||||
_MLX_CONFIG["INTERNAL_CPU_RSHIM"] = "True"
|
||||
else:
|
||||
# It measn we are in nic mode for BlueField device
|
||||
_MLX_CONFIG["INTERNAL_CPU_PAGE_SUPPLIER"] = "EXT_HOST_PF"
|
||||
_MLX_CONFIG["INTERNAL_CPU_ESWITCH_MANAGER"] = "EXT_HOST_PF"
|
||||
_MLX_CONFIG["INTERNAL_CPU_IB_VPORT0"] = "EXT_HOST_PF"
|
||||
_MLX_CONFIG["INTERNAL_CPU_OFFLOAD_ENGINE"] = "DISABLED"
|
||||
|
||||
_RESET_SYNC = $RESET_SYNC
|
||||
|
||||
def run_command(*cmd, **kwargs):
|
||||
try:
|
||||
out, err = processutils.execute(*cmd, **kwargs)
|
||||
except processutils.ProcessExecutionError as e:
|
||||
LOG.error("Failed to execute %s, %s", ' '.join(cmd), str(e))
|
||||
raise e
|
||||
if err:
|
||||
LOG.warning("Got stderr output: %s" % err)
|
||||
LOG.debug(out)
|
||||
return out
|
||||
|
||||
|
||||
def parse_mstflint_query_output(out):
|
||||
""" Parse Mstflint query output
|
||||
|
||||
For now just extract 'FW Version' and 'PSID'
|
||||
|
||||
:param out: mstflint query output
|
||||
:return: dictionary of query attributes
|
||||
"""
|
||||
query_info = {}
|
||||
for line in out.split('\n'):
|
||||
fw_ver = re.match(FW_VERSION_REGEX, line)
|
||||
psid = re.match(PSID_REGEX, line)
|
||||
running_fw_ver = re.match(RUNNING_FW_VERSION_REGEX, line)
|
||||
if fw_ver:
|
||||
query_info["fw_ver"] = fw_ver.group('fw_ver')
|
||||
if running_fw_ver:
|
||||
query_info["running_fw_ver"] = running_fw_ver.group('fw_ver')
|
||||
if psid:
|
||||
query_info["psid"] = psid.group('psid')
|
||||
return query_info
|
||||
|
||||
|
||||
class MlnxDevices(object):
|
||||
""" Discover and retrieve Mellanox PCI devices.
|
||||
|
||||
Can be used as an iterator once discover has been called.
|
||||
"""
|
||||
|
||||
def __init__(self, dev_white_list):
|
||||
self._devs = []
|
||||
self._dev_white_list = dev_white_list
|
||||
|
||||
def discover(self):
|
||||
""" Discover Mellanox devices in the system. (first PF of every device)
|
||||
|
||||
:return: None
|
||||
"""
|
||||
if self._devs:
|
||||
return self._devs
|
||||
|
||||
devs = []
|
||||
cmd = ['lspci', '-D', '-d', '15b3:']
|
||||
out = run_command(*cmd)
|
||||
for line in out.split('\n'):
|
||||
if not line:
|
||||
continue
|
||||
dev = line.split()[0]
|
||||
if dev.endswith('.0') and (not self._dev_white_list or
|
||||
dev in self._dev_white_list):
|
||||
devs.append(dev)
|
||||
self._devs = devs
|
||||
LOG.info("Found Mellanox devices: %s", devs)
|
||||
other_devs = set(self._dev_white_list) - set(devs)
|
||||
if other_devs:
|
||||
LOG.warning("Not all devices in PCI white list where discovered,"
|
||||
" %s these may not be nvidia devices or have their "
|
||||
"PCI function set to non zero." % other_devs)
|
||||
|
||||
def __len__(self):
|
||||
return len(self._devs)
|
||||
|
||||
def __iter__(self):
|
||||
return self._devs.__iter__()
|
||||
|
||||
|
||||
class MlnxConfigParamMetaData(object):
|
||||
""" Metadata about a single mlxconfig parameter"""
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
|
||||
class MlnxConfigArrayParamMetaData(MlnxConfigParamMetaData):
|
||||
""" Metadata about a single mlxconfig array parameter"""
|
||||
|
||||
def __init__(self, name, first_idx, last_idx):
|
||||
super(MlnxConfigArrayParamMetaData, self).__init__(name)
|
||||
self.first_index = int(first_idx)
|
||||
self.last_index = int(last_idx)
|
||||
|
||||
|
||||
class MlnxDeviceConfig(object):
|
||||
""" Get/Set Mellanox Device configurations
|
||||
"""
|
||||
|
||||
def __init__(self, pci_dev):
|
||||
self.pci_dev = pci_dev
|
||||
self._tool_confs = None
|
||||
# NOTE(adrianc) ATM contains only array type parameter metadata
|
||||
self.mlnx_config_array_param_metadata = {}
|
||||
|
||||
def _mstconfig_parse_data(self, data):
|
||||
# Parsing the mstconfig out to json
|
||||
data = list(filter(None, data.split('\n')))
|
||||
r = {}
|
||||
c = 0
|
||||
for line in data:
|
||||
c += 1
|
||||
if 'Configurations:' in line:
|
||||
break
|
||||
for i in range(c, len(data)):
|
||||
d = list(filter(None, data[i].strip().split()))
|
||||
r[d[0]] = d[1]
|
||||
return r
|
||||
|
||||
def get_device_conf_dict(self, param_name=None):
|
||||
""" Get device Configurations
|
||||
|
||||
:param param_name: if provided retireve only given configuration
|
||||
:return: dict {"PARAM_NAME": "Param value", ....}
|
||||
"""
|
||||
LOG.info("Getting configurations for device: %s" % self.pci_dev)
|
||||
cmd = ["mstconfig", "-d", self.pci_dev, "q"]
|
||||
if param_name:
|
||||
cmd.append(param_name)
|
||||
out = run_command(*cmd)
|
||||
return self._mstconfig_parse_data(out)
|
||||
|
||||
def param_supp_by_config_tool(self, param_name):
|
||||
""" Check if configuration tool supports the provided configuration
|
||||
parameter.
|
||||
|
||||
:param param_name: configuration name
|
||||
:return: bool
|
||||
"""
|
||||
if self._tool_confs is None:
|
||||
self._tool_confs = run_command(
|
||||
"mstconfig", "-d", self.pci_dev, "i")
|
||||
# trim any array index if present
|
||||
indexed_param = re.match(ARRAY_PARAM_REGEX, param_name)
|
||||
if indexed_param:
|
||||
param_name = indexed_param.group('param_name')
|
||||
return param_name in self._tool_confs
|
||||
|
||||
def _build_config_param_metadata_map(self, conf_dict):
|
||||
self.mlnx_config_array_param_metadata = {}
|
||||
for param_name, val in six.iteritems(conf_dict):
|
||||
array_val = re.match(ARRAY_VALUE_REGEX, val)
|
||||
if array_val:
|
||||
# Array parameter, extract first/last index
|
||||
first_index = array_val.group('first_index')
|
||||
last_index = array_val.group('last_index')
|
||||
self.mlnx_config_array_param_metadata[param_name] = \
|
||||
MlnxConfigArrayParamMetaData(
|
||||
param_name, first_index, last_index)
|
||||
|
||||
def _inflate_array_param_vals_from_query(self, conf_dict):
|
||||
""" Inflate provided conf dict with all values of array parameter"""
|
||||
inflated_conf = {}
|
||||
for param_name, val in six.iteritems(conf_dict):
|
||||
if param_name in self.mlnx_config_array_param_metadata:
|
||||
param_meta = self.mlnx_config_array_param_metadata[param_name]
|
||||
first = param_meta.first_index
|
||||
last = param_meta.last_index
|
||||
arr_param_val = self.get_device_conf_dict(
|
||||
param_name="%s[%s..%s]" % (param_name, first, last))
|
||||
# Add new keys to dict
|
||||
for k, v in six.iteritems(arr_param_val):
|
||||
inflated_conf[k] = v
|
||||
else:
|
||||
inflated_conf[param_name] = val
|
||||
return inflated_conf
|
||||
|
||||
def _inflate_single_array_input_val(self, param_name, val):
|
||||
conf_dict = {}
|
||||
param_meta = self.mlnx_config_array_param_metadata[param_name]
|
||||
first = param_meta.first_index
|
||||
last = param_meta.last_index
|
||||
|
||||
if '*' in val:
|
||||
if len(val) != 1:
|
||||
raise RuntimeError(
|
||||
"Invalid input for provided array type parameter. %s:%s"
|
||||
% (param_name, val))
|
||||
|
||||
for idx in range(first, last + 1):
|
||||
conf_dict["%s[%s]" % (param_name, idx)] = val['*']
|
||||
else:
|
||||
for idx, idx_val in six.iteritems(val):
|
||||
if int(idx) not in range(first, last + 1):
|
||||
LOG.warning(
|
||||
"Provided array param index(%s) is out of range "
|
||||
"[%s..%s] skipping...", idx, first, last)
|
||||
continue
|
||||
conf_dict["%s[%s]" % (param_name, idx)] = str(idx_val)
|
||||
return conf_dict
|
||||
|
||||
def _inflate_array_param_vals_from_input(self, conf_dict):
|
||||
""" Inflate provided conf dict with all values of array parameter"""
|
||||
inflated_conf = {}
|
||||
for param_name, val in six.iteritems(conf_dict):
|
||||
if param_name in self.mlnx_config_array_param_metadata:
|
||||
exp_inp = self._inflate_single_array_input_val(param_name, val)
|
||||
# Add to conf_dict
|
||||
for k, v in six.iteritems(exp_inp):
|
||||
inflated_conf[k] = v
|
||||
else:
|
||||
inflated_conf[param_name] = val
|
||||
|
||||
return inflated_conf
|
||||
|
||||
def set_config(self, conf_dict):
|
||||
""" Set device configurations
|
||||
|
||||
:param conf_dict: a dictionary of:
|
||||
{"PARAM_NAME": "Param value to set", ...}
|
||||
:return: None
|
||||
"""
|
||||
current_mlx_config = self.get_device_conf_dict()
|
||||
self._build_config_param_metadata_map(current_mlx_config)
|
||||
current_mlx_config = self._inflate_array_param_vals_from_query(
|
||||
current_mlx_config)
|
||||
# inflate user input for array parameters
|
||||
conf_dict = self._inflate_array_param_vals_from_input(conf_dict)
|
||||
|
||||
params_to_set = []
|
||||
for key, value in conf_dict.items():
|
||||
if not self.param_supp_by_config_tool(key):
|
||||
LOG.error(
|
||||
"Configuraiton: %s is not supported by mstconfig,"
|
||||
" please update to the latest mstflint package." % key)
|
||||
continue
|
||||
|
||||
if current_mlx_config.get(key) and value.lower(
|
||||
) not in current_mlx_config.get(key).lower():
|
||||
# Aggregate all configurations required to be modified
|
||||
params_to_set.append("%s=%s" % (key, value))
|
||||
|
||||
if params_to_set:
|
||||
LOG.info("Setting configurations for device: %s" % self.pci_dev)
|
||||
run_command("mstconfig", "-d", self.pci_dev, "-y",
|
||||
"set", *params_to_set)
|
||||
LOG.info("Set device configurations: Setting %s done successfully",
|
||||
" ".join(params_to_set))
|
||||
else:
|
||||
LOG.info("Set device configurations: No operation required")
|
||||
|
||||
|
||||
class MlnxFirmwareBinary(object):
|
||||
|
||||
def __init__(self, local_bin_path):
|
||||
self.bin_path = local_bin_path
|
||||
self.image_info = {}
|
||||
|
||||
def get_info(self):
|
||||
""" Get firmware information from binary
|
||||
|
||||
Caller should wrap this call under try catch to skip non compliant
|
||||
firmware binaries.
|
||||
|
||||
:return: dict of firmware image attributes
|
||||
"""
|
||||
if self.image_info.get('file_path', '') == self.bin_path:
|
||||
return self.image_info
|
||||
self.image_info = {'file_path': self.bin_path}
|
||||
cmd = ['mstflint', '-i', self.bin_path, 'query']
|
||||
out = run_command(*cmd)
|
||||
self.image_info.update(parse_mstflint_query_output(out))
|
||||
# Note(adrianc): deep copy ?
|
||||
return self.image_info
|
||||
|
||||
|
||||
class MlnxFirmwareBinariesFetcher(object):
|
||||
""" A class for fetching firmware binaries form a directory
|
||||
provided by a URL link
|
||||
|
||||
Note: URL MUST point to a directory and end with '/'
|
||||
e.g http://www.mysite.com/mlnx_bins/
|
||||
"""
|
||||
dest_dir = tempfile.mkdtemp(suffix="tripleo_mlnx_firmware")
|
||||
|
||||
class FileHTMLParser(html_parser.HTMLParser):
|
||||
""" A crude HTML Parser to extract files from an HTTP response.
|
||||
"""
|
||||
|
||||
def __init__(self, suffix):
|
||||
# HTMLParser is Old style class dont use super() method
|
||||
html_parser.HTMLParser.__init__(self)
|
||||
self.matches = []
|
||||
self.suffix = suffix
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
for name, val in attrs:
|
||||
if name == 'href' and val.endswith(self.suffix):
|
||||
self.matches.append(val)
|
||||
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
|
||||
def __del__(self):
|
||||
self._cleanup_dest_dir()
|
||||
|
||||
def _cleanup_dest_dir(self):
|
||||
if os.path.exists(MlnxFirmwareBinariesFetcher.dest_dir):
|
||||
shutil.rmtree(MlnxFirmwareBinariesFetcher.dest_dir)
|
||||
|
||||
def _get_file_from_url(self, file_name):
|
||||
try:
|
||||
full_path = self.url + "/" + file_name
|
||||
LOG.info("Downloading file: %s to %s", full_path,
|
||||
MlnxFirmwareBinariesFetcher.dest_dir)
|
||||
url_data = urlRequest.urlopen(full_path)
|
||||
except urlError.HTTPError as e:
|
||||
LOG.error("Failed to download data: %s", str(e))
|
||||
raise e
|
||||
dest_file_path = os.path.join(MlnxFirmwareBinariesFetcher.dest_dir,
|
||||
file_name)
|
||||
with open(dest_file_path, 'wb') as f:
|
||||
f.write(url_data.read())
|
||||
return dest_file_path
|
||||
|
||||
def _get_file_create_bin_obj(self, file_name, fw_bins):
|
||||
""" This wrapper method will download a firmware binary,
|
||||
create MlnxFirmwareBinary object and append to the provided
|
||||
fw_bins list.
|
||||
|
||||
:return: None
|
||||
"""
|
||||
try:
|
||||
dest_file_path = self._get_file_from_url(file_name)
|
||||
fw_bin = MlnxFirmwareBinary(dest_file_path)
|
||||
# Note(adrianc): Pre query image, to skip incompatible files
|
||||
# in case of Error
|
||||
fw_bin.get_info()
|
||||
fw_bins.append(fw_bin)
|
||||
except Exception as e:
|
||||
LOG.warning("Failed to download and query %s, skipping file. "
|
||||
"%s", file_name, str(e))
|
||||
|
||||
def get_firmware_binaries(self):
|
||||
""" Get Firmware binaries
|
||||
|
||||
:return: list containing the files downloaded
|
||||
"""
|
||||
# get list of files
|
||||
# download into dest_dir
|
||||
# for each file, create MlnxFirmwareBinary
|
||||
# return list of the MlnxFirmwareBinary
|
||||
if not self.url.endswith('/'):
|
||||
LOG.error("Bad URL provided (%s), expected URL to be a directory",
|
||||
self.url)
|
||||
raise RuntimeError('Failed to get firmware binaries, '
|
||||
'expected directory URL path '
|
||||
'(e.g "http://<your_ip>/mlnx_bins/"). '
|
||||
'Given URL path: %s', self.url)
|
||||
try:
|
||||
index_data = str(urlRequest.urlopen(_BIN_DIR_URL).read())
|
||||
except urlError.HTTPError as err:
|
||||
LOG.error(err)
|
||||
raise err
|
||||
parser = MlnxFirmwareBinariesFetcher.FileHTMLParser(suffix=".bin")
|
||||
parser.feed(index_data)
|
||||
parser.close()
|
||||
if not parser.matches:
|
||||
LOG.warning("No bin Files found in the provided URL: %s", self.url)
|
||||
|
||||
fw_bins = []
|
||||
threads = []
|
||||
for file_name in parser.matches:
|
||||
# TODO(adrianc) fetch files async with co-routines,
|
||||
# may need to limit thread count
|
||||
t = threading.Thread(target=self._get_file_create_bin_obj,
|
||||
args=(file_name, fw_bins))
|
||||
t.start()
|
||||
threads.append(t)
|
||||
for t in threads:
|
||||
t.join()
|
||||
return fw_bins
|
||||
|
||||
|
||||
class MlnxDevFirmwareOps(object):
|
||||
""" Perform various Firmware related operations on device
|
||||
"""
|
||||
|
||||
def __init__(self, dev):
|
||||
self.dev = dev
|
||||
self.dev_info = {}
|
||||
|
||||
def query_device(self, force=False):
|
||||
""" Get firmware information from device
|
||||
|
||||
:param force: force device query, even if query was executed in
|
||||
previous calls.
|
||||
:return: dict of firmware image attributes
|
||||
"""
|
||||
if not force and self.dev_info.get('device', '') == self.dev:
|
||||
return self.dev_info
|
||||
|
||||
self.dev_info = {'device': self.dev}
|
||||
cmd = ['mstflint', '-d', self.dev, '-qq', 'query']
|
||||
out = run_command(*cmd)
|
||||
self.dev_info = parse_mstflint_query_output(out)
|
||||
# Note(adrianc): deep copy ?
|
||||
return self.dev_info
|
||||
|
||||
def need_update(self, image_info):
|
||||
""" Check if device requires firmware update
|
||||
|
||||
:param image_info: image_info dict as returned from
|
||||
MlnxFirmwareBinary.get_info()
|
||||
:return: bool, True if update is needed
|
||||
"""
|
||||
if not self.dev_info:
|
||||
self.query_device()
|
||||
LOG.info("Device firmware version: %s, Image firmware version: %s" %
|
||||
(self.dev_info['fw_ver'], image_info['fw_ver']))
|
||||
return self.dev_info['fw_ver'] < image_info['fw_ver']
|
||||
|
||||
def need_reset_before_config(self):
|
||||
""" Check if device requires firmware reset before applying any
|
||||
configurations on the device.
|
||||
|
||||
:return: (bool, bool) True if reset is needed,
|
||||
True if skip_fms_sync is needed
|
||||
"""
|
||||
self.query_device(force=True)
|
||||
next_boot_image_newer = 'running_fw_ver' in self.dev_info and \
|
||||
self.dev_info['running_fw_ver'] < self.dev_info['fw_ver']
|
||||
if next_boot_image_newer:
|
||||
mandatory_params = ["ESWITCH_IPV4_TTL_MODIFY_ENABLE",
|
||||
"PRIO_TAG_REQUIRED_EN",
|
||||
"INTERNAL_CPU_PAGE_SUPPLIE",
|
||||
"INTERNAL_CPU_ESWITCH_MANAGE",
|
||||
"INTERNAL_CPU_IB_VPORT0",
|
||||
"INTERNAL_CPU_OFFLOAD_ENGINE"]
|
||||
device_config = MlnxDeviceConfig(self.dev)
|
||||
conf_dict = device_config.get_device_conf_dict()
|
||||
for param in mandatory_params:
|
||||
if param not in conf_dict and \
|
||||
device_config.param_supp_by_config_tool(param):
|
||||
if "INTERNAL_CPU_MODEL" in conf_dict:
|
||||
if self.dev_info['running_fw_ver'] < "24.32.0000":
|
||||
# In case the device is BlueField and the FW is less than Nov release
|
||||
# return True, True to do reset with skip_fms_sync
|
||||
return True, True
|
||||
elif (self.dev_info['running_fw_ver'] >= "24.32.0000" and
|
||||
self.dev_info['running_fw_ver'] < "24.33.0000"):
|
||||
# In case the device is BlueField and the FW is from Nov release
|
||||
# reset will fail, so don't do it
|
||||
return False, False
|
||||
return True, False
|
||||
return False, False
|
||||
|
||||
def burn_firmware(self, image_path):
|
||||
""" Burn firmware on device
|
||||
|
||||
:param image_path: firmware binary file path
|
||||
:return: None
|
||||
"""
|
||||
LOG.info("Updating firmware image (%s) for device: %s",
|
||||
image_path, self.dev)
|
||||
cmd = ["mstflint", "-d", self.dev, "-i", image_path,
|
||||
"-y", "burn"]
|
||||
run_command(*cmd)
|
||||
LOG.info("Device %s: Successfully updated.", self.dev)
|
||||
|
||||
def reset_device(self, skip_fms_sync=False):
|
||||
""" Reset firmware
|
||||
|
||||
:return: None
|
||||
"""
|
||||
LOG.info("Device %s: Performing firmware reset.", self.dev)
|
||||
if skip_fms_sync:
|
||||
cmd = ["mstfwreset", "-d", self.dev, "--skip_fsm_sync", "-y",
|
||||
"--sync", _RESET_SYNC, "reset"]
|
||||
else:
|
||||
cmd = ["mstfwreset", "-d", self.dev, "-y", "reset"]
|
||||
run_command(*cmd)
|
||||
LOG.info("Device %s: Firmware successfully reset.", self.dev)
|
||||
|
||||
|
||||
def check_prereq():
|
||||
""" Check that all needed tools are available in the system.
|
||||
|
||||
:return: None
|
||||
"""
|
||||
try:
|
||||
# check for mstflint
|
||||
run_command('mstflint', '-v')
|
||||
# check for mstconfig
|
||||
run_command('mstconfig', '-v')
|
||||
# check for mstfwreset
|
||||
run_command('mstfwreset', '-v')
|
||||
# check for lspci
|
||||
run_command('lspci', '--version')
|
||||
except Exception as e:
|
||||
LOG.error("Failed Prerequisite check. %s", str(e))
|
||||
raise e
|
||||
|
||||
|
||||
def process_device(pci_dev, psid_map):
|
||||
""" Process a single Mellanox device.
|
||||
|
||||
Processing pipeline:
|
||||
- Perform firmware update if required
|
||||
- Reset device to load firmware if required
|
||||
- Perform device configurations if required
|
||||
|
||||
:param pci_dev: nvidia nic PCI device address (String)
|
||||
:param psid_map: dict mapping between PSID and an image_info dict
|
||||
:return: None
|
||||
"""
|
||||
try:
|
||||
LOG.info("Processing Device: %s", pci_dev)
|
||||
dev_ops = MlnxDevFirmwareOps(pci_dev)
|
||||
device_config = MlnxDeviceConfig(pci_dev)
|
||||
dev_query = dev_ops.query_device()
|
||||
# see if there is a matching bin
|
||||
dev_psid = dev_query['psid']
|
||||
if dev_psid in psid_map:
|
||||
if _FORCE_UPDATE or dev_ops.need_update(psid_map[dev_psid]):
|
||||
dev_ops.burn_firmware(psid_map[dev_psid]['file_path'])
|
||||
else:
|
||||
LOG.info("Firmware update is not required for Device.")
|
||||
else:
|
||||
LOG.info("No firmware binary found for device %s with "
|
||||
"PSID: %s, skipping...", pci_dev, dev_psid)
|
||||
# check if reset is required.
|
||||
# Note: device Reset is required if a newer firmware version was burnt
|
||||
# and current firmware does not support some mandatory configurations.
|
||||
# Note: skip_fms_sync is required if device is BlueField SmartNIC and
|
||||
# the current FW is less than Nov release
|
||||
need_rest, need_skip_fms_sync = dev_ops.need_reset_before_config()
|
||||
if need_rest:
|
||||
dev_ops.reset_device(skip_fms_sync=need_skip_fms_sync)
|
||||
# set device configurations
|
||||
device_config.set_config(_MLX_CONFIG)
|
||||
LOG.info("Device %s processed successfully.", pci_dev)
|
||||
except Exception as e:
|
||||
LOG.error("Failed to process device %s. %s", pci_dev, str(e))
|
||||
|
||||
|
||||
def main():
|
||||
check_prereq()
|
||||
# discover devices
|
||||
mlnx_devices = MlnxDevices(_DEV_WHITE_LIST)
|
||||
mlnx_devices.discover()
|
||||
# get binaries and prep psid map
|
||||
psid_map = {}
|
||||
if _BIN_DIR_URL:
|
||||
binary_getter = MlnxFirmwareBinariesFetcher(_BIN_DIR_URL)
|
||||
fw_binaries = binary_getter.get_firmware_binaries()
|
||||
|
||||
for fw_bin in fw_binaries:
|
||||
image_info = fw_bin.get_info()
|
||||
psid_map[image_info['psid']] = image_info
|
||||
# process devices
|
||||
for pci_dev in mlnx_devices:
|
||||
process_device(pci_dev, psid_map)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
params:
|
||||
$BIN_DIR_URL: {get_param: BIN_DIR_URL}
|
||||
$FORCE_UPDATE: {get_param: FORCE_UPDATE}
|
||||
$DEV_WHITE_LIST: {get_param: DEV_WHITE_LIST}
|
||||
$NUM_OF_VFS: {get_param: NUM_OF_VFS}
|
||||
$SRIOV_EN: {get_param: SRIOV_EN}
|
||||
$LINK_TYPE: {get_param: LINK_TYPE}
|
||||
$ESWITCH_IPV4_TTL_MODIFY_ENABLE: {get_param: ESWITCH_IPV4_TTL_MODIFY_ENABLE}
|
||||
$PRIO_TAG_REQUIRED_EN: {get_param: PRIO_TAG_REQUIRED_EN}
|
||||
$ESWITCH_HAIRPIN_TOT_BUFFER_SIZE: {get_param: ESWITCH_HAIRPIN_TOT_BUFFER_SIZE}
|
||||
$ESWITCH_HAIRPIN_DESCRIPTORS: {get_param: ESWITCH_HAIRPIN_DESCRIPTORS}
|
||||
$BF_NIC_MODE: {get_param: BF_NIC_MODE}
|
||||
$RESET_SYNC: {get_param: RESET_SYNC}
|
||||
|
||||
|
||||
outputs:
|
||||
# This means get_resource from the parent template will get the userdata, see:
|
||||
# http://docs.openstack.org/developer/heat/template_guide/composition.html#making-your-template-resource-more-transparent
|
||||
# Note this is new-for-kilo, an alternative is returning a value then using
|
||||
# get_attr in the parent template instead.
|
||||
OS::stack_id:
|
||||
value: {get_resource: userdata}
|
Loading…
Reference in New Issue
Block a user