os-net-config/os_net_config/sriov_config.py

698 lines
25 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2014 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
# The sriov_config.py module does the SR-IOV PF configuration.
# It'll be invoked by the sriov_config systemd service for the persistence of
# the SR-IOV configuration across reboots. And os-net-config:utils also invokes
# it for the first time configuration.
# An entry point os-net-config-sriov is added for invocation of this module.
import argparse
import os
import pyudev
import queue
import re
import sys
import time
import yaml
from os_net_config import common
from os_net_config import sriov_bind_config
from oslo_concurrency import processutils
logger = common.configure_logger()
_UDEV_RULE_FILE = '/etc/udev/rules.d/80-persistent-os-net-config.rules'
_UDEV_LEGACY_RULE_FILE = '/etc/udev/rules.d/70-os-net-config-sriov.rules'
_IFUP_LOCAL_FILE = '/sbin/ifup-local'
_RESET_SRIOV_RULES_FILE = '/etc/udev/rules.d/70-tripleo-reset-sriov.rules'
_ALLOCATE_VFS_FILE = '/etc/sysconfig/allocate_vfs'
_MLNX_DRIVER = "mlx5_core"
MLNX_UNBIND_FILE_PATH = "/sys/bus/pci/drivers/mlx5_core/unbind"
MAX_RETRIES = 10
PF_FUNC_RE = re.compile(r"\.(\d+)$", 0)
# In order to keep VF representor name consistent specially after the upgrade
# proccess, we should have a udev rule to handle that.
# The udev rule will rename the VF representor as "<sriov_pf_name>_<vf_num>"
_REP_LINK_NAME_FILE = "/etc/udev/rep-link-name.sh"
_REP_LINK_NAME_DATA = '''#!/bin/bash
# This file is autogenerated by os-net-config
set -x
PORT="$1"
echo "NUMBER=${PORT##pf*vf}"
'''
# Create a queue for passing the udev network events
vf_queue = queue.Queue()
# File to contain the list of SR-IOV PF, VF and their configurations
# Format of the file shall be
# - device_type: pf
# name: <pf name>
# numvfs: <number of VFs>
# promisc: "on"/"off"
# - device_type: vf
# device:
# name: <pf name>
# vfid: <VF id>
# name: <vf name>
# vlan_id: <vlan>
# qos: <qos>
# spoofcheck: "on"/"off"
# trust: "on"/"off"
# state: "auto"/"enable"/"disable"
# macaddr: <mac address>
# promisc: "on"/"off"
_SRIOV_CONFIG_FILE = '/var/lib/os-net-config/sriov_config.yaml'
class SRIOVNumvfsException(ValueError):
pass
def udev_event_handler(action, device):
event = {"action": action, "device": device.sys_path}
logger.info(
f"Received udev event {event['action']} for {event['device']}"
)
vf_queue.put(event)
def get_file_data(filename):
if not os.path.exists(filename):
return ''
try:
with open(filename, 'r') as f:
return f.read()
except IOError:
logger.error(f"Error reading file: {filename}")
return ''
def _get_sriov_map():
contents = get_file_data(_SRIOV_CONFIG_FILE)
sriov_map = yaml.safe_load(contents) if contents else []
return sriov_map
def _wait_for_vf_creation(pf_name, numvfs):
vf_count = 0
vf_list = []
while vf_count < numvfs:
try:
# wait for 5 seconds after every udev event
event = vf_queue.get(True, 5)
vf_name = os.path.basename(event["device"])
pf_path = os.path.normpath(os.path.join(event["device"],
"../../physfn/net"))
if os.path.isdir(pf_path):
pf_nic = os.listdir(pf_path)
if len(pf_nic) == 1 and pf_name == pf_nic[0]:
if vf_name not in vf_list:
vf_list.append(vf_name)
logger.info(
f"VF: {vf_name} created for PF: {pf_name}"
)
vf_count = vf_count + 1
else:
logger.warning(f"Unable to parse event {event['device']}")
else:
logger.warning(f"{pf_path} is not a directory")
except queue.Empty:
logger.info(f"Timeout in the creation of VFs for PF {pf_name}")
return
logger.info(f"Required VFs are created for PF {pf_name}")
def get_numvfs(ifname):
"""Getting sriov_numvfs for PF
Wrapper that will get the sriov_numvfs file for a PF.
:param ifname: interface name (ie: p1p1)
:returns: int -- the number of current VFs on ifname
:raises: SRIOVNumvfsException
"""
sriov_numvfs_path = common.get_dev_path(ifname, "sriov_numvfs")
logger.debug(f"Getting numvfs for interface {ifname}")
try:
with open(sriov_numvfs_path, 'r') as f:
curr_numvfs = int(f.read())
except IOError as exc:
msg = f"Unable to read numvfs for {ifname}: {exc}"
raise SRIOVNumvfsException(msg)
logger.debug(f"Interface {ifname} has {curr_numvfs} configured")
return curr_numvfs
def set_numvfs(ifname, numvfs):
"""Setting sriov_numvfs for PF
Wrapper that will set the sriov_numvfs file for a PF.
After numvfs has been set for an interface, _wait_for_vf_creation will be
called to monitor the creation.
Some restrictions:
- if current number of VF is already defined, we can't change numvfs
- if sriov_numvfs doesn't exist for an interface, we can't create it
:param ifname: interface name (ie: p1p1)
:param numvfs: an int that represents the number of VFs to be created.
:returns: int -- the number of current VFs on ifname
:raises: SRIOVNumvfsException
"""
curr_numvfs = get_numvfs(ifname)
logger.debug(f"Interface {ifname} has {curr_numvfs} configured, setting "
f"to {numvfs}")
if not isinstance(numvfs, int):
msg = (f"Unable to configure pf: {ifname} with numvfs: {numvfs}\n"
f"numvfs must be an integer")
raise SRIOVNumvfsException(msg)
if numvfs != curr_numvfs:
if curr_numvfs != 0:
logger.warning(f"Numvfs already configured to {curr_numvfs} for "
f"{ifname}")
return curr_numvfs
sriov_numvfs_path = common.get_dev_path(ifname, "sriov_numvfs")
try:
logger.debug(f"Setting {sriov_numvfs_path} to {numvfs}")
with open(sriov_numvfs_path, "w") as f:
f.write("%d" % numvfs)
except IOError as exc:
msg = (f"Unable to configure pf: {ifname} with numvfs: {numvfs}\n"
f"{exc}")
raise SRIOVNumvfsException(msg)
_wait_for_vf_creation(ifname, numvfs)
curr_numvfs = get_numvfs(ifname)
if curr_numvfs != numvfs:
msg = (f"Unable to configure pf: {ifname} with numvfs: {numvfs}\n"
"sriov_numvfs file is not set to the targeted number of "
"vfs")
raise SRIOVNumvfsException(msg)
return curr_numvfs
def restart_ovs_and_pfs_netdevs():
sriov_map = _get_sriov_map()
processutils.execute('/usr/bin/systemctl', 'restart', 'openvswitch')
for item in sriov_map:
if item['device_type'] == 'pf':
if_down_interface(item['name'])
if_up_interface(item['name'])
def cleanup_puppet_config():
file_contents = ""
if os.path.exists(_RESET_SRIOV_RULES_FILE):
os.remove(_RESET_SRIOV_RULES_FILE)
if os.path.exists(_ALLOCATE_VFS_FILE):
os.remove(_ALLOCATE_VFS_FILE)
if os.path.exists(_IFUP_LOCAL_FILE):
# Remove the invocation of allocate_vfs script generated by puppet
# After the removal of allocate_vfs, if the ifup-local file has just
# "#!/bin/bash" left, then remove the file as well.
with open(_IFUP_LOCAL_FILE) as oldfile:
for line in oldfile:
if "/etc/sysconfig/allocate_vfs" not in line:
file_contents = file_contents + line
if file_contents.strip() == "#!/bin/bash":
os.remove(_IFUP_LOCAL_FILE)
else:
with open(_IFUP_LOCAL_FILE, 'w') as newfile:
newfile.write(file_contents)
def udev_monitor_setup():
# Create a context for pyudev and observe udev events for network
context = pyudev.Context()
monitor = pyudev.Monitor.from_netlink(context)
monitor.filter_by('net')
observer = pyudev.MonitorObserver(monitor, udev_event_handler)
return observer
def udev_monitor_start(observer):
observer.start()
def udev_monitor_stop(observer):
observer.stop()
def is_partitioned_pf(dev_name: str) -> bool:
"""Check if any nic-partition(VF) is already used
Given a PF device, returns True if any VFs of this
device are in-use.
"""
sriov_map = _get_sriov_map()
for config in sriov_map:
devtype = config.get('device_type', None)
if devtype == 'vf':
name = config.get('device', {}).get('name')
vf_name = config.get('name')
if dev_name == name:
logger.warning("%s has VF(%s) used by host" % (name, vf_name))
return True
return False
def configure_sriov_pf(execution_from_cli=False, restart_openvswitch=False):
observer = udev_monitor_setup()
udev_monitor_start(observer)
sriov_map = _get_sriov_map()
mlnx_vfs_pcis_list = []
trigger_udev_rule = False
# Cleanup the previous config by puppet-tripleo
cleanup_puppet_config()
for item in sriov_map:
if item['device_type'] == 'pf':
_pf_interface_up(item)
if item.get('link_mode') == "legacy":
# Add a udev rule to configure the VF's when PF's are
# released by a guest
if not is_partitioned_pf(item['name']):
add_udev_rule_for_legacy_sriov_pf(item['name'],
item['numvfs'])
# When configuring vdpa, we need to configure switchdev before
# we create the VFs
is_mlnx = common.is_mellanox_interface(item['name'])
# Configure switchdev mode when vdpa
if item.get('vdpa') and is_mlnx:
configure_switchdev(item['name'])
set_numvfs(item['name'], item['numvfs'])
# Configure switchdev mode when not vdpa
if (item.get('link_mode') == "switchdev" and is_mlnx and
not item.get('vdpa')):
logger.info(f"{item['name']}: Mellanox card")
vf_pcis_list = get_vf_pcis_list(item['name'])
mlnx_vfs_pcis_list += vf_pcis_list
for vf_pci in vf_pcis_list:
vf_pci_path = f"/sys/bus/pci/devices/{vf_pci}/driver"
if os.path.exists(vf_pci_path):
logger.info(f"Unbinding {vf_pci}")
with open(MLNX_UNBIND_FILE_PATH, 'w') as f:
f.write(vf_pci)
logger.info(f"{item['name']}: Adding udev rules")
# Adding a udev rule to make vf-representors unmanaged by
# NetworkManager
add_udev_rule_to_unmanage_vf_representors_by_nm()
# Adding a udev rule to save the sriov_pf name
trigger_udev_rule = add_udev_rule_for_sriov_pf(item['name'])\
or trigger_udev_rule
configure_smfs_software_steering(item['name'])
# Configure switchdev mode
configure_switchdev(item['name'])
# Adding a udev rule to rename vf-representors
trigger_udev_rule = add_udev_rule_for_vf_representors(
item['name']) or trigger_udev_rule
# Moving the sriov-PFs to switchdev mode will put the netdev
# interfaces in down state.
# In case we are running during initial deployment,
# bring the interfaces up.
# In case we are running as part of the sriov_config service
# after reboot, net config scripts, which run after
# sriov_config service will bring the interfaces up.
if execution_from_cli:
if_up_interface(item['name'])
if mlnx_vfs_pcis_list:
sriov_bind_pcis_map = {_MLNX_DRIVER: mlnx_vfs_pcis_list}
if not execution_from_cli:
sriov_bind_config.update_sriov_bind_pcis_map(sriov_bind_pcis_map)
else:
sriov_bind_config.configure_sriov_bind_service()
sriov_bind_config.bind_vfs(sriov_bind_pcis_map)
# Trigger udev rules if there is new rules written
if trigger_udev_rule:
trigger_udev_rules()
udev_monitor_stop(observer)
if restart_openvswitch:
restart_ovs_and_pfs_netdevs()
def _wait_for_uplink_rep_creation(pf_name):
uplink_rep_phys_switch_id_path = f"/sys/class/net/{pf_name}/phys_switch_id"
for i in range(MAX_RETRIES):
if get_file_data(uplink_rep_phys_switch_id_path):
logger.info(f"Uplink representor {pf_name} ready")
break
time.sleep(1)
else:
raise RuntimeError(f"Timeout while waiting for uplink representor "
f"{pf_name}.")
def create_rep_link_name_script():
with open(_REP_LINK_NAME_FILE, "w") as f:
f.write(_REP_LINK_NAME_DATA)
# Make the _REP_LINK_NAME_FILE executable
os.chmod(_REP_LINK_NAME_FILE, 0o755)
def add_udev_rule_for_sriov_pf(pf_name):
logger.info(f"adding udev rules for sriov {pf_name}")
pf_pci = get_pf_pci(pf_name)
udev_data_line = 'SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", '\
f'KERNELS=="{pf_pci}", NAME="{pf_name}"'
return add_udev_rule(udev_data_line, _UDEV_RULE_FILE)
def add_udev_rule_for_legacy_sriov_pf(pf_name, numvfs):
logger.info(f"adding udev rules for legacy sriov {pf_name}: {numvfs}")
udev_line = f'KERNEL=="{pf_name}", '\
f'RUN+="/bin/os-net-config-sriov -n %k:{numvfs}"'
pattern = f'KERNEL=="{pf_name}", RUN+="/bin/os-net-config-sriov -n'
return add_udev_rule(udev_line, _UDEV_LEGACY_RULE_FILE, pattern)
def add_udev_rule_for_vf_representors(pf_name):
logger.info(f"adding udev rules for vf representators {pf_name}")
phys_switch_id_path = os.path.join(common.SYS_CLASS_NET, pf_name,
"phys_switch_id")
phys_switch_id = get_file_data(phys_switch_id_path).strip()
pf_pci = get_pf_pci(pf_name)
pf_fun_num_match = PF_FUNC_RE.search(pf_pci)
if pf_fun_num_match:
pf_fun_num = pf_fun_num_match.group(1)
else:
logger.error(f"Failed to get function number for {pf_name} \n"
"and so failed to create a udev rule for renaming "
"its' vf-represent")
return
udev_data_line = 'SUBSYSTEM=="net", ACTION=="add", ATTR{phys_switch_id}'\
'=="%s", ATTR{phys_port_name}=="pf%svf*", '\
'IMPORT{program}="%s $attr{phys_port_name}", '\
'NAME="%s_$env{NUMBER}"' % (phys_switch_id,
pf_fun_num,
_REP_LINK_NAME_FILE,
pf_name)
create_rep_link_name_script()
return add_udev_rule(udev_data_line, _UDEV_RULE_FILE)
def add_udev_rule_to_unmanage_vf_representors_by_nm():
logger.info(f"adding udev rules to unmanage vf representators")
udev_data_line = 'SUBSYSTEM=="net", ACTION=="add", ATTR{phys_switch_id}'\
'!="", ATTR{phys_port_name}=="pf*vf*", '\
'ENV{NM_UNMANAGED}="1"'
return add_udev_rule(udev_data_line, _UDEV_RULE_FILE)
def add_udev_rule(udev_data, udev_file, pattern=None):
logger.debug(f"adding udev rule to {udev_file}: {udev_data}")
trigger_udev_rule = False
udev_data = udev_data.strip()
if not pattern:
pattern = udev_data
if not os.path.exists(udev_file):
with open(udev_file, "w") as f:
data = "# This file is autogenerated by os-net-config\n"\
f"{udev_data}\n"
f.write(data)
else:
file_data = get_file_data(udev_file)
udev_lines = file_data.splitlines()
if pattern in file_data:
if udev_data in udev_lines:
return trigger_udev_rule
with open(udev_file, "w") as f:
for line in udev_lines:
if pattern in line:
f.write(udev_data + "\n")
else:
f.write(line + "\n")
else:
with open(udev_file, "a") as f:
f.write(udev_data + "\n")
reload_udev_rules()
trigger_udev_rule = True
return trigger_udev_rule
def reload_udev_rules():
try:
processutils.execute('/usr/sbin/udevadm', 'control', '--reload-rules')
logger.info("udev rules reloaded successfully")
except processutils.ProcessExecutionError as exc:
logger.error(f"Failed to reload udev rules: {exc}")
raise
def trigger_udev_rules():
try:
processutils.execute('/usr/sbin/udevadm', 'trigger', '--action=add',
'--attr-match=subsystem=net')
logger.info("udev rules triggered successfully")
except processutils.ProcessExecutionError as exc:
logger.error(f"Failed to trigger udev rules: {exc}")
raise
def configure_switchdev(pf_name):
pf_pci = get_pf_pci(pf_name)
pf_device_id = get_pf_device_id(pf_name)
if pf_device_id == "0x1013" or pf_device_id == "0x1015":
try:
processutils.execute('/usr/sbin/devlink', 'dev', 'eswitch', 'set',
f'pci/{pf_pci}', 'inline-mode', 'transport')
except processutils.ProcessExecutionError as exc:
logger.error(f"Failed to set inline-mode to transport for "
f"{pf_pci}: {exc}")
raise
try:
processutils.execute('/usr/sbin/devlink', 'dev', 'eswitch', 'set',
f'pci/{pf_pci}', 'mode', 'switchdev')
except processutils.ProcessExecutionError as exc:
logger.error(f"Failed to set mode to switchdev for {pf_pci}: {exc}")
raise
logger.info(f"Device pci/{pf_pci} set to switchdev mode.")
# WA to make sure that the uplink_rep is ready after moving to switchdev,
# as moving to switchdev will remove the sriov_pf and create uplink
# representor, so we need to make sure that uplink representor is ready
# before proceed
_wait_for_uplink_rep_creation(pf_name)
try:
processutils.execute('/usr/sbin/ethtool', '-K', pf_name,
'hw-tc-offload', 'on')
logger.info(f"Enabled \"hw-tc-offload\" for PF {pf_name}.")
except processutils.ProcessExecutionError as exc:
logger.error(f"Failed to enable hw-tc-offload on {pf_name}: {exc}")
raise
def configure_smfs_software_steering(pf_name):
pf_pci = get_pf_pci(pf_name)
try:
processutils.execute('/usr/sbin/devlink', 'dev', 'param', 'set',
f'pci/{pf_pci}', 'name', 'flow_steering_mode',
'value', 'smfs', 'cmode', 'runtime')
logger.info(f"Device pci/{pf_pci} is set to smfs steering mode.")
except processutils.ProcessExecutionError as exc:
logger.warning(f"Could not set pci/{pf_pci} to smfs steering mode: "
f"{exc}")
def run_ip_config_cmd(*cmd, **kwargs):
logger.info("Running %s" % ' '.join(cmd))
try:
processutils.execute(*cmd, delay_on_retry=True, attempts=10, **kwargs)
except processutils.ProcessExecutionError as exc:
logger.error("Failed to execute %s: %s" % (' '.join(cmd), exc))
raise
def _pf_interface_up(pf_device):
if 'promisc' in pf_device:
run_ip_config_cmd('ip', 'link', 'set', 'dev', pf_device['name'],
'promisc', pf_device['promisc'])
logger.info(f"Bringing up PF: {pf_device['name']}")
run_ip_config_cmd('ip', 'link', 'set', 'dev', pf_device['name'], 'up')
def run_ip_config_cmd_safe(raise_error, *cmd, **kwargs):
try:
run_ip_config_cmd(*cmd)
except processutils.ProcessExecutionError:
if raise_error:
raise
def get_pf_pci(pf_name):
pf_pci_path = common.get_dev_path(pf_name, "uevent")
pf_info = get_file_data(pf_pci_path)
pf_pci = re.search(r'PCI_SLOT_NAME=(.*)', pf_info, re.MULTILINE).group(1)
return pf_pci
def get_pf_device_id(pf_name):
pf_device_path = common.get_dev_path(pf_name, "device")
pf_device_id = get_file_data(pf_device_path).strip()
return pf_device_id
def get_vf_pcis_list(pf_name):
vf_pcis_list = []
listOfPfFiles = os.listdir(os.path.join(common.SYS_CLASS_NET, pf_name,
"device"))
for pf_file in listOfPfFiles:
if pf_file.startswith("virtfn"):
vf_info = get_file_data(common.get_dev_path(pf_name,
f"{pf_file}/uevent"))
vf_pcis_list.append(re.search(r'PCI_SLOT_NAME=(.*)',
vf_info, re.MULTILINE).group(1))
return vf_pcis_list
def if_down_interface(device):
logger.info(f"Running /sbin/ifdown {device}")
try:
processutils.execute('/sbin/ifdown', device)
except processutils.ProcessExecutionError:
logger.error(f"Failed to ifdown {device}")
raise
def if_up_interface(device):
logger.info(f"Running /sbin/ifup {device}")
try:
processutils.execute('/sbin/ifup', device)
except processutils.ProcessExecutionError:
logger.error(f"Failed to ifup {device}")
raise
def configure_sriov_vf():
sriov_map = _get_sriov_map()
for item in sriov_map:
raise_error = True
if item['device_type'] == 'vf':
pf_name = item['device']['name']
vfid = item['device']['vfid']
base_cmd = ('ip', 'link', 'set', 'dev', pf_name, 'vf', str(vfid))
logger.info(f"Configuring settings for PF: {pf_name} VF: {vfid} "
f"VF name: {item['name']}")
raise_error = True
if 'macaddr' in item:
cmd = base_cmd + ('mac', item['macaddr'])
run_ip_config_cmd(*cmd)
if 'vlan_id' in item:
vlan_cmd = base_cmd + ('vlan', str(item['vlan_id']))
if 'qos' in item:
vlan_cmd = vlan_cmd + ('qos', str(item['qos']))
run_ip_config_cmd(*vlan_cmd)
if 'max_tx_rate' in item:
cmd = base_cmd + ('max_tx_rate', str(item['max_tx_rate']))
if item['max_tx_rate'] == 0:
raise_error = False
run_ip_config_cmd_safe(raise_error, *cmd)
if 'min_tx_rate' in item:
cmd = base_cmd + ('min_tx_rate', str(item['min_tx_rate']))
if item['min_tx_rate'] == 0:
raise_error = False
run_ip_config_cmd_safe(raise_error, *cmd)
if 'spoofcheck' in item:
cmd = base_cmd + ('spoofchk', item['spoofcheck'])
run_ip_config_cmd(*cmd)
if 'state' in item:
cmd = base_cmd + ('state', item['state'])
run_ip_config_cmd(*cmd)
if 'trust' in item:
cmd = base_cmd + ('trust', item['trust'])
run_ip_config_cmd(*cmd)
if 'promisc' in item:
run_ip_config_cmd('ip', 'link', 'set', 'dev', item['name'],
'promisc', item['promisc'])
def parse_opts(argv):
parser = argparse.ArgumentParser(
description='Configure SR-IOV PF and VF interfaces using a YAML'
' config file format.')
parser.add_argument(
'-d', '--debug',
dest="debug",
action='store_true',
help="Print debugging output.",
required=False)
parser.add_argument(
'-v', '--verbose',
dest="verbose",
action='store_true',
help="Print verbose output.",
required=False)
parser.add_argument(
'-n', '--numvfs',
dest="numvfs",
action='store',
help="Provide the numvfs for device in the format <device>:<numvfs>",
required=False)
opts = parser.parse_args(argv[1:])
return opts
def main(argv=sys.argv, main_logger=None):
opts = parse_opts(argv)
if not main_logger:
main_logger = common.configure_logger(log_file=True)
common.logger_level(main_logger, opts.verbose, opts.debug)
if opts.numvfs:
if re.match(r"^\w+:\d+$", opts.numvfs):
device_name, numvfs = opts.numvfs.split(':')
set_numvfs(device_name, int(numvfs))
else:
main_logger.error(f"Invalid arguments for --numvfs {opts.numvfs}")
return 1
else:
# Configure the PF's
configure_sriov_pf()
# Configure the VFs
configure_sriov_vf()
if __name__ == '__main__':
sys.exit(main(sys.argv))