ironic-python-agent/ironic_python_agent/numa_inspector.py
David Hill a1a121fbf8 Skip nic numa_node discovery if it's not assigned to a numa_node
In some rare cases, such as a VM with virtual NUMA nodes, NICs might
not be assigned to a NUMA node, and this breaks NUMA topology discovery.

Change-Id: I52f52119a5f3175b1723fde291eb26d4f86c8223
Story: 2007105
Task: 38151
2020-01-17 11:15:35 +01:00

266 lines
10 KiB
Python

# Copyright 2017 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from oslo_log import log
import pint
from ironic_python_agent import errors
# Module-level logger named after this module.
LOG = log.getLogger(__name__)
# Unit registry used by get_nodes_memory_info() to parse the
# "<number> <unit>" value of a meminfo line.
# NOTE(review): filename=None presumably creates the registry without
# pint's default unit definitions, so only the units defined below are
# known -- confirm against the pint documentation.
UNIT_CONVERTER = pint.UnitRegistry(filename=None)
# 'kB' and 'KB' are both declared as stand-alone units (presumably
# dimensionless base units in pint's definition syntax -- TODO confirm).
UNIT_CONVERTER.define('kB = []')
UNIT_CONVERTER.define('KB = []')
# Larger units are expressed as multiples of KB, so that converting to
# base units yields a magnitude in KB.
UNIT_CONVERTER.define('MB = 1024 KB')
UNIT_CONVERTER.define('GB = 1048576 KB')
def get_numa_node_id(numa_node_dir):
    """Provides the NUMA node id from NUMA node directory

    The id is parsed from the last component of the path, which is expected
    to look like ``node<id>``: the first four characters are dropped and the
    remainder is converted to an integer. A trailing path separator is
    ignored.

    :param numa_node_dir: NUMA node directory
    :raises: IncompatibleNumaFormatError: when unexpected format data
             in NUMA node dir
    :return: NUMA node id
    """
    try:
        # basename() of a path ending with a separator is '', which would
        # make a perfectly valid node directory fail to parse, so strip any
        # trailing separator first.
        node_name = os.path.basename(numa_node_dir.rstrip(os.sep))
        # Drop the 4-character prefix ("node") and parse the rest.
        return int(node_name[4:])
    except (IOError, ValueError, IndexError) as exc:
        msg = ('Failed to get NUMA node id for %(node)s: '
               '%(error)s' % {'node': numa_node_dir, 'error': exc})
        raise errors.IncompatibleNumaFormatError(msg) from exc
def get_nodes_memory_info(numa_node_dirs):
    """Collect the NUMA nodes memory information.

    For every NUMA node directory the ``meminfo`` file is scanned for the
    first line containing ``MemTotal``; the value after the colon is parsed
    with ``UNIT_CONVERTER`` and converted to its base-unit magnitude.

    The information is returned in the form of::

        "ram": [{"numa_node": <numa_node_id>, "size_kb": <memory_in_kb>}, ...]

    :param numa_node_dirs: A list of NUMA node directories
    :raises: IncompatibleNumaFormatError: when unexpected format data
             in NUMA node
    :return: A list of memory information with NUMA node id
    """
    ram = []
    for numa_node_dir in numa_node_dirs:
        numa_node_id = get_numa_node_id(numa_node_dir)
        meminfo_path = os.path.join(numa_node_dir, 'meminfo')
        try:
            with open(meminfo_path) as meminfo_file:
                for line in meminfo_file:
                    if 'MemTotal' in line:
                        break
                else:
                    # The loop ran to completion without finding MemTotal
                    # (this also covers an empty file).
                    msg = ('Memory information is not available for '
                           '%(node)s' % {'node': numa_node_dir})
                    raise errors.IncompatibleNumaFormatError(msg)
        except IOError as exc:
            msg = ('Failed to get memory information '
                   'for %(node)s: %(error)s' %
                   {'node': numa_node_dir, 'error': exc})
            raise errors.IncompatibleNumaFormatError(msg) from exc

        try:
            # To get memory size with unit from memory info line
            # Memory info sample line format 'Node 0 MemTotal: 1560000 kB'
            value = line.split(":")[1].strip()
            memory_kb = int(UNIT_CONVERTER(value).to_base_units().magnitude)
        except (ValueError, IndexError, AttributeError,
                pint.UndefinedUnitError) as exc:
            # NOTE(review): AttributeError is caught because a value without
            # a unit presumably parses to a plain number that has no
            # to_base_units() -- confirm against the pint version in use.
            msg = ('Failed to get memory information for %(node)s: '
                   '%(error)s' % {'node': numa_node_dir, 'error': exc})
            raise errors.IncompatibleNumaFormatError(msg) from exc

        LOG.debug('Found memory available %d KB in NUMA node %d',
                  memory_kb, numa_node_id)
        ram.append({'numa_node': numa_node_id, 'size_kb': memory_kb})
    return ram
def get_nodes_cores_info(numa_node_dirs):
    """Collect the NUMA nodes cpu's and thread's information.

    NUMA nodes path: /sys/devices/system/node/node<node_id>

    Thread dirs path: /sys/devices/system/node/node<node_id>/cpu<thread_id>

    CPU id file path: /sys/devices/system/node/node<node_id>/cpu<thread_id>/
                      topology/core_id

    The information is returned in the form of::

        "cpus": [
            {
                "cpu": <cpu_id>, "numa_node": <numa_node_id>,
                "thread_siblings": [<list of sibling threads>]
            },
            ...,
        ]

    :param numa_node_dirs: A list of NUMA node directories
    :raises: IncompatibleNumaFormatError: when unexpected format data
             in NUMA node
    :return: A list of cpu information with NUMA node id and thread siblings
    """
    dict_cpus = {}
    for numa_node_dir in numa_node_dirs:
        numa_node_id = get_numa_node_id(numa_node_dir)
        try:
            thread_dirs = os.listdir(numa_node_dir)
        except OSError as exc:
            msg = ('Failed to get list of threads for %(node)s: '
                   '%(error)s' % {'node': numa_node_dir, 'error': exc})
            raise errors.IncompatibleNumaFormatError(msg) from exc

        for thread_dir in thread_dirs:
            # Only sub-directories named cpu<thread_id> describe threads.
            if (not os.path.isdir(os.path.join(numa_node_dir, thread_dir))
                    or not thread_dir.startswith("cpu")):
                continue

            try:
                thread_id = int(thread_dir[3:])
            except (ValueError, IndexError) as exc:
                msg = ('Failed to get cores information for '
                       '%(node)s: %(error)s' %
                       {'node': numa_node_dir, 'error': exc})
                raise errors.IncompatibleNumaFormatError(msg) from exc

            core_id_path = os.path.join(numa_node_dir, thread_dir,
                                        'topology', 'core_id')
            try:
                with open(core_id_path) as core_id_file:
                    cpu_id = int(core_id_file.read().strip())
            except (IOError, ValueError) as exc:
                # The space after "thread" keeps the thread name from being
                # glued to the preceding word in the rendered message.
                msg = ('Failed to gather cpu_id for thread '
                       '%(thread)s NUMA node %(node)s: %(error)s' %
                       {'thread': thread_dir, 'node': numa_node_dir,
                        'error': exc})
                raise errors.IncompatibleNumaFormatError(msg) from exc

            # CPU and NUMA node together forms a unique value, as cpu_id is
            # specific to a NUMA node
            # NUMA node id and cpu id tuple is used for unique key
            cpu_item = dict_cpus.setdefault(
                (numa_node_id, cpu_id),
                {'thread_siblings': [], 'cpu': cpu_id,
                 'numa_node': numa_node_id})
            if thread_id not in cpu_item['thread_siblings']:
                cpu_item['thread_siblings'].append(thread_id)
            LOG.debug('Found a thread sibling %d for CPU %d in NUMA node %d',
                      thread_id, cpu_id, numa_node_id)
    return list(dict_cpus.values())
def get_nodes_nics_info(nic_device_path):
    """Collect the NUMA nodes nics information.

    Every entry of ``nic_device_path`` that has a ``device/numa_node`` file
    is reported together with the integer read from that file. Entries
    without such a file are skipped.

    The information is returned in the form of::

        "nics": [
            {"name": "<network interface name>",
             "numa_node": <numa_node_id>},
            ...,
        ]

    :param nic_device_path: nic device directory path
    :raises: IncompatibleNumaFormatError: when unexpected format data
             in NUMA node
    :return: A list of nics information with NUMA node id
    """
    if not os.path.isdir(nic_device_path):
        msg = ('Failed to get list of NIC\'s, NIC device path '
               'does not exist: %(nic_device_path)s' %
               {'nic_device_path': nic_device_path})
        raise errors.IncompatibleNumaFormatError(msg)

    nics = []
    for nic_dir in os.listdir(nic_device_path):
        numa_node_file = os.path.join(nic_device_path, nic_dir,
                                      'device', 'numa_node')
        # A NIC that has no numa_node file is not reported at all.
        if not os.path.isfile(numa_node_file):
            continue

        try:
            with open(numa_node_file) as nicsinfo_file:
                numa_node_id = int(nicsinfo_file.read().strip())
        except (IOError, ValueError) as exc:
            msg = ('Failed to gather NIC\'s for NUMA node %(node)s: '
                   '%(error)s' % {'node': nic_dir, 'error': exc})
            raise errors.IncompatibleNumaFormatError(msg) from exc

        LOG.debug('Found a NIC %s in NUMA node %d', nic_dir,
                  numa_node_id)
        nics.append({'name': nic_dir, 'numa_node': numa_node_id})
    return nics
def collect_numa_topology_info(data, failures):
    """Collect the NUMA topology information.

    The data is gathered from /sys/devices/system/node/node<X> and
    /sys/class/net/ directories. The information is collected in the form of::

        {
            "numa_topology": {
                "ram": [{"numa_node": <numa_node_id>,
                         "size_kb": <memory_in_kb>}, ...],
                "cpus": [
                    {
                        "cpu": <cpu_id>, "numa_node": <numa_node_id>,
                        "thread_siblings": [<list of sibling threads>]
                    },
                    ...,
                ],
                "nics": [
                    {"name": "<network interface name>",
                     "numa_node": <numa_node_id>},
                    ...,
                ]
            }
        }

    If the NUMA node path is missing, or any part of the collection raises
    IncompatibleNumaFormatError, a warning is logged and ``data`` is left
    untouched.

    :param data: mutable data that we'll send to inspector
    :param failures: AccumulatedFailures object (not used by this collector)
    :return: None
    """
    numa_node_path = '/sys/devices/system/node/'
    nic_device_path = '/sys/class/net/'

    if not os.path.isdir(numa_node_path):
        LOG.warning('Failed to get list of NUMA nodes, NUMA node path '
                    'does not exist: %s', numa_node_path)
        return

    # Keep only the node<X> sub-directories; the name is tested first so
    # unrelated entries are rejected without touching the filesystem.
    numa_node_dirs = []
    for entry in os.listdir(numa_node_path):
        entry_path = os.path.join(numa_node_path, entry)
        if entry.startswith("node") and os.path.isdir(entry_path):
            numa_node_dirs.append(entry_path)

    try:
        numa_info = {
            'ram': get_nodes_memory_info(numa_node_dirs),
            'cpus': get_nodes_cores_info(numa_node_dirs),
            'nics': get_nodes_nics_info(nic_device_path),
        }
    except errors.IncompatibleNumaFormatError as exc:
        LOG.warning('Failed to get some NUMA information (%s)', exc)
        return

    # Only publish the topology once every part was collected successfully.
    data['numa_topology'] = numa_info