Files
nova/nova/virt/libvirt/cpu/core.py
Sean Mooney 44c1b48b31 retry write_sys call on device busy
This change adds a retry_if_busy decorator
to the read_sys and write_sys functions in the filesystem
module that will retry reads and writes up to 5 times with
a linear backoff.

This allows nova to tolerate short periods of time where
sysfs returns device busy. If the retries are exhausted
and offlining a core fails, a warning is logged and the failure is
ignored. Onlining a core is always treated as a hard error if
retries are exhausted.

Closes-Bug: #2065927
Change-Id: I2a6a9f243cb403167620405e167a8dd2bbf3fa79
2024-05-27 18:31:31 +01:00

90 lines
2.8 KiB
Python

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os
import typing as ty
from oslo_log import log as logging
from nova import exception
from nova import filesystem
import nova.privsep
from nova.virt import hardware
# Module-level logger, named after this module.
LOG = logging.getLogger(__name__)
# sysfs file read by get_available_cores(); its content is parsed as a
# CPU spec string.
AVAILABLE_PATH = '/sys/devices/system/cpu/present'
# Template for a core's sysfs directory; gen_cpu_path() fills in 'core'.
CPU_PATH_TEMPLATE = '/sys/devices/system/cpu/cpu%(core)s'
def get_available_cores() -> ty.Set[int]:
    """Return the set of core ids listed in AVAILABLE_PATH.

    The file content is read with filesystem.read_sys() and parsed with
    hardware.parse_cpu_spec(). An empty set is returned when the read
    yields a falsy value (for example an empty string).
    """
    present_spec = filesystem.read_sys(AVAILABLE_PATH)
    if not present_spec:
        return set()
    return hardware.parse_cpu_spec(present_spec)
def exists(core: int) -> bool:
    """Tell whether ``core`` is one of the cores from get_available_cores().

    :param core: the core id to look up
    :returns: True if the id is in the available set, False otherwise
    """
    available = get_available_cores()
    return core in available
def gen_cpu_path(core: int) -> str:
    """Build the sysfs directory path for ``core``.

    :param core: the core id to build the path for
    :returns: CPU_PATH_TEMPLATE with the core id filled in
    :raises ValueError: if exists() reports that the core is not available
    """
    if not exists(core):
        LOG.warning('Unable to access CPU: %s', core)
        # Interpolate the message here: exceptions do not apply %-formatting
        # to their arguments, so passing ``core`` as a second positional
        # argument would leave the placeholder unformatted.
        raise ValueError('CPU: %(core)s does not exist' % {'core': core})
    return CPU_PATH_TEMPLATE % {'core': core}
def get_online(core: int) -> bool:
    """Report whether ``core`` is online according to its 'online' file.

    :param core: the core id to query
    :returns: True if the stripped file content is '1', or if reading
        raised exception.FileNotFound; False otherwise
    """
    try:
        # The path is built inside the try block on purpose: the lookup
        # done by gen_cpu_path() is covered by the same handler.
        online_file = os.path.join(gen_cpu_path(core), 'online')
        state = filesystem.read_sys(online_file).strip()
    except exception.FileNotFound:
        # The online file may not exist if we haven't written it yet.
        # By default, this means that the CPU is online.
        return True
    return state == '1'
@nova.privsep.sys_admin_pctxt.entrypoint
def set_online(core: int) -> bool:
    """Write '1' to the 'online' file of ``core`` and report the result.

    Failing to bring a core online is treated as a real failure, so no
    exception is handled here; anything raised propagates to the caller.

    :param core: the core id to bring online
    :returns: the value of get_online() for the core after the write
    """
    online_file = os.path.join(gen_cpu_path(core), 'online')
    filesystem.write_sys(online_file, data='1')
    return get_online(core)
@nova.privsep.sys_admin_pctxt.entrypoint
def set_offline(core: int) -> bool:
    """Write '0' to the 'online' file of ``core`` and report the result.

    :param core: the core id to take offline
    :returns: False if exception.DeviceBusy was raised while writing,
        otherwise the negation of get_online() for the core
    """
    try:
        # Path construction stays inside the try block so that it is
        # covered by the same DeviceBusy handler as the write itself.
        online_file = os.path.join(gen_cpu_path(core), 'online')
        filesystem.write_sys(online_file, data='0')
    except exception.DeviceBusy:
        # Not being able to offline a core should not break anything,
        # so only log a warning and tell the caller that the core is
        # not offline.
        LOG.warning('Unable to offline CPU: %s', core)
        return False
    else:
        return not get_online(core)
def get_governor(core: int) -> str:
    """Read the 'cpufreq/scaling_governor' file of ``core``.

    :param core: the core id to query
    :returns: the file content with surrounding whitespace stripped
    """
    governor_file = os.path.join(
        gen_cpu_path(core), 'cpufreq/scaling_governor')
    raw_value = filesystem.read_sys(governor_file)
    return raw_value.strip()
@nova.privsep.sys_admin_pctxt.entrypoint
def set_governor(core: int, governor: str) -> str:
    """Write ``governor`` to the 'cpufreq/scaling_governor' file of ``core``.

    :param core: the core id to update
    :param governor: the value written to the governor file
    :returns: the value of get_governor() for the core after the write
    """
    governor_file = os.path.join(
        gen_cpu_path(core), 'cpufreq/scaling_governor')
    filesystem.write_sys(governor_file, data=governor)
    return get_governor(core)