retry write_sys call on device busy

This change adds a retry_if_busy decorator
to the read_sys and write_sys functions in the filesystem
module that will retry reads and writes up to 5 times with
an linear backoff.

This allows nova to tolerate short periods of time where
sysfs retruns device busy. If the reties are exausted
and offlineing a core fails a warning is log and the failure is
ignored. onling a core is always treated as a hard error if
retries are exausted.

Closes-Bug: #2065927
Change-Id: I2a6a9f243cb403167620405e167a8dd2bbf3fa79
This commit is contained in:
Sean Mooney 2024-05-22 18:59:02 +01:00
parent ee581a5c9d
commit 44c1b48b31
5 changed files with 124 additions and 14 deletions

View File

@ -1147,6 +1147,10 @@ class FileNotFound(NotFound):
msg_fmt = _("File %(file_path)s could not be found.")
class DeviceBusy(NovaException):
msg_fmt = _("device %(file_path)s is busy.")
class ClassNotFound(NotFound):
msg_fmt = _("Class %(class_name)s could not be found: %(exception)s")

View File

@ -12,37 +12,79 @@
"""Functions to address filesystem calls, particularly sysfs."""
import functools
import os
import time
from oslo_log import log as logging
from nova import exception
LOG = logging.getLogger(__name__)
SYS = '/sys'
RETRY_LIMIT = 5
# a retry decorator to handle EBUSY
def retry_if_busy(func):
"""Decorator to retry a function if it raises DeviceBusy.
This decorator will retry the function RETRY_LIMIT=5 times if it raises
DeviceBusy. It will sleep for 1 second on the first retry, 2 seconds on
the second retry, and so on, up to RETRY_LIMIT seconds. If the function
still raises DeviceBusy after RETRY_LIMIT retries, the exception will be
raised.
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
for i in range(RETRY_LIMIT):
try:
return func(*args, **kwargs)
except exception.DeviceBusy as e:
# if we have retried RETRY_LIMIT times, raise the exception
# otherwise, sleep and retry, i is 0-based so we need
# to add 1 to it
count = i + 1
if count < RETRY_LIMIT:
LOG.debug(
f"File {e.kwargs['file_path']} is busy, "
f"sleeping {count} seconds before retrying")
time.sleep(count)
continue
raise
return wrapper
# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
@retry_if_busy
def read_sys(path: str) -> str:
"""Reads the content of a file in the sys filesystem.
:param path: relative or absolute. If relative, will be prefixed by /sys.
:returns: contents of that file.
:raises: nova.exception.FileNotFound if we can't read that file.
:raises: nova.exception.DeviceBusy if the file is busy.
"""
try:
# The path can be absolute with a /sys prefix but that's fine.
with open(os.path.join(SYS, path), mode='r') as data:
return data.read()
except (OSError, ValueError) as exc:
except OSError as exc:
# errno 16 is EBUSY, which means the file is busy.
if exc.errno == 16:
raise exception.DeviceBusy(file_path=path) from exc
# convert permission denied to file not found
raise exception.FileNotFound(file_path=path) from exc
except ValueError as exc:
raise exception.FileNotFound(file_path=path) from exc
# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
# In order to correctly use it, you need to decorate the caller with a specific
# privsep entrypoint.
@retry_if_busy
def write_sys(path: str, data: str) -> None:
"""Writes the content of a file in the sys filesystem with data.
@ -50,10 +92,17 @@ def write_sys(path: str, data: str) -> None:
:param data: the data to write.
:returns: contents of that file.
:raises: nova.exception.FileNotFound if we can't write that file.
:raises: nova.exception.DeviceBusy if the file is busy.
"""
try:
# The path can be absolute with a /sys prefix but that's fine.
with open(os.path.join(SYS, path), mode='w') as fd:
fd.write(data)
except (OSError, ValueError) as exc:
except OSError as exc:
# errno 16 is EBUSY, which means the file is busy.
if exc.errno == 16:
raise exception.DeviceBusy(file_path=path) from exc
# convert permission denied to file not found
raise exception.FileNotFound(file_path=path) from exc
except ValueError as exc:
raise exception.FileNotFound(file_path=path) from exc

View File

@ -19,7 +19,6 @@ from nova import exception
from nova import objects
from nova.tests import fixtures as nova_fixtures
from nova.tests.fixtures import libvirt as fakelibvirt
from nova.tests.functional.api import client
from nova.tests.functional.libvirt import base
from nova.virt import hardware
from nova.virt.libvirt.cpu import api as cpu_api
@ -276,23 +275,24 @@ class PowerManagementTests(PowerManagementTestsBase):
self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
def test_delete_server_device_busy(self):
# This test verifies bug 2065927 is resolved.
server = self.test_create_server()
inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
instance_pcpus = inst.numa_topology.cpu_pinning
self._assert_cpu_set_state(instance_pcpus, expected='online')
with mock.patch(
'nova.filesystem.write_sys',
side_effect=exception.FileNotFound(file_path='fake')):
# This is bug 2065927
self.assertRaises(
client.OpenStackApiException, self._delete_server, server)
'nova.filesystem.write_sys.__wrapped__',
side_effect=[
exception.DeviceBusy(file_path='fake'),
None]):
self._delete_server(server)
cpu_dedicated_set = hardware.get_cpu_dedicated_set()
# Verify that the unused CPUs are still offline
unused_cpus = cpu_dedicated_set - instance_pcpus
self._assert_cpu_set_state(unused_cpus, expected='offline')
# but the instance cpus are still online
self._assert_cpu_set_state(instance_pcpus, expected='online')
# and the pinned CPUs are offline
self._assert_cpu_set_state(instance_pcpus, expected='offline')
def test_create_server_with_emulator_threads_isolate(self):
server = self._create_server(

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import io
import os
from unittest import mock
@ -35,6 +36,29 @@ class TestFSCommon(test.NoDBTestCase):
expected_path = os.path.join(filesystem.SYS, 'foo')
m_open.assert_called_once_with(expected_path, mode='r')
def test_read_sys_retry(self):
open_mock = mock.mock_open()
with mock.patch('builtins.open', open_mock) as m_open:
m_open.side_effect = [
OSError(16, 'Device or resource busy'),
io.StringIO('bar'),
]
self.assertEqual('bar', filesystem.read_sys('foo'))
expected_path = os.path.join(filesystem.SYS, 'foo')
m_open.assert_has_calls([
mock.call(expected_path, mode='r'),
mock.call(expected_path, mode='r'),
])
def test_read_sys_retry_limit(self):
open_mock = mock.mock_open()
with mock.patch('builtins.open', open_mock) as m_open:
m_open.side_effect = (
[OSError(16, 'Device or resource busy')] *
(filesystem.RETRY_LIMIT + 1))
self.assertRaises(
exception.DeviceBusy, filesystem.read_sys, 'foo')
def test_write_sys(self):
open_mock = mock.mock_open()
with mock.patch('builtins.open', open_mock) as m_open:
@ -50,3 +74,26 @@ class TestFSCommon(test.NoDBTestCase):
filesystem.write_sys, 'foo', 'bar')
expected_path = os.path.join(filesystem.SYS, 'foo')
m_open.assert_called_once_with(expected_path, mode='w')
def test_write_sys_retry(self):
open_mock = mock.mock_open()
with mock.patch('builtins.open', open_mock) as m_open:
m_open.side_effect = [
OSError(16, 'Device or resource busy'),
io.StringIO(),
]
self.assertIsNone(filesystem.write_sys('foo', 'bar'))
expected_path = os.path.join(filesystem.SYS, 'foo')
m_open.assert_has_calls([
mock.call(expected_path, mode='w'),
mock.call(expected_path, mode='w'),
])
def test_write_sys_retry_limit(self):
open_mock = mock.mock_open()
with mock.patch('builtins.open', open_mock) as m_open:
m_open.side_effect = (
[OSError(16, 'Device or resource busy')] *
(filesystem.RETRY_LIMIT + 1))
self.assertRaises(
exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar')

View File

@ -56,13 +56,23 @@ def get_online(core: int) -> bool:
@nova.privsep.sys_admin_pctxt.entrypoint
def set_online(core: int) -> bool:
# failure to online a core should be considered a failure
# so we don't catch any exception here.
filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1')
return get_online(core)
@nova.privsep.sys_admin_pctxt.entrypoint
def set_offline(core: int) -> bool:
filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0')
try:
filesystem.write_sys(os.path.join(
gen_cpu_path(core), 'online'), data='0')
except exception.DeviceBusy:
# if nova is not able to offline a core it should not break anything
# so we just log a warning and return False to indicate that the core
# is not offline.
LOG.warning('Unable to offline CPU: %s', core)
return False
return not get_online(core)