retry write_sys call on device busy
This change adds a retry_if_busy decorator to the read_sys and write_sys functions in the filesystem module that will retry reads and writes up to 5 times with an linear backoff. This allows nova to tolerate short periods of time where sysfs retruns device busy. If the reties are exausted and offlineing a core fails a warning is log and the failure is ignored. onling a core is always treated as a hard error if retries are exausted. Closes-Bug: #2065927 Change-Id: I2a6a9f243cb403167620405e167a8dd2bbf3fa79
This commit is contained in:
parent
ee581a5c9d
commit
44c1b48b31
@ -1147,6 +1147,10 @@ class FileNotFound(NotFound):
|
||||
msg_fmt = _("File %(file_path)s could not be found.")
|
||||
|
||||
|
||||
class DeviceBusy(NovaException):
|
||||
msg_fmt = _("device %(file_path)s is busy.")
|
||||
|
||||
|
||||
class ClassNotFound(NotFound):
|
||||
msg_fmt = _("Class %(class_name)s could not be found: %(exception)s")
|
||||
|
||||
|
@ -12,37 +12,79 @@
|
||||
|
||||
"""Functions to address filesystem calls, particularly sysfs."""
|
||||
|
||||
import functools
|
||||
import os
|
||||
import time
|
||||
|
||||
from oslo_log import log as logging
|
||||
|
||||
from nova import exception
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
SYS = '/sys'
|
||||
RETRY_LIMIT = 5
|
||||
|
||||
|
||||
# a retry decorator to handle EBUSY
|
||||
def retry_if_busy(func):
|
||||
"""Decorator to retry a function if it raises DeviceBusy.
|
||||
|
||||
This decorator will retry the function RETRY_LIMIT=5 times if it raises
|
||||
DeviceBusy. It will sleep for 1 second on the first retry, 2 seconds on
|
||||
the second retry, and so on, up to RETRY_LIMIT seconds. If the function
|
||||
still raises DeviceBusy after RETRY_LIMIT retries, the exception will be
|
||||
raised.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
for i in range(RETRY_LIMIT):
|
||||
try:
|
||||
return func(*args, **kwargs)
|
||||
except exception.DeviceBusy as e:
|
||||
# if we have retried RETRY_LIMIT times, raise the exception
|
||||
# otherwise, sleep and retry, i is 0-based so we need
|
||||
# to add 1 to it
|
||||
count = i + 1
|
||||
if count < RETRY_LIMIT:
|
||||
LOG.debug(
|
||||
f"File {e.kwargs['file_path']} is busy, "
|
||||
f"sleeping {count} seconds before retrying")
|
||||
time.sleep(count)
|
||||
continue
|
||||
raise
|
||||
return wrapper
|
||||
|
||||
# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
|
||||
|
||||
|
||||
@retry_if_busy
|
||||
def read_sys(path: str) -> str:
|
||||
"""Reads the content of a file in the sys filesystem.
|
||||
|
||||
:param path: relative or absolute. If relative, will be prefixed by /sys.
|
||||
:returns: contents of that file.
|
||||
:raises: nova.exception.FileNotFound if we can't read that file.
|
||||
:raises: nova.exception.DeviceBusy if the file is busy.
|
||||
"""
|
||||
try:
|
||||
# The path can be absolute with a /sys prefix but that's fine.
|
||||
with open(os.path.join(SYS, path), mode='r') as data:
|
||||
return data.read()
|
||||
except (OSError, ValueError) as exc:
|
||||
except OSError as exc:
|
||||
# errno 16 is EBUSY, which means the file is busy.
|
||||
if exc.errno == 16:
|
||||
raise exception.DeviceBusy(file_path=path) from exc
|
||||
# convert permission denied to file not found
|
||||
raise exception.FileNotFound(file_path=path) from exc
|
||||
except ValueError as exc:
|
||||
raise exception.FileNotFound(file_path=path) from exc
|
||||
|
||||
|
||||
# NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint
|
||||
# In order to correctly use it, you need to decorate the caller with a specific
|
||||
# privsep entrypoint.
|
||||
@retry_if_busy
|
||||
def write_sys(path: str, data: str) -> None:
|
||||
"""Writes the content of a file in the sys filesystem with data.
|
||||
|
||||
@ -50,10 +92,17 @@ def write_sys(path: str, data: str) -> None:
|
||||
:param data: the data to write.
|
||||
:returns: contents of that file.
|
||||
:raises: nova.exception.FileNotFound if we can't write that file.
|
||||
:raises: nova.exception.DeviceBusy if the file is busy.
|
||||
"""
|
||||
try:
|
||||
# The path can be absolute with a /sys prefix but that's fine.
|
||||
with open(os.path.join(SYS, path), mode='w') as fd:
|
||||
fd.write(data)
|
||||
except (OSError, ValueError) as exc:
|
||||
except OSError as exc:
|
||||
# errno 16 is EBUSY, which means the file is busy.
|
||||
if exc.errno == 16:
|
||||
raise exception.DeviceBusy(file_path=path) from exc
|
||||
# convert permission denied to file not found
|
||||
raise exception.FileNotFound(file_path=path) from exc
|
||||
except ValueError as exc:
|
||||
raise exception.FileNotFound(file_path=path) from exc
|
||||
|
@ -19,7 +19,6 @@ from nova import exception
|
||||
from nova import objects
|
||||
from nova.tests import fixtures as nova_fixtures
|
||||
from nova.tests.fixtures import libvirt as fakelibvirt
|
||||
from nova.tests.functional.api import client
|
||||
from nova.tests.functional.libvirt import base
|
||||
from nova.virt import hardware
|
||||
from nova.virt.libvirt.cpu import api as cpu_api
|
||||
@ -276,23 +275,24 @@ class PowerManagementTests(PowerManagementTestsBase):
|
||||
self._assert_cpu_set_state(cpu_dedicated_set, expected='offline')
|
||||
|
||||
def test_delete_server_device_busy(self):
|
||||
# This test verifies bug 2065927 is resolved.
|
||||
server = self.test_create_server()
|
||||
|
||||
inst = objects.Instance.get_by_uuid(self.ctxt, server['id'])
|
||||
instance_pcpus = inst.numa_topology.cpu_pinning
|
||||
self._assert_cpu_set_state(instance_pcpus, expected='online')
|
||||
with mock.patch(
|
||||
'nova.filesystem.write_sys',
|
||||
side_effect=exception.FileNotFound(file_path='fake')):
|
||||
# This is bug 2065927
|
||||
self.assertRaises(
|
||||
client.OpenStackApiException, self._delete_server, server)
|
||||
'nova.filesystem.write_sys.__wrapped__',
|
||||
side_effect=[
|
||||
exception.DeviceBusy(file_path='fake'),
|
||||
None]):
|
||||
|
||||
self._delete_server(server)
|
||||
cpu_dedicated_set = hardware.get_cpu_dedicated_set()
|
||||
# Verify that the unused CPUs are still offline
|
||||
unused_cpus = cpu_dedicated_set - instance_pcpus
|
||||
self._assert_cpu_set_state(unused_cpus, expected='offline')
|
||||
# but the instance cpus are still online
|
||||
self._assert_cpu_set_state(instance_pcpus, expected='online')
|
||||
# and the pinned CPUs are offline
|
||||
self._assert_cpu_set_state(instance_pcpus, expected='offline')
|
||||
|
||||
def test_create_server_with_emulator_threads_isolate(self):
|
||||
server = self._create_server(
|
||||
|
@ -10,6 +10,7 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import io
|
||||
import os
|
||||
from unittest import mock
|
||||
|
||||
@ -35,6 +36,29 @@ class TestFSCommon(test.NoDBTestCase):
|
||||
expected_path = os.path.join(filesystem.SYS, 'foo')
|
||||
m_open.assert_called_once_with(expected_path, mode='r')
|
||||
|
||||
def test_read_sys_retry(self):
|
||||
open_mock = mock.mock_open()
|
||||
with mock.patch('builtins.open', open_mock) as m_open:
|
||||
m_open.side_effect = [
|
||||
OSError(16, 'Device or resource busy'),
|
||||
io.StringIO('bar'),
|
||||
]
|
||||
self.assertEqual('bar', filesystem.read_sys('foo'))
|
||||
expected_path = os.path.join(filesystem.SYS, 'foo')
|
||||
m_open.assert_has_calls([
|
||||
mock.call(expected_path, mode='r'),
|
||||
mock.call(expected_path, mode='r'),
|
||||
])
|
||||
|
||||
def test_read_sys_retry_limit(self):
|
||||
open_mock = mock.mock_open()
|
||||
with mock.patch('builtins.open', open_mock) as m_open:
|
||||
m_open.side_effect = (
|
||||
[OSError(16, 'Device or resource busy')] *
|
||||
(filesystem.RETRY_LIMIT + 1))
|
||||
self.assertRaises(
|
||||
exception.DeviceBusy, filesystem.read_sys, 'foo')
|
||||
|
||||
def test_write_sys(self):
|
||||
open_mock = mock.mock_open()
|
||||
with mock.patch('builtins.open', open_mock) as m_open:
|
||||
@ -50,3 +74,26 @@ class TestFSCommon(test.NoDBTestCase):
|
||||
filesystem.write_sys, 'foo', 'bar')
|
||||
expected_path = os.path.join(filesystem.SYS, 'foo')
|
||||
m_open.assert_called_once_with(expected_path, mode='w')
|
||||
|
||||
def test_write_sys_retry(self):
|
||||
open_mock = mock.mock_open()
|
||||
with mock.patch('builtins.open', open_mock) as m_open:
|
||||
m_open.side_effect = [
|
||||
OSError(16, 'Device or resource busy'),
|
||||
io.StringIO(),
|
||||
]
|
||||
self.assertIsNone(filesystem.write_sys('foo', 'bar'))
|
||||
expected_path = os.path.join(filesystem.SYS, 'foo')
|
||||
m_open.assert_has_calls([
|
||||
mock.call(expected_path, mode='w'),
|
||||
mock.call(expected_path, mode='w'),
|
||||
])
|
||||
|
||||
def test_write_sys_retry_limit(self):
|
||||
open_mock = mock.mock_open()
|
||||
with mock.patch('builtins.open', open_mock) as m_open:
|
||||
m_open.side_effect = (
|
||||
[OSError(16, 'Device or resource busy')] *
|
||||
(filesystem.RETRY_LIMIT + 1))
|
||||
self.assertRaises(
|
||||
exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar')
|
||||
|
@ -56,13 +56,23 @@ def get_online(core: int) -> bool:
|
||||
|
||||
@nova.privsep.sys_admin_pctxt.entrypoint
|
||||
def set_online(core: int) -> bool:
|
||||
# failure to online a core should be considered a failure
|
||||
# so we don't catch any exception here.
|
||||
filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1')
|
||||
return get_online(core)
|
||||
|
||||
|
||||
@nova.privsep.sys_admin_pctxt.entrypoint
|
||||
def set_offline(core: int) -> bool:
|
||||
filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0')
|
||||
try:
|
||||
filesystem.write_sys(os.path.join(
|
||||
gen_cpu_path(core), 'online'), data='0')
|
||||
except exception.DeviceBusy:
|
||||
# if nova is not able to offline a core it should not break anything
|
||||
# so we just log a warning and return False to indicate that the core
|
||||
# is not offline.
|
||||
LOG.warning('Unable to offline CPU: %s', core)
|
||||
return False
|
||||
return not get_online(core)
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user