diff --git a/nova/exception.py b/nova/exception.py index c3ada86c1e21..3ba9ed64cfec 100644 --- a/nova/exception.py +++ b/nova/exception.py @@ -1131,6 +1131,10 @@ class FileNotFound(NotFound): msg_fmt = _("File %(file_path)s could not be found.") +class DeviceBusy(NovaException): + msg_fmt = _("device %(file_path)s is busy.") + + class ClassNotFound(NotFound): msg_fmt = _("Class %(class_name)s could not be found: %(exception)s") diff --git a/nova/filesystem.py b/nova/filesystem.py index 5394d2d8351c..2db3093236e6 100644 --- a/nova/filesystem.py +++ b/nova/filesystem.py @@ -12,37 +12,79 @@ """Functions to address filesystem calls, particularly sysfs.""" +import functools import os +import time from oslo_log import log as logging from nova import exception LOG = logging.getLogger(__name__) - - SYS = '/sys' +RETRY_LIMIT = 5 +# a retry decorator to handle EBUSY +def retry_if_busy(func): + """Decorator to retry a function if it raises DeviceBusy. + + This decorator will retry the function RETRY_LIMIT=5 times if it raises + DeviceBusy. It will sleep for 1 second on the first retry, 2 seconds on + the second retry, and so on, up to RETRY_LIMIT seconds. If the function + still raises DeviceBusy after RETRY_LIMIT retries, the exception will be + raised. + """ + + @functools.wraps(func) + def wrapper(*args, **kwargs): + for i in range(RETRY_LIMIT): + try: + return func(*args, **kwargs) + except exception.DeviceBusy as e: + # if we have retried RETRY_LIMIT times, raise the exception + # otherwise, sleep and retry, i is 0-based so we need + # to add 1 to it + count = i + 1 + if count < RETRY_LIMIT: + LOG.debug( + f"File {e.kwargs['file_path']} is busy, " + f"sleeping {count} seconds before retrying") + time.sleep(count) + continue + raise + return wrapper + # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint + + +@retry_if_busy def read_sys(path: str) -> str: """Reads the content of a file in the sys filesystem. :param path: relative or absolute. If relative, will be prefixed by /sys. :returns: contents of that file. :raises: nova.exception.FileNotFound if we can't read that file. + :raises: nova.exception.DeviceBusy if the file is busy. """ try: # The path can be absolute with a /sys prefix but that's fine. with open(os.path.join(SYS, path), mode='r') as data: return data.read() - except (OSError, ValueError) as exc: + except OSError as exc: + # errno 16 is EBUSY, which means the file is busy. + if exc.errno == 16: + raise exception.DeviceBusy(file_path=path) from exc + # convert permission denied to file not found + raise exception.FileNotFound(file_path=path) from exc + except ValueError as exc: raise exception.FileNotFound(file_path=path) from exc # NOTE(bauzas): this method is deliberately not wrapped in a privsep entrypoint # In order to correctly use it, you need to decorate the caller with a specific # privsep entrypoint. +@retry_if_busy def write_sys(path: str, data: str) -> None: """Writes the content of a file in the sys filesystem with data. @@ -50,10 +92,17 @@ def write_sys(path: str, data: str) -> None: :param data: the data to write. :returns: contents of that file. :raises: nova.exception.FileNotFound if we can't write that file. + :raises: nova.exception.DeviceBusy if the file is busy. """ try: # The path can be absolute with a /sys prefix but that's fine. with open(os.path.join(SYS, path), mode='w') as fd: fd.write(data) - except (OSError, ValueError) as exc: + except OSError as exc: + # errno 16 is EBUSY, which means the file is busy. + if exc.errno == 16: + raise exception.DeviceBusy(file_path=path) from exc + # convert permission denied to file not found + raise exception.FileNotFound(file_path=path) from exc + except ValueError as exc: raise exception.FileNotFound(file_path=path) from exc diff --git a/nova/tests/functional/libvirt/test_power_manage.py b/nova/tests/functional/libvirt/test_power_manage.py index d707168778b5..3839c798317f 100644 --- a/nova/tests/functional/libvirt/test_power_manage.py +++ b/nova/tests/functional/libvirt/test_power_manage.py @@ -19,7 +19,6 @@ from nova import exception from nova import objects from nova.tests import fixtures as nova_fixtures from nova.tests.fixtures import libvirt as fakelibvirt -from nova.tests.functional.api import client from nova.tests.functional.libvirt import base from nova.virt import hardware from nova.virt.libvirt.cpu import api as cpu_api @@ -276,23 +275,24 @@ class PowerManagementTests(PowerManagementTestsBase): self._assert_cpu_set_state(cpu_dedicated_set, expected='offline') def test_delete_server_device_busy(self): + # This test verifies bug 2065927 is resolved. server = self.test_create_server() - inst = objects.Instance.get_by_uuid(self.ctxt, server['id']) instance_pcpus = inst.numa_topology.cpu_pinning self._assert_cpu_set_state(instance_pcpus, expected='online') with mock.patch( - 'nova.filesystem.write_sys', - side_effect=exception.FileNotFound(file_path='fake')): - # This is bug 2065927 - self.assertRaises( - client.OpenStackApiException, self._delete_server, server) + 'nova.filesystem.write_sys.__wrapped__', + side_effect=[ + exception.DeviceBusy(file_path='fake'), + None]): + + self._delete_server(server) cpu_dedicated_set = hardware.get_cpu_dedicated_set() # Verify that the unused CPUs are still offline unused_cpus = cpu_dedicated_set - instance_pcpus self._assert_cpu_set_state(unused_cpus, expected='offline') - # but the instance cpus are still online - self._assert_cpu_set_state(instance_pcpus, expected='online') + # and the pinned CPUs are offline + self._assert_cpu_set_state(instance_pcpus, expected='offline') def test_create_server_with_emulator_threads_isolate(self): server = self._create_server( diff --git a/nova/tests/unit/test_filesystem.py b/nova/tests/unit/test_filesystem.py index 85f16157eebe..5c3107472d83 100644 --- a/nova/tests/unit/test_filesystem.py +++ b/nova/tests/unit/test_filesystem.py @@ -10,6 +10,7 @@ # License for the specific language governing permissions and limitations # under the License. +import io import os from unittest import mock @@ -35,6 +36,29 @@ class TestFSCommon(test.NoDBTestCase): expected_path = os.path.join(filesystem.SYS, 'foo') m_open.assert_called_once_with(expected_path, mode='r') + def test_read_sys_retry(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = [ + OSError(16, 'Device or resource busy'), + io.StringIO('bar'), + ] + self.assertEqual('bar', filesystem.read_sys('foo')) + expected_path = os.path.join(filesystem.SYS, 'foo') + m_open.assert_has_calls([ + mock.call(expected_path, mode='r'), + mock.call(expected_path, mode='r'), + ]) + + def test_read_sys_retry_limit(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = ( + [OSError(16, 'Device or resource busy')] * + (filesystem.RETRY_LIMIT + 1)) + self.assertRaises( + exception.DeviceBusy, filesystem.read_sys, 'foo') + def test_write_sys(self): open_mock = mock.mock_open() with mock.patch('builtins.open', open_mock) as m_open: @@ -50,3 +74,26 @@ class TestFSCommon(test.NoDBTestCase): filesystem.write_sys, 'foo', 'bar') expected_path = os.path.join(filesystem.SYS, 'foo') m_open.assert_called_once_with(expected_path, mode='w') + + def test_write_sys_retry(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = [ + OSError(16, 'Device or resource busy'), + io.StringIO(), + ] + self.assertIsNone(filesystem.write_sys('foo', 'bar')) + expected_path = os.path.join(filesystem.SYS, 'foo') + m_open.assert_has_calls([ + mock.call(expected_path, mode='w'), + mock.call(expected_path, mode='w'), + ]) + + def test_write_sys_retry_limit(self): + open_mock = mock.mock_open() + with mock.patch('builtins.open', open_mock) as m_open: + m_open.side_effect = ( + [OSError(16, 'Device or resource busy')] * + (filesystem.RETRY_LIMIT + 1)) + self.assertRaises( + exception.DeviceBusy, filesystem.write_sys, 'foo', 'bar') diff --git a/nova/virt/libvirt/cpu/core.py b/nova/virt/libvirt/cpu/core.py index 8274236850cc..2d71bd60e40d 100644 --- a/nova/virt/libvirt/cpu/core.py +++ b/nova/virt/libvirt/cpu/core.py @@ -56,13 +56,23 @@ def get_online(core: int) -> bool: @nova.privsep.sys_admin_pctxt.entrypoint def set_online(core: int) -> bool: + # failure to online a core should be considered a failure + # so we don't catch any exception here. filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='1') return get_online(core) @nova.privsep.sys_admin_pctxt.entrypoint def set_offline(core: int) -> bool: - filesystem.write_sys(os.path.join(gen_cpu_path(core), 'online'), data='0') + try: + filesystem.write_sys(os.path.join( + gen_cpu_path(core), 'online'), data='0') + except exception.DeviceBusy: + # if nova is not able to offline a core it should not break anything + # so we just log a warning and return False to indicate that the core + # is not offline. + LOG.warning('Unable to offline CPU: %s', core) + return False return not get_online(core)