diff --git a/lower-constraints.txt b/lower-constraints.txt index 97a98fe814df..482f9012a024 100644 --- a/lower-constraints.txt +++ b/lower-constraints.txt @@ -81,7 +81,7 @@ oslo.context==2.19.2 oslo.db==4.27.0 oslo.i18n==3.15.3 oslo.log==3.36.0 -oslo.messaging==5.29.0 +oslo.messaging==6.3.0 oslo.middleware==3.31.0 oslo.policy==1.35.0 oslo.privsep==1.23.0 diff --git a/nova/compute/rpcapi.py b/nova/compute/rpcapi.py index 56a9eb6e48e9..9ec082620b0b 100644 --- a/nova/compute/rpcapi.py +++ b/nova/compute/rpcapi.py @@ -371,6 +371,11 @@ class ComputeAPI(object): version_cap = self.VERSION_ALIASES.get(upgrade_level, upgrade_level) serializer = objects_base.NovaObjectSerializer() + + # NOTE(danms): We need to poke this path to register CONF options + # that we use in self.get_client() + rpc.get_client(target, version_cap, serializer) + default_client = self.get_client(target, version_cap, serializer) self.router = rpc.ClientRouter(default_client) @@ -417,9 +422,20 @@ class ComputeAPI(object): # Cells overrides this def get_client(self, target, version_cap, serializer): + if CONF.rpc_response_timeout > rpc.HEARTBEAT_THRESHOLD: + # NOTE(danms): If the operator has overridden RPC timeout + # to be longer than rpc.HEARTBEAT_THRESHOLD then configure + # the call monitor timeout to be the threshold to keep the + # failure timing characteristics that our code likely + # expects (from history) while allowing healthy calls + # to run longer. 
+ cmt = rpc.HEARTBEAT_THRESHOLD + else: + cmt = None return rpc.get_client(target, version_cap=version_cap, - serializer=serializer) + serializer=serializer, + call_monitor_timeout=cmt) def add_aggregate_host(self, ctxt, host, aggregate, host_param, slave_info=None): @@ -684,7 +700,9 @@ class ComputeAPI(object): host, migrate_data): version = '5.0' client = self.router.client(ctxt) - cctxt = client.prepare(server=host, version=version) + cctxt = client.prepare(server=host, version=version, + timeout=CONF.long_rpc_timeout, + call_monitor_timeout=CONF.rpc_response_timeout) return cctxt.call(ctxt, 'pre_live_migration', instance=instance, block_migration=block_migration, diff --git a/nova/conf/__init__.py b/nova/conf/__init__.py index 377435add797..7649e3d66984 100644 --- a/nova/conf/__init__.py +++ b/nova/conf/__init__.py @@ -55,6 +55,7 @@ from nova.conf import powervm from nova.conf import quota from nova.conf import rdp from nova.conf import remote_debug +from nova.conf import rpc from nova.conf import scheduler from nova.conf import serial_console from nova.conf import service @@ -107,6 +108,7 @@ placement.register_opts(CONF) powervm.register_opts(CONF) quota.register_opts(CONF) rdp.register_opts(CONF) +rpc.register_opts(CONF) scheduler.register_opts(CONF) serial_console.register_opts(CONF) service.register_opts(CONF) diff --git a/nova/conf/rpc.py b/nova/conf/rpc.py new file mode 100644 index 000000000000..a74ef10de451 --- /dev/null +++ b/nova/conf/rpc.py @@ -0,0 +1,46 @@ +# Copyright 2018 OpenStack Foundation +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from oslo_config import cfg + +rpc_opts = [ + cfg.IntOpt("long_rpc_timeout", + default=1800, + help=""" +This option allows setting an alternate timeout value for RPC calls +that have the potential to take a long time. If set, RPC calls to +other services will use this value for the timeout (in seconds) +instead of the global rpc_response_timeout value. + +Operations with RPC calls that utilize this value: + +* live migration + +Related options: + +* rpc_response_timeout +"""), +] + + +ALL_OPTS = rpc_opts + + +def register_opts(conf): + conf.register_opts(ALL_OPTS) + + +def list_opts(): + return {'DEFAULT': ALL_OPTS} diff --git a/nova/rpc.py b/nova/rpc.py index 7ed87dde5a11..6d7de3d67690 100644 --- a/nova/rpc.py +++ b/nova/rpc.py @@ -51,6 +51,10 @@ LEGACY_NOTIFIER = None NOTIFICATION_TRANSPORT = None NOTIFIER = None +# NOTE(danms): If rpc_response_timeout is over this value (per-call or +# globally), we will enable heartbeating +HEARTBEAT_THRESHOLD = 60 + ALLOWED_EXMODS = [ nova.exception.__name__, ] @@ -172,7 +176,8 @@ def get_transport_url(url_str=None): return messaging.TransportURL.parse(CONF, url_str) -def get_client(target, version_cap=None, serializer=None): +def get_client(target, version_cap=None, serializer=None, + call_monitor_timeout=None): assert TRANSPORT is not None if profiler: @@ -183,7 +188,8 @@ def get_client(target, version_cap=None, serializer=None): return messaging.RPCClient(TRANSPORT, target, version_cap=version_cap, - serializer=serializer) + serializer=serializer, + call_monitor_timeout=call_monitor_timeout) def 
get_server(target, endpoints, serializer=None): @@ -413,8 +419,10 @@ class ClientRouter(periodic_task.PeriodicTasks): def client(self, context): transport = context.mq_connection if transport: + cmt = self.default_client.call_monitor_timeout return messaging.RPCClient(transport, self.target, version_cap=self.version_cap, - serializer=self.serializer) + serializer=self.serializer, + call_monitor_timeout=cmt) else: return self.default_client diff --git a/nova/tests/fixtures.py b/nova/tests/fixtures.py index 7aa5e816b56d..162f7e4033d4 100644 --- a/nova/tests/fixtures.py +++ b/nova/tests/fixtures.py @@ -484,13 +484,15 @@ class CellDatabases(fixtures.Fixture): executor='eventlet', serializer=serializer) - def _wrap_get_client(self, target, version_cap=None, serializer=None): + def _wrap_get_client(self, target, version_cap=None, serializer=None, + call_monitor_timeout=None): """Mirror rpc.get_client() but with our special sauce.""" serializer = CheatingSerializer(serializer) return messaging.RPCClient(rpc.TRANSPORT, target, version_cap=version_cap, - serializer=serializer) + serializer=serializer, + call_monitor_timeout=call_monitor_timeout) def add_cell_database(self, connection_str, default=False): """Add a cell database to the fixture. 
diff --git a/nova/tests/unit/compute/test_rpcapi.py b/nova/tests/unit/compute/test_rpcapi.py index 415b16ec1afb..03100f721818 100644 --- a/nova/tests/unit/compute/test_rpcapi.py +++ b/nova/tests/unit/compute/test_rpcapi.py @@ -122,6 +122,14 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): base_version = rpcapi.router.target.version expected_version = kwargs.pop('version', base_version) + prepare_extra_kwargs = {} + cm_timeout = kwargs.pop('call_monitor_timeout', None) + timeout = kwargs.pop('timeout', None) + if cm_timeout: + prepare_extra_kwargs['call_monitor_timeout'] = cm_timeout + if timeout: + prepare_extra_kwargs['timeout'] = timeout + expected_kwargs = kwargs.copy() if expected_args: expected_kwargs.update(expected_args) @@ -170,7 +178,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): self.assertEqual(retval, rpc_mock.return_value) prepare_mock.assert_called_once_with(version=expected_version, - server=host) + server=host, + **prepare_extra_kwargs) rpc_mock.assert_called_once_with(ctxt, method, **expected_kwargs) def test_add_aggregate_host(self): @@ -354,10 +363,12 @@ class ComputeRpcAPITestCase(test.NoDBTestCase): instance=self.fake_instance_obj, version='5.0') def test_pre_live_migration(self): + self.flags(long_rpc_timeout=1234) self._test_compute_api('pre_live_migration', 'call', instance=self.fake_instance_obj, block_migration='block_migration', disk='disk', host='host', - migrate_data=None, version='5.0') + migrate_data=None, version='5.0', + call_monitor_timeout=60, timeout=1234) def test_prep_resize(self): expected_args = {'migration': 'migration'} diff --git a/nova/tests/unit/test_rpc.py b/nova/tests/unit/test_rpc.py index 0d8d3842ae76..ef89921829be 100644 --- a/nova/tests/unit/test_rpc.py +++ b/nova/tests/unit/test_rpc.py @@ -208,6 +208,7 @@ class TestRPC(test.NoDBTestCase): mock_ser.assert_called_once_with('foo') mock_client.assert_called_once_with(rpc.TRANSPORT, tgt, version_cap='1.0', + call_monitor_timeout=None, serializer=ser) 
self.assertEqual('client', client) @@ -246,6 +247,7 @@ class TestRPC(test.NoDBTestCase): mock_ser.assert_called_once_with('foo') mock_client.assert_called_once_with(rpc.TRANSPORT, tgt, version_cap='1.0', + call_monitor_timeout=None, serializer=ser) self.assertEqual('client', client) @@ -472,6 +474,7 @@ class TestClientRouter(test.NoDBTestCase): mock_rpcclient.assert_called_once_with( mock.sentinel.transport, default_client.target, version_cap=default_client.version_cap, + call_monitor_timeout=default_client.call_monitor_timeout, serializer=default_client.serializer) # verify cell client was returned self.assertEqual(cell_client, client) diff --git a/releasenotes/notes/rpc_timeout_changes-6b7e365bb44f7f3a.yaml b/releasenotes/notes/rpc_timeout_changes-6b7e365bb44f7f3a.yaml new file mode 100644 index 000000000000..4ff4c02c56a0 --- /dev/null +++ b/releasenotes/notes/rpc_timeout_changes-6b7e365bb44f7f3a.yaml @@ -0,0 +1,21 @@ +--- +features: + - | + Utilizing recent changes in oslo.messaging, the + `rpc_response_timeout` value can now be increased significantly if + needed or desired to solve issues with long-running RPC calls + timing out before completing due to legitimate reasons (such as + live migration prep). If `rpc_response_timeout` is increased + beyond the default, nova will request active call monitoring from + oslo.messaging, which will effectively heartbeat running + activities to avoid a timeout, while still detecting failures + related to service outages or message bus congestion in a + reasonable amount of time. Further, the + `[DEFAULT]/long_rpc_timeout` option has been added which allows + setting an alternate timeout value for longer-running RPC calls + which are known to take a long time. The default for this is 1800 + seconds, and the `rpc_response_timeout` value will be used for the + heartbeat frequency interval, providing a similar + failure-detection experience for these calls despite the longer + overall timeout. 
Currently, only the live migration preparation RPC call + (`pre_live_migration`) uses this longer timeout value. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5dc08840c476..a4034ef5884b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,7 +43,7 @@ oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0 oslo.utils>=3.33.0 # Apache-2.0 oslo.db>=4.27.0 # Apache-2.0 oslo.rootwrap>=5.8.0 # Apache-2.0 -oslo.messaging>=5.29.0 # Apache-2.0 +oslo.messaging>=6.3.0 # Apache-2.0 oslo.policy>=1.35.0 # Apache-2.0 oslo.privsep>=1.23.0 # Apache-2.0 oslo.i18n>=3.15.3 # Apache-2.0