Use oslo.messaging per-call monitoring
This change makes nova configure oslo.messaging's active call monitoring feature if the operator increases the rpc_response_timeout configuration option beyond the default of 60 seconds. If this happens, oslo.messaging will heartbeat actively-running calls to indicate that they are still running, avoiding a false timeout at the shorter interval, while still detecting actual dead-service failures before the longer timeout value. In addition, this adds a long_rpc_timeout configuration option that we can use for known-to-run-long operations separately from the base rpc_response_timeout value, and pre_live_migration() is changed to use this, as it is known to suffer from early false timeouts. Depends-On: Iecb7bef61b3b8145126ead1f74dbaadd7d97b407 Change-Id: Icb0bdc6d4ce4524341e70e737eafcb25f346d197
This commit is contained in:
parent
ef6b4f43f4
commit
fe26a52024
@ -81,7 +81,7 @@ oslo.context==2.19.2
|
||||
oslo.db==4.27.0
|
||||
oslo.i18n==3.15.3
|
||||
oslo.log==3.36.0
|
||||
oslo.messaging==5.29.0
|
||||
oslo.messaging==6.3.0
|
||||
oslo.middleware==3.31.0
|
||||
oslo.policy==1.35.0
|
||||
oslo.privsep==1.23.0
|
||||
|
@ -371,6 +371,11 @@ class ComputeAPI(object):
|
||||
version_cap = self.VERSION_ALIASES.get(upgrade_level,
|
||||
upgrade_level)
|
||||
serializer = objects_base.NovaObjectSerializer()
|
||||
|
||||
# NOTE(danms): We need to poke this path to register CONF options
|
||||
# that we use in self.get_client()
|
||||
rpc.get_client(target, version_cap, serializer)
|
||||
|
||||
default_client = self.get_client(target, version_cap, serializer)
|
||||
self.router = rpc.ClientRouter(default_client)
|
||||
|
||||
@ -417,9 +422,20 @@ class ComputeAPI(object):
|
||||
|
||||
# Cells overrides this
|
||||
def get_client(self, target, version_cap, serializer):
|
||||
if CONF.rpc_response_timeout > rpc.HEARTBEAT_THRESHOLD:
|
||||
# NOTE(danms): If the operator has overridden RPC timeout
|
||||
# to be longer than rpc.HEARTBEAT_THRESHOLD then configure
|
||||
# the call monitor timeout to be the threshold to keep the
|
||||
# failure timing characteristics that our code likely
|
||||
# expects (from history) while allowing healthy calls
|
||||
# to run longer.
|
||||
cmt = rpc.HEARTBEAT_THRESHOLD
|
||||
else:
|
||||
cmt = None
|
||||
return rpc.get_client(target,
|
||||
version_cap=version_cap,
|
||||
serializer=serializer)
|
||||
serializer=serializer,
|
||||
call_monitor_timeout=cmt)
|
||||
|
||||
def add_aggregate_host(self, ctxt, host, aggregate, host_param,
|
||||
slave_info=None):
|
||||
@ -684,7 +700,9 @@ class ComputeAPI(object):
|
||||
host, migrate_data):
|
||||
version = '5.0'
|
||||
client = self.router.client(ctxt)
|
||||
cctxt = client.prepare(server=host, version=version)
|
||||
cctxt = client.prepare(server=host, version=version,
|
||||
timeout=CONF.long_rpc_timeout,
|
||||
call_monitor_timeout=CONF.rpc_response_timeout)
|
||||
return cctxt.call(ctxt, 'pre_live_migration',
|
||||
instance=instance,
|
||||
block_migration=block_migration,
|
||||
|
@ -55,6 +55,7 @@ from nova.conf import powervm
|
||||
from nova.conf import quota
|
||||
from nova.conf import rdp
|
||||
from nova.conf import remote_debug
|
||||
from nova.conf import rpc
|
||||
from nova.conf import scheduler
|
||||
from nova.conf import serial_console
|
||||
from nova.conf import service
|
||||
@ -107,6 +108,7 @@ placement.register_opts(CONF)
|
||||
powervm.register_opts(CONF)
|
||||
quota.register_opts(CONF)
|
||||
rdp.register_opts(CONF)
|
||||
rpc.register_opts(CONF)
|
||||
scheduler.register_opts(CONF)
|
||||
serial_console.register_opts(CONF)
|
||||
service.register_opts(CONF)
|
||||
|
46
nova/conf/rpc.py
Normal file
46
nova/conf/rpc.py
Normal file
@ -0,0 +1,46 @@
|
||||
# Copyright 2018 OpenStack Foundation
|
||||
# All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from oslo_config import cfg
|
||||
|
||||
# Operator-facing help for ``long_rpc_timeout``. Kept flush-left because the
# text is emitted verbatim into generated configuration documentation.
_LONG_RPC_TIMEOUT_HELP = """
This option allows setting an alternate timeout value for RPC calls
that have the potential to take a long time. If set, RPC calls to
other services will use this value for the timeout (in seconds)
instead of the global rpc_response_timeout value.

Operations with RPC calls that utilize this value:

* live migration

Related options:

* rpc_response_timeout
"""

# Options controlling nova's RPC timeout behaviour.
rpc_opts = [
    cfg.IntOpt(
        "long_rpc_timeout",
        default=1800,
        help=_LONG_RPC_TIMEOUT_HELP,
    ),
]


# Everything this module contributes; consumed by register_opts()/list_opts().
ALL_OPTS = rpc_opts
|
||||
|
||||
|
||||
def register_opts(conf):
    """Register every option in ALL_OPTS on ``conf``.

    :param conf: configuration object exposing ``register_opts()``; the
        options are registered without an explicit group argument.
    """
    conf.register_opts(ALL_OPTS)
|
||||
|
||||
|
||||
def list_opts():
    """Return this module's options keyed by configuration group name.

    :returns: a dict mapping ``'DEFAULT'`` to the ALL_OPTS list.
    """
    return dict(DEFAULT=ALL_OPTS)
|
14
nova/rpc.py
14
nova/rpc.py
@ -51,6 +51,10 @@ LEGACY_NOTIFIER = None
|
||||
NOTIFICATION_TRANSPORT = None
|
||||
NOTIFIER = None
|
||||
|
||||
# NOTE(danms): If rpc_response_timeout is over this value (per-call or
|
||||
# globally), we will enable heartbeating
|
||||
HEARTBEAT_THRESHOLD = 60
|
||||
|
||||
ALLOWED_EXMODS = [
|
||||
nova.exception.__name__,
|
||||
]
|
||||
@ -172,7 +176,8 @@ def get_transport_url(url_str=None):
|
||||
return messaging.TransportURL.parse(CONF, url_str)
|
||||
|
||||
|
||||
def get_client(target, version_cap=None, serializer=None):
|
||||
def get_client(target, version_cap=None, serializer=None,
|
||||
call_monitor_timeout=None):
|
||||
assert TRANSPORT is not None
|
||||
|
||||
if profiler:
|
||||
@ -183,7 +188,8 @@ def get_client(target, version_cap=None, serializer=None):
|
||||
return messaging.RPCClient(TRANSPORT,
|
||||
target,
|
||||
version_cap=version_cap,
|
||||
serializer=serializer)
|
||||
serializer=serializer,
|
||||
call_monitor_timeout=call_monitor_timeout)
|
||||
|
||||
|
||||
def get_server(target, endpoints, serializer=None):
|
||||
@ -413,8 +419,10 @@ class ClientRouter(periodic_task.PeriodicTasks):
|
||||
def client(self, context):
|
||||
transport = context.mq_connection
|
||||
if transport:
|
||||
cmt = self.default_client.call_monitor_timeout
|
||||
return messaging.RPCClient(transport, self.target,
|
||||
version_cap=self.version_cap,
|
||||
serializer=self.serializer)
|
||||
serializer=self.serializer,
|
||||
call_monitor_timeout=cmt)
|
||||
else:
|
||||
return self.default_client
|
||||
|
@ -484,13 +484,15 @@ class CellDatabases(fixtures.Fixture):
|
||||
executor='eventlet',
|
||||
serializer=serializer)
|
||||
|
||||
def _wrap_get_client(self, target, version_cap=None, serializer=None):
|
||||
def _wrap_get_client(self, target, version_cap=None, serializer=None,
|
||||
call_monitor_timeout=None):
|
||||
"""Mirror rpc.get_client() but with our special sauce."""
|
||||
serializer = CheatingSerializer(serializer)
|
||||
return messaging.RPCClient(rpc.TRANSPORT,
|
||||
target,
|
||||
version_cap=version_cap,
|
||||
serializer=serializer)
|
||||
serializer=serializer,
|
||||
call_monitor_timeout=call_monitor_timeout)
|
||||
|
||||
def add_cell_database(self, connection_str, default=False):
|
||||
"""Add a cell database to the fixture.
|
||||
|
@ -122,6 +122,14 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
base_version = rpcapi.router.target.version
|
||||
expected_version = kwargs.pop('version', base_version)
|
||||
|
||||
prepare_extra_kwargs = {}
|
||||
cm_timeout = kwargs.pop('call_monitor_timeout', None)
|
||||
timeout = kwargs.pop('timeout', None)
|
||||
if cm_timeout:
|
||||
prepare_extra_kwargs['call_monitor_timeout'] = cm_timeout
|
||||
if timeout:
|
||||
prepare_extra_kwargs['timeout'] = timeout
|
||||
|
||||
expected_kwargs = kwargs.copy()
|
||||
if expected_args:
|
||||
expected_kwargs.update(expected_args)
|
||||
@ -170,7 +178,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
self.assertEqual(retval, rpc_mock.return_value)
|
||||
|
||||
prepare_mock.assert_called_once_with(version=expected_version,
|
||||
server=host)
|
||||
server=host,
|
||||
**prepare_extra_kwargs)
|
||||
rpc_mock.assert_called_once_with(ctxt, method, **expected_kwargs)
|
||||
|
||||
def test_add_aggregate_host(self):
|
||||
@ -354,10 +363,12 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
|
||||
instance=self.fake_instance_obj, version='5.0')
|
||||
|
||||
def test_pre_live_migration(self):
|
||||
self.flags(long_rpc_timeout=1234)
|
||||
self._test_compute_api('pre_live_migration', 'call',
|
||||
instance=self.fake_instance_obj,
|
||||
block_migration='block_migration', disk='disk', host='host',
|
||||
migrate_data=None, version='5.0')
|
||||
migrate_data=None, version='5.0',
|
||||
call_monitor_timeout=60, timeout=1234)
|
||||
|
||||
def test_prep_resize(self):
|
||||
expected_args = {'migration': 'migration'}
|
||||
|
@ -208,6 +208,7 @@ class TestRPC(test.NoDBTestCase):
|
||||
mock_ser.assert_called_once_with('foo')
|
||||
mock_client.assert_called_once_with(rpc.TRANSPORT,
|
||||
tgt, version_cap='1.0',
|
||||
call_monitor_timeout=None,
|
||||
serializer=ser)
|
||||
self.assertEqual('client', client)
|
||||
|
||||
@ -246,6 +247,7 @@ class TestRPC(test.NoDBTestCase):
|
||||
mock_ser.assert_called_once_with('foo')
|
||||
mock_client.assert_called_once_with(rpc.TRANSPORT,
|
||||
tgt, version_cap='1.0',
|
||||
call_monitor_timeout=None,
|
||||
serializer=ser)
|
||||
self.assertEqual('client', client)
|
||||
|
||||
@ -472,6 +474,7 @@ class TestClientRouter(test.NoDBTestCase):
|
||||
mock_rpcclient.assert_called_once_with(
|
||||
mock.sentinel.transport, default_client.target,
|
||||
version_cap=default_client.version_cap,
|
||||
call_monitor_timeout=default_client.call_monitor_timeout,
|
||||
serializer=default_client.serializer)
|
||||
# verify cell client was returned
|
||||
self.assertEqual(cell_client, client)
|
||||
|
21
releasenotes/notes/rpc_timeout_changes-6b7e365bb44f7f3a.yaml
Normal file
21
releasenotes/notes/rpc_timeout_changes-6b7e365bb44f7f3a.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
Utilizing recent changes in oslo.messaging, the
|
||||
``rpc_response_timeout`` value can now be increased significantly if
|
||||
needed or desired to solve issues with long-running RPC calls
|
||||
timing out before completing due to legitimate reasons (such as
|
||||
live migration prep). If ``rpc_response_timeout`` is increased
|
||||
beyond the default, nova will request active call monitoring from
|
||||
oslo.messaging, which will effectively heartbeat running
|
||||
activities to avoid a timeout, while still detecting failures
|
||||
related to service outages or message bus congestion in a
|
||||
reasonable amount of time. Further, the
|
||||
``[DEFAULT]/long_rpc_timeout`` option has been added which allows
|
||||
setting an alternate timeout value for specific RPC calls
|
||||
which are known to take a long time. The default for this is 1800
|
||||
seconds, and the ``rpc_response_timeout`` value will be used for the
|
||||
heartbeat frequency interval, providing a similar
|
||||
failure-detection experience for these calls despite the longer
|
||||
overall timeout. Currently, only the ``pre_live_migration`` RPC call uses
|
||||
this longer timeout value.
|
@ -43,7 +43,7 @@ oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
|
||||
oslo.utils>=3.33.0 # Apache-2.0
|
||||
oslo.db>=4.27.0 # Apache-2.0
|
||||
oslo.rootwrap>=5.8.0 # Apache-2.0
|
||||
oslo.messaging>=5.29.0 # Apache-2.0
|
||||
oslo.messaging>=6.3.0 # Apache-2.0
|
||||
oslo.policy>=1.35.0 # Apache-2.0
|
||||
oslo.privsep>=1.23.0 # Apache-2.0
|
||||
oslo.i18n>=3.15.3 # Apache-2.0
|
||||
|
Loading…
Reference in New Issue
Block a user