Use oslo.messaging per-call monitoring

This change makes nova configure oslo.messaging's active call monitoring
feature if the operator increases the rpc_response_timeout configuration
option beyond the default of 60 seconds. In that case, oslo.messaging will
heartbeat actively-running calls to indicate that they are still running,
avoiding a false timeout at the shorter interval, while still detecting
actual dead-service failures before the longer timeout expires.

In addition, this adds a long_rpc_timeout configuration option that can
be used for operations known to run long, independently of the base
rpc_response_timeout value. pre_live_migration() is changed to use the
new option, as that call is known to suffer from early false timeouts.

Depends-On: Iecb7bef61b3b8145126ead1f74dbaadd7d97b407
Change-Id: Icb0bdc6d4ce4524341e70e737eafcb25f346d197
This commit is contained in:
Dan Smith 2018-05-07 09:00:00 -07:00
parent ef6b4f43f4
commit fe26a52024
10 changed files with 122 additions and 11 deletions

View File

@ -81,7 +81,7 @@ oslo.context==2.19.2
oslo.db==4.27.0
oslo.i18n==3.15.3
oslo.log==3.36.0
oslo.messaging==5.29.0
oslo.messaging==6.3.0
oslo.middleware==3.31.0
oslo.policy==1.35.0
oslo.privsep==1.23.0

View File

@ -371,6 +371,11 @@ class ComputeAPI(object):
version_cap = self.VERSION_ALIASES.get(upgrade_level,
upgrade_level)
serializer = objects_base.NovaObjectSerializer()
# NOTE(danms): We need to poke this path to register CONF options
# that we use in self.get_client()
rpc.get_client(target, version_cap, serializer)
default_client = self.get_client(target, version_cap, serializer)
self.router = rpc.ClientRouter(default_client)
@ -417,9 +422,20 @@ class ComputeAPI(object):
# Cells overrides this
def get_client(self, target, version_cap, serializer):
if CONF.rpc_response_timeout > rpc.HEARTBEAT_THRESHOLD:
# NOTE(danms): If the operator has overridden RPC timeout
# to be longer than rpc.HEARTBEAT_THRESHOLD then configure
# the call monitor timeout to be the threshold to keep the
# failure timing characteristics that our code likely
# expects (from history) while allowing healthy calls
# to run longer.
cmt = rpc.HEARTBEAT_THRESHOLD
else:
cmt = None
return rpc.get_client(target,
version_cap=version_cap,
serializer=serializer)
serializer=serializer,
call_monitor_timeout=cmt)
def add_aggregate_host(self, ctxt, host, aggregate, host_param,
slave_info=None):
@ -684,7 +700,9 @@ class ComputeAPI(object):
host, migrate_data):
version = '5.0'
client = self.router.client(ctxt)
cctxt = client.prepare(server=host, version=version)
cctxt = client.prepare(server=host, version=version,
timeout=CONF.long_rpc_timeout,
call_monitor_timeout=CONF.rpc_response_timeout)
return cctxt.call(ctxt, 'pre_live_migration',
instance=instance,
block_migration=block_migration,

View File

@ -55,6 +55,7 @@ from nova.conf import powervm
from nova.conf import quota
from nova.conf import rdp
from nova.conf import remote_debug
from nova.conf import rpc
from nova.conf import scheduler
from nova.conf import serial_console
from nova.conf import service
@ -107,6 +108,7 @@ placement.register_opts(CONF)
powervm.register_opts(CONF)
quota.register_opts(CONF)
rdp.register_opts(CONF)
rpc.register_opts(CONF)
scheduler.register_opts(CONF)
serial_console.register_opts(CONF)
service.register_opts(CONF)

46
nova/conf/rpc.py Normal file
View File

@ -0,0 +1,46 @@
# Copyright 2018 OpenStack Foundation
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
from oslo_config import cfg
# RPC-related configuration options declared by this module.
rpc_opts = [
    cfg.IntOpt(
        "long_rpc_timeout",
        default=1800,
        # NOTE: the help text is intentionally flush-left; it is emitted
        # verbatim into generated configuration documentation.
        help="""
This option allows setting an alternate timeout value for RPC calls
that have the potential to take a long time. If set, RPC calls to
other services will use this value for the timeout (in seconds)
instead of the global rpc_response_timeout value.
Operations with RPC calls that utilize this value:
* live migration
Related options:
* rpc_response_timeout
""",
    ),
]

# The complete set of options handled by register_opts() and list_opts().
ALL_OPTS = rpc_opts
def register_opts(conf):
    """Register every option in ALL_OPTS on the given config object.

    :param conf: configuration object exposing ``register_opts()``
    """
    conf.register_opts(ALL_OPTS)
def list_opts():
    """Return this module's options keyed by configuration group name.

    :returns: dict mapping ``'DEFAULT'`` to the ALL_OPTS list
    """
    opts_by_group = {'DEFAULT': ALL_OPTS}
    return opts_by_group

View File

@ -51,6 +51,10 @@ LEGACY_NOTIFIER = None
NOTIFICATION_TRANSPORT = None
NOTIFIER = None
# NOTE(danms): If rpc_response_timeout is over this value (per-call or
# globally), we will enable heartbeating
HEARTBEAT_THRESHOLD = 60
ALLOWED_EXMODS = [
nova.exception.__name__,
]
@ -172,7 +176,8 @@ def get_transport_url(url_str=None):
return messaging.TransportURL.parse(CONF, url_str)
def get_client(target, version_cap=None, serializer=None):
def get_client(target, version_cap=None, serializer=None,
call_monitor_timeout=None):
assert TRANSPORT is not None
if profiler:
@ -183,7 +188,8 @@ def get_client(target, version_cap=None, serializer=None):
return messaging.RPCClient(TRANSPORT,
target,
version_cap=version_cap,
serializer=serializer)
serializer=serializer,
call_monitor_timeout=call_monitor_timeout)
def get_server(target, endpoints, serializer=None):
@ -413,8 +419,10 @@ class ClientRouter(periodic_task.PeriodicTasks):
def client(self, context):
transport = context.mq_connection
if transport:
cmt = self.default_client.call_monitor_timeout
return messaging.RPCClient(transport, self.target,
version_cap=self.version_cap,
serializer=self.serializer)
serializer=self.serializer,
call_monitor_timeout=cmt)
else:
return self.default_client

View File

@ -484,13 +484,15 @@ class CellDatabases(fixtures.Fixture):
executor='eventlet',
serializer=serializer)
def _wrap_get_client(self, target, version_cap=None, serializer=None):
def _wrap_get_client(self, target, version_cap=None, serializer=None,
call_monitor_timeout=None):
"""Mirror rpc.get_client() but with our special sauce."""
serializer = CheatingSerializer(serializer)
return messaging.RPCClient(rpc.TRANSPORT,
target,
version_cap=version_cap,
serializer=serializer)
serializer=serializer,
call_monitor_timeout=call_monitor_timeout)
def add_cell_database(self, connection_str, default=False):
"""Add a cell database to the fixture.

View File

@ -122,6 +122,14 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
base_version = rpcapi.router.target.version
expected_version = kwargs.pop('version', base_version)
prepare_extra_kwargs = {}
cm_timeout = kwargs.pop('call_monitor_timeout', None)
timeout = kwargs.pop('timeout', None)
if cm_timeout:
prepare_extra_kwargs['call_monitor_timeout'] = cm_timeout
if timeout:
prepare_extra_kwargs['timeout'] = timeout
expected_kwargs = kwargs.copy()
if expected_args:
expected_kwargs.update(expected_args)
@ -170,7 +178,8 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
self.assertEqual(retval, rpc_mock.return_value)
prepare_mock.assert_called_once_with(version=expected_version,
server=host)
server=host,
**prepare_extra_kwargs)
rpc_mock.assert_called_once_with(ctxt, method, **expected_kwargs)
def test_add_aggregate_host(self):
@ -354,10 +363,12 @@ class ComputeRpcAPITestCase(test.NoDBTestCase):
instance=self.fake_instance_obj, version='5.0')
def test_pre_live_migration(self):
self.flags(long_rpc_timeout=1234)
self._test_compute_api('pre_live_migration', 'call',
instance=self.fake_instance_obj,
block_migration='block_migration', disk='disk', host='host',
migrate_data=None, version='5.0')
migrate_data=None, version='5.0',
call_monitor_timeout=60, timeout=1234)
def test_prep_resize(self):
expected_args = {'migration': 'migration'}

View File

@ -208,6 +208,7 @@ class TestRPC(test.NoDBTestCase):
mock_ser.assert_called_once_with('foo')
mock_client.assert_called_once_with(rpc.TRANSPORT,
tgt, version_cap='1.0',
call_monitor_timeout=None,
serializer=ser)
self.assertEqual('client', client)
@ -246,6 +247,7 @@ class TestRPC(test.NoDBTestCase):
mock_ser.assert_called_once_with('foo')
mock_client.assert_called_once_with(rpc.TRANSPORT,
tgt, version_cap='1.0',
call_monitor_timeout=None,
serializer=ser)
self.assertEqual('client', client)
@ -472,6 +474,7 @@ class TestClientRouter(test.NoDBTestCase):
mock_rpcclient.assert_called_once_with(
mock.sentinel.transport, default_client.target,
version_cap=default_client.version_cap,
call_monitor_timeout=default_client.call_monitor_timeout,
serializer=default_client.serializer)
# verify cell client was returned
self.assertEqual(cell_client, client)

View File

@ -0,0 +1,21 @@
---
features:
- |
Utilizing recent changes in oslo.messaging, the
`rpc_response_timeout` value can now be increased significantly if
needed or desired to solve issues with long-running RPC calls
timing out before completing due to legitimate reasons (such as
live migration prep). If `rpc_response_timeout` is increased
beyond the default, nova will request active call monitoring from
oslo.messaging, which will effectively heartbeat running
activities to avoid a timeout, while still detecting failures
related to service outages or message bus congestion in a
reasonable amount of time. Further, the
`[DEFAULT]/long_rpc_timeout` option has been added, which allows
setting an alternate timeout value for RPC calls that are known
to take a long time. The default for this is 1800
seconds, and the `rpc_response_timeout` value will be used for the
heartbeat frequency interval, providing a similar
failure-detection experience for these calls despite the longer
overall timeout. Currently, only the live migration
`pre_live_migration` RPC call uses this longer timeout value.

View File

@ -43,7 +43,7 @@ oslo.serialization!=2.19.1,>=2.18.0 # Apache-2.0
oslo.utils>=3.33.0 # Apache-2.0
oslo.db>=4.27.0 # Apache-2.0
oslo.rootwrap>=5.8.0 # Apache-2.0
oslo.messaging>=5.29.0 # Apache-2.0
oslo.messaging>=6.3.0 # Apache-2.0
oslo.policy>=1.35.0 # Apache-2.0
oslo.privsep>=1.23.0 # Apache-2.0
oslo.i18n>=3.15.3 # Apache-2.0