Merge "Add request timeout handling for Mellanox Neutron Agent"

This commit is contained in:
Jenkins 2013-12-07 10:05:57 +00:00 committed by Gerrit Code Review
commit e35403879b
7 changed files with 237 additions and 3 deletions

View File

@ -34,12 +34,20 @@
# vnic_type = mlnx_direct
# (StrOpt) Eswitch daemon end point connection url
# daemon_endpoint = 'tcp://127.0.0.1:5001'
# daemon_endpoint = 'tcp://127.0.0.1:60001'
# The number of milliseconds the agent will wait for
# response on request to daemon
# request_timeout = 3000
# The number of retries the agent will send request
# to daemon before giving up
# retries = 3
# The backoff rate multiplier for waiting period between retries
# on request to daemon, i.e. value of 2 will double
# the request timeout each retry
# backoff_rate = 2
[agent]
# Agent's polling interval in seconds

View File

@ -392,6 +392,10 @@ class MlnxEswitchNeutronAgent(sg_rpc.SecurityGroupAgentRpcMixin):
# If treat devices fails - must resync with plugin
sync = self.process_network_ports(port_info)
ports = port_info['current']
except exceptions.RequestTimeout:
LOG.exception(_("Request timeout in agent event loop "
"eSwitchD is not responding - exiting..."))
raise SystemExit(1)
except Exception:
LOG.exception(_("Error in agent event loop"))
sync = True

View File

@ -19,6 +19,7 @@ import zmq
from neutron.openstack.common import jsonutils
from neutron.openstack.common import log as logging
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
from neutron.plugins.mlnx.common import exceptions
LOG = logging.getLogger(__name__)
@ -42,6 +43,7 @@ class EswitchUtils(object):
self.poller.register(self._conn, zmq.POLLIN)
return self.__conn
@RetryDecorator(exceptions.RequestTimeout)
def send_msg(self, msg):
self._conn.send(msg)
@ -55,7 +57,7 @@ class EswitchUtils(object):
self._conn.close()
self.poller.unregister(self._conn)
self.__conn = None
raise exceptions.MlnxException(_("eSwitchD: Request timeout"))
raise exceptions.RequestTimeout()
def parse_response_msg(self, recv_msg):
msg = jsonutils.loads(recv_msg)
@ -69,7 +71,7 @@ class EswitchUtils(object):
else:
error_msg = _("Unknown operation status %s") % msg['status']
LOG.error(error_msg)
raise exceptions.MlnxException(error_msg)
raise exceptions.OperationFailed(err_msg=error_msg)
def get_attached_vnics(self):
LOG.debug(_("get_attached_vnics"))

View File

@ -0,0 +1,66 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright 2013 Mellanox Technologies, Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from oslo.config import cfg
from neutron.openstack.common import log as logging
from neutron.plugins.mlnx.common import config # noqa
LOG = logging.getLogger(__name__)
class RetryDecorator(object):
"""Retry decorator reruns a method 'retries' times if an exception occurs.
Decorator for retrying a method if exceptionToCheck exception occurs
If method raises exception, retries 'retries' times with increasing
back off period between calls with 'interval' multiplier
:param exceptionToCheck: the exception to check
:param interval: initial delay between retries in seconds
:param retries: number of times to try before giving up
:raises: exceptionToCheck
"""
sleep_fn = time.sleep
def __init__(self, exceptionToCheck,
interval=cfg.CONF.ESWITCH.request_timeout / 1000,
retries=cfg.CONF.ESWITCH.retries,
backoff_rate=cfg.CONF.ESWITCH.backoff_rate):
self.exc = exceptionToCheck
self.interval = interval
self.retries = retries
self.backoff_rate = backoff_rate
def __call__(self, original_func):
def decorated(*args, **kwargs):
sleep_interval = self.interval
num_of_iter = self.retries
while num_of_iter > 0:
try:
return original_func(*args, **kwargs)
except self.exc:
LOG.debug(_("Request timeout - call again after "
"%s seconds"), sleep_interval)
RetryDecorator.sleep_fn(sleep_interval)
num_of_iter -= 1
sleep_interval *= self.backoff_rate
return original_func(*args, **kwargs)
return decorated

View File

@ -48,6 +48,13 @@ eswitch_opts = [
cfg.IntOpt('request_timeout', default=3000,
help=_("The number of milliseconds the agent will wait for "
"response on request to daemon.")),
cfg.IntOpt('retries', default=3,
help=_("The number of retries the agent will send request "
"to daemon before giving up")),
cfg.IntOpt('backoff_rate', default=2,
help=_("backoff rate multiplier for waiting period between "
"retries for request to daemon, i.e. value of 2 will "
" double the request timeout each retry")),
]
agent_opts = [

View File

@ -20,3 +20,11 @@ from neutron.common import exceptions as qexc
class MlnxException(qexc.NeutronException):
message = _("Mlnx Exception: %(err_msg)s")
class RequestTimeout(qexc.NeutronException):
message = _("Request Timeout: no response from eSwitchD")
class OperationFailed(qexc.NeutronException):
message = _("Operation Failed: %(err_msg)s")

View File

@ -0,0 +1,139 @@
# Copyright (c) 2013 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mock
from oslo.config import cfg
from neutron.plugins.mlnx.common.comm_utils import RetryDecorator
from neutron.plugins.mlnx.common import config # noqa
from neutron.plugins.mlnx.common import exceptions
from neutron.tests import base
class WrongException(Exception):
pass
class TestRetryDecorator(base.BaseTestCase):
def setUp(self):
super(TestRetryDecorator, self).setUp()
self.sleep_fn_p = mock.patch.object(RetryDecorator, 'sleep_fn')
self.sleep_fn = self.sleep_fn_p.start()
self.addCleanup(self.sleep_fn_p.stop)
def test_no_retry_required(self):
self.counter = 0
@RetryDecorator(exceptions.RequestTimeout, interval=2,
retries=3, backoff_rate=2)
def succeeds():
self.counter += 1
return 'success'
ret = succeeds()
self.assertFalse(self.sleep_fn.called)
self.assertEqual(ret, 'success')
self.assertEqual(self.counter, 1)
def test_retry_zero_times(self):
self.counter = 0
interval = 2
backoff_rate = 2
retries = 0
@RetryDecorator(exceptions.RequestTimeout, interval,
retries, backoff_rate)
def always_fails():
self.counter += 1
raise exceptions.RequestTimeout()
self.assertRaises(exceptions.RequestTimeout, always_fails)
self.assertEqual(self.counter, 1)
self.assertFalse(self.sleep_fn.called)
def test_retries_once(self):
self.counter = 0
interval = 2
backoff_rate = 2
retries = 3
@RetryDecorator(exceptions.RequestTimeout, interval,
retries, backoff_rate)
def fails_once():
self.counter += 1
if self.counter < 2:
raise exceptions.RequestTimeout()
else:
return 'success'
ret = fails_once()
self.assertEqual(ret, 'success')
self.assertEqual(self.counter, 2)
self.assertEqual(self.sleep_fn.call_count, 1)
self.sleep_fn.assert_called_with(interval)
def test_limit_is_reached(self):
self.counter = 0
retries = 3
interval = 2
backoff_rate = 4
@RetryDecorator(exceptions.RequestTimeout, interval,
retries, backoff_rate)
def always_fails():
self.counter += 1
raise exceptions.RequestTimeout()
self.assertRaises(exceptions.RequestTimeout, always_fails)
self.assertEqual(self.counter, retries + 1)
self.assertEqual(self.sleep_fn.call_count, retries)
expected_sleep_fn_arg = []
for i in range(retries):
expected_sleep_fn_arg.append(interval)
interval *= backoff_rate
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
def test_limit_is_reached_with_conf(self):
self.counter = 0
@RetryDecorator(exceptions.RequestTimeout)
def always_fails():
self.counter += 1
raise exceptions.RequestTimeout()
retry = cfg.CONF.ESWITCH.retries
interval = cfg.CONF.ESWITCH.request_timeout / 1000
delay_rate = cfg.CONF.ESWITCH.backoff_rate
expected_sleep_fn_arg = []
for i in range(retry):
expected_sleep_fn_arg.append(interval)
interval *= delay_rate
self.assertRaises(exceptions.RequestTimeout, always_fails)
self.assertEqual(self.counter, retry + 1)
self.assertEqual(self.sleep_fn.call_count, retry)
self.sleep_fn.assert_has_calls(map(mock.call, expected_sleep_fn_arg))
def test_wrong_exception_no_retry(self):
@RetryDecorator(exceptions.RequestTimeout)
def raise_unexpected_error():
raise WrongException("wrong exception")
self.assertRaises(WrongException, raise_unexpected_error)
self.assertFalse(self.sleep_fn.called)