Retry getting the list of service plugins

On systems that start both neutron-server and neutron-l3-agent together,
there is a chance that the first call to neutron will time out. Retry up to
4 more times to avoid the l3 agent exiting on startup.

This should make the l3 agent a little more robust on startup, but it is
still not ideal: ideally it would not exit at all and would instead keep
retrying periodically.

Change-Id: I2171a164f3f77bccd89895d73c1c8d67f7190488
Closes-Bug: #1353953
Closes-Bug: #1368152
Closes-Bug: #1368795
This commit is contained in:
Derek Higgins
2014-09-12 16:31:44 +01:00
parent 1f80d73277
commit 0e82e2be0d
2 changed files with 52 additions and 11 deletions

View File

@@ -22,6 +22,7 @@ eventlet.monkey_patch()
import netaddr import netaddr
import os import os
from oslo.config import cfg from oslo.config import cfg
from oslo import messaging
import Queue import Queue
from neutron.agent.common import config from neutron.agent.common import config
@@ -40,6 +41,7 @@ from neutron.common import utils as common_utils
from neutron import context from neutron import context
from neutron import manager from neutron import manager
from neutron.openstack.common import excutils from neutron.openstack.common import excutils
from neutron.openstack.common.gettextutils import _LW
from neutron.openstack.common import importutils from neutron.openstack.common import importutils
from neutron.openstack.common import log as logging from neutron.openstack.common import log as logging
from neutron.openstack.common import loopingcall from neutron.openstack.common import loopingcall
@@ -521,17 +523,35 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback, manager.Manager):
self.sync_progress = False self.sync_progress = False
# Get the list of service plugins from Neutron Server # Get the list of service plugins from Neutron Server
try: # This is the first place where we contact neutron-server on startup
self.neutron_service_plugins = ( # so retry in case its not ready to respond.
self.plugin_rpc.get_service_plugin_list(self.context)) retry_count = 5
except n_rpc.RemoteError as e: while True:
LOG.warning(_('l3-agent cannot check service plugins ' retry_count = retry_count - 1
'enabled at the neutron server when startup ' try:
'due to RPC error. It happens when the server ' self.neutron_service_plugins = (
'does not support this RPC API. If the error ' self.plugin_rpc.get_service_plugin_list(self.context))
'is UnsupportedVersion you can ignore ' except n_rpc.RemoteError as e:
'this warning. Detail message: %s'), e) with excutils.save_and_reraise_exception() as ctx:
self.neutron_service_plugins = None ctx.reraise = False
LOG.warning(_LW('l3-agent cannot check service plugins '
'enabled at the neutron server when '
'startup due to RPC error. It happens '
'when the server does not support this '
'RPC API. If the error is '
'UnsupportedVersion you can ignore this '
'warning. Detail message: %s'), e)
self.neutron_service_plugins = None
except messaging.MessagingTimeout as e:
with excutils.save_and_reraise_exception() as ctx:
if retry_count > 0:
ctx.reraise = False
LOG.warning(_LW('l3-agent cannot check service '
'plugins enabled on the neutron '
'server. Retrying. '
'Detail message: %s'), e)
continue
break
self._clean_stale_namespaces = self.conf.use_namespaces self._clean_stale_namespaces = self.conf.use_namespaces

View File

@@ -20,6 +20,7 @@ import datetime
import mock import mock
import netaddr import netaddr
from oslo.config import cfg from oslo.config import cfg
from oslo import messaging
from testtools import matchers from testtools import matchers
from neutron.agent.common import config as agent_config from neutron.agent.common import config as agent_config
@@ -2140,6 +2141,26 @@ class TestBasicRouterOperations(base.BaseTestCase):
self.assertIsNone(agent.neutron_service_plugins) self.assertIsNone(agent.neutron_service_plugins)
self.assertTrue(self.plugin_api.get_service_plugin_list.called) self.assertTrue(self.plugin_api.get_service_plugin_list.called)
def test_get_service_plugin_list_retried(self):
# Checks that constructing the agent succeeds when the first calls to
# get_service_plugin_list time out and a later call returns a value.
raise_timeout = messaging.MessagingTimeout()
# Raise a timeout the first 2 times it calls
# get_service_plugin_list, then return an empty tuple.
# NOTE(review): assumes self.plugin_api is a mock, so exception instances
# in side_effect are raised and other items are returned -- confirm in setUp.
self.plugin_api.get_service_plugin_list.side_effect = (
raise_timeout, raise_timeout, tuple()
)
# The constructor is what issues the get_service_plugin_list call.
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
# The value returned by the third call must end up on the agent.
self.assertEqual(agent.neutron_service_plugins, tuple())
def test_get_service_plugin_list_retried_max(self):
# Checks that constructing the agent raises MessagingTimeout when every
# call to get_service_plugin_list times out.
raise_timeout = messaging.MessagingTimeout()
# Raise a timeout 5 times
# NOTE(review): 5 presumably matches the total number of attempts the
# agent makes on startup (one initial call plus retries) -- keep in sync
# with the retry count in L3NATAgent.
self.plugin_api.get_service_plugin_list.side_effect = (
(raise_timeout, ) * 5
)
# The constructor itself is the callable under test; the timeout is
# expected to propagate out of it.
self.assertRaises(messaging.MessagingTimeout, l3_agent.L3NATAgent,
HOSTNAME, self.conf)
class TestL3AgentEventHandler(base.BaseTestCase): class TestL3AgentEventHandler(base.BaseTestCase):