Resync L3, DHCP and OVS/LB agents upon revival

In big and busy clusters there could be a condition when
rabbitmq clustering mechanism synchronizes queues and during
this period agents connected to that instance of rabbitmq
can't communicate with the server and server considers them
dead moving resources away. After agent become active again,
it needs to cleanup state entries and synchronize its state
with neutron-server.
The solution is to make agents aware of their state from
neutron-server point of view. This is done by changing state
reports from cast to call that would return agent's status.
When agent was dead and becomes alive, it would receive special
AGENT_REVIVED status indicating that it should refresh its
local data which it would not do otherwise.

Closes-Bug: #1505166
Change-Id: Id28248f4f75821fbacf46e2c44e40f27f59172a9
This commit is contained in:
Eugene Nikanorov 2015-10-12 13:59:01 +04:00
parent 0312b1237b
commit 3b6bd917e4
10 changed files with 107 additions and 28 deletions

View File

@ -551,7 +551,6 @@ class DhcpAgentWithStateReport(DhcpAgent):
'start_flag': True,
'agent_type': constants.AGENT_TYPE_DHCP}
report_interval = self.conf.AGENT.report_interval
self.use_call = True
if report_interval:
self.heartbeat = loopingcall.FixedIntervalLoopingCall(
self._report_state)
@ -562,8 +561,12 @@ class DhcpAgentWithStateReport(DhcpAgent):
self.agent_state.get('configurations').update(
self.cache.get_state())
ctx = context.get_admin_context_without_session()
self.state_rpc.report_state(ctx, self.agent_state, self.use_call)
self.use_call = False
agent_status = self.state_rpc.report_state(
ctx, self.agent_state, True)
if agent_status == constants.AGENT_REVIVED:
LOG.info(_LI("Agent has just been revived. "
"Scheduling full sync"))
self.schedule_resync("Agent has just been revived")
except AttributeError:
# This means the server does not support report_state
LOG.warn(_LW("Neutron server does not support state report."

View File

@ -610,7 +610,6 @@ class L3NATAgent(firewall_l3_agent.FWaaSL3AgentRpcCallback,
class L3NATAgentWithStateReport(L3NATAgent):
def __init__(self, host, conf=None):
self.use_call = True
super(L3NATAgentWithStateReport, self).__init__(host=host, conf=conf)
self.state_rpc = agent_rpc.PluginReportStateAPI(topics.REPORTS)
self.agent_state = {
@ -657,10 +656,14 @@ class L3NATAgentWithStateReport(L3NATAgent):
configurations['interfaces'] = num_interfaces
configurations['floating_ips'] = num_floating_ips
try:
self.state_rpc.report_state(self.context, self.agent_state,
self.use_call)
agent_status = self.state_rpc.report_state(self.context,
self.agent_state,
True)
if agent_status == l3_constants.AGENT_REVIVED:
LOG.info(_LI('Agent has just been revived. '
'Doing a full sync.'))
self.fullsync = True
self.agent_state.pop('start_flag', None)
self.use_call = False
except AttributeError:
# This means the server does not support report_state
LOG.warn(_LW("Neutron server does not support state report."

View File

@ -200,3 +200,11 @@ ROUTER_MARK_MASK = "0xffff"
# Time format
ISO8601_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%f'
# Agent states as detected by server, used to reply on agent's state report
# agent has just been registered
AGENT_NEW = 'new'
# agent is alive
AGENT_ALIVE = 'alive'
# agent has just returned to alive after being dead
AGENT_REVIVED = 'revived'

View File

@ -299,6 +299,12 @@ class AgentDbMixin(ext_agent.AgentPluginBase, AgentAvailabilityZoneMixin):
'delta': delta})
def _create_or_update_agent(self, context, agent_state):
"""Registers new agent in the database or updates existing.
Returns agent status from server point of view: alive, new or revived.
It could be used by agent to do some sync with the server if needed.
"""
status = constants.AGENT_ALIVE
with context.session.begin(subtransactions=True):
res_keys = ['agent_type', 'binary', 'host', 'topic']
res = dict((k, agent_state[k]) for k in res_keys)
@ -311,6 +317,8 @@ class AgentDbMixin(ext_agent.AgentPluginBase, AgentAvailabilityZoneMixin):
try:
agent_db = self._get_agent_by_type_and_host(
context, agent_state['agent_type'], agent_state['host'])
if not agent_db.is_active:
status = constants.AGENT_REVIVED
res['heartbeat_timestamp'] = current_time
if agent_state.get('start_flag'):
res['started_at'] = current_time
@ -327,7 +335,9 @@ class AgentDbMixin(ext_agent.AgentPluginBase, AgentAvailabilityZoneMixin):
greenthread.sleep(0)
context.session.add(agent_db)
self._log_heartbeat(agent_state, agent_db, configurations_dict)
status = constants.AGENT_NEW
greenthread.sleep(0)
return status
def create_or_update_agent(self, context, agent):
"""Create or update agent according to report."""
@ -367,7 +377,10 @@ class AgentExtRpcCallback(object):
self.plugin = plugin
def report_state(self, context, **kwargs):
"""Report state from agent to server."""
"""Report state from agent to server.
Returns - agent's status: AGENT_NEW, AGENT_REVIVED, AGENT_ALIVE
"""
time = kwargs['time']
time = timeutils.parse_strtime(time)
agent_state = kwargs['agent_state']['agent_state']
@ -382,7 +395,7 @@ class AgentExtRpcCallback(object):
return
if not self.plugin:
self.plugin = manager.NeutronManager.get_plugin()
self.plugin.create_or_update_agent(context, agent_state)
return self.plugin.create_or_update_agent(context, agent_state)
def _check_clock_sync_on_agent_start(self, agent_state, agent_time):
"""Checks if the server and the agent times are in sync.

View File

@ -876,6 +876,8 @@ class LinuxBridgeNeutronAgentRPC(service.Service):
# stores received port_updates for processing by the main loop
self.updated_devices = set()
# flag to do a sync after revival
self.fullsync = False
self.context = context.get_admin_context_without_session()
self.plugin_rpc = agent_rpc.PluginApi(topics.PLUGIN)
self.sg_plugin_rpc = sg_rpc.SecurityGroupServerRpcApi(topics.PLUGIN)
@ -897,8 +899,13 @@ class LinuxBridgeNeutronAgentRPC(service.Service):
try:
devices = len(self.br_mgr.get_tap_devices())
self.agent_state.get('configurations')['devices'] = devices
self.state_rpc.report_state(self.context,
self.agent_state)
agent_status = self.state_rpc.report_state(self.context,
self.agent_state,
True)
if agent_status == constants.AGENT_REVIVED:
LOG.info(_LI('Agent has just been revived. '
'Doing a full sync.'))
self.fullsync = True
self.agent_state.pop('start_flag', None)
except Exception:
LOG.exception(_LE("Failed reporting state!"))
@ -1101,11 +1108,15 @@ class LinuxBridgeNeutronAgentRPC(service.Service):
while True:
start = time.time()
device_info = self.scan_devices(previous=device_info, sync=sync)
if self.fullsync:
sync = True
self.fullsync = False
if sync:
LOG.info(_LI("Agent out of sync with plugin!"))
sync = False
device_info = self.scan_devices(previous=device_info, sync=sync)
sync = False
if (self._device_info_has_changes(device_info)
or self.sg_agent.firewall_refresh_needed()):

View File

@ -182,6 +182,7 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
super(OVSNeutronAgent, self).__init__()
self.conf = conf or cfg.CONF
self.fullsync = True
# init bridge classes with configured datapath type.
self.br_int_cls, self.br_phys_cls, self.br_tun_cls = (
functools.partial(bridge_classes[b],
@ -192,7 +193,6 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
self.veth_mtu = veth_mtu
self.available_local_vlans = set(moves.range(p_const.MIN_VLAN_TAG,
p_const.MAX_VLAN_TAG))
self.use_call = True
self.tunnel_types = tunnel_types or []
self.l2_pop = l2_population
# TODO(ethuleau): Change ARP responder so it's not dependent on the
@ -325,9 +325,13 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
self.dvr_agent.in_distributed_mode())
try:
self.state_rpc.report_state(self.context,
self.agent_state,
self.use_call)
agent_status = self.state_rpc.report_state(self.context,
self.agent_state,
True)
if agent_status == n_const.AGENT_REVIVED:
LOG.info(_LI('Agent has just been revived. '
'Doing a full sync.'))
self.fullsync = True
self.use_call = False
self.agent_state.pop('start_flag', None)
except Exception:
@ -1646,7 +1650,6 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
if not polling_manager:
polling_manager = polling.get_polling_manager(
minimize_polling=False)
sync = True
ports = set()
updated_ports_copy = set()
@ -1655,6 +1658,10 @@ class OVSNeutronAgent(sg_rpc.SecurityGroupAgentRpcCallbackMixin,
ovs_restarted = False
consecutive_resyncs = 0
while self._check_and_handle_signal():
if self.fullsync:
LOG.info(_LI("rpc_loop doing a full sync."))
sync = True
self.fullsync = False
port_info = {}
ancillary_port_info = {}
start = time.time()

View File

@ -421,6 +421,20 @@ class TestDhcpAgent(base.BaseTestCase):
dhcp.periodic_resync()
spawn.assert_called_once_with(dhcp._periodic_resync_helper)
def test_report_state_revival_logic(self):
dhcp = dhcp_agent.DhcpAgentWithStateReport(HOSTNAME)
with mock.patch.object(dhcp.state_rpc,
'report_state') as report_state,\
mock.patch.object(dhcp, "run"):
report_state.return_value = const.AGENT_ALIVE
dhcp._report_state()
self.assertEqual(dhcp.needs_resync_reasons, {})
report_state.return_value = const.AGENT_REVIVED
dhcp._report_state()
self.assertEqual(dhcp.needs_resync_reasons[None],
['Agent has just been revived'])
def test_periodic_resync_helper(self):
with mock.patch.object(dhcp_agent.eventlet, 'sleep') as sleep:
dhcp = dhcp_agent.DhcpAgent(HOSTNAME)

View File

@ -197,13 +197,26 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
conf=self.conf)
self.assertTrue(agent.agent_state['start_flag'])
use_call_arg = agent.use_call
agent.after_start()
report_state.assert_called_once_with(agent.context,
agent.agent_state,
use_call_arg)
True)
self.assertIsNone(agent.agent_state.get('start_flag'))
def test_report_state_revival_logic(self):
with mock.patch.object(agent_rpc.PluginReportStateAPI,
'report_state') as report_state:
agent = l3_agent.L3NATAgentWithStateReport(host=HOSTNAME,
conf=self.conf)
report_state.return_value = l3_constants.AGENT_REVIVED
agent._report_state()
self.assertTrue(agent.fullsync)
agent.fullsync = False
report_state.return_value = l3_constants.AGENT_ALIVE
agent._report_state()
self.assertFalse(agent.fullsync)
def test_periodic_sync_routers_task_call_clean_stale_namespaces(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.plugin_api.get_routers.return_value = []

View File

@ -365,6 +365,13 @@ class TestLinuxBridgeAgent(base.BaseTestCase):
self.agent.stop()
self.assertFalse(mock_set_rpc.called)
def test_report_state_revived(self):
with mock.patch.object(self.agent.state_rpc,
"report_state") as report_st:
report_st.return_value = constants.AGENT_REVIVED
self.agent._report_state()
self.assertTrue(self.agent.fullsync)
class TestLinuxBridgeManager(base.BaseTestCase):
def setUp(self):

View File

@ -145,9 +145,6 @@ class TestOvsNeutronAgent(object):
return_value=[]):
self.agent = self.mod_agent.OVSNeutronAgent(self._bridge_classes(),
**kwargs)
# set back to true because initial report state will succeed due
# to mocked out RPC calls
self.agent.use_call = True
self.agent.tun_br = self.br_tun_cls(br_name='br-tun')
self.agent.sg_agent = mock.Mock()
@ -694,14 +691,13 @@ class TestOvsNeutronAgent(object):
report_st.assert_called_with(self.agent.context,
self.agent.agent_state, True)
self.assertNotIn("start_flag", self.agent.agent_state)
self.assertFalse(self.agent.use_call)
self.assertEqual(
self.agent.agent_state["configurations"]["devices"],
self.agent.int_br_device_count
)
self.agent._report_state()
report_st.assert_called_with(self.agent.context,
self.agent.agent_state, False)
self.agent.agent_state, True)
def test_report_state_fail(self):
with mock.patch.object(self.agent.state_rpc,
@ -714,6 +710,13 @@ class TestOvsNeutronAgent(object):
report_st.assert_called_with(self.agent.context,
self.agent.agent_state, True)
def test_report_state_revived(self):
with mock.patch.object(self.agent.state_rpc,
"report_state") as report_st:
report_st.return_value = n_const.AGENT_REVIVED
self.agent._report_state()
self.assertTrue(self.agent.fullsync)
def test_port_update(self):
port = {"id": TEST_PORT_ID1,
"network_id": TEST_NETWORK_ID1,
@ -1733,9 +1736,6 @@ class TestOvsDvrNeutronAgent(object):
return_value=[]):
self.agent = self.mod_agent.OVSNeutronAgent(self._bridge_classes(),
**kwargs)
# set back to true because initial report state will succeed due
# to mocked out RPC calls
self.agent.use_call = True
self.agent.tun_br = self.br_tun_cls(br_name='br-tun')
self.agent.sg_agent = mock.Mock()