Add clock sync error detection on agent registration

For the server to determine if an agent is alive or not,
it depends on the agent's clock being mostly in sync with the server
clock. The neutron-server may reject and return the request if
there's a timestamp difference between the two nodes. Currently
there's no good way to detect this condition from the agent code.

This fix will improve the error handling
logic by writing an appropriate log in the neutron server's log
file for an early detection of the problem.

Change-Id: If884f90c4b1786cfc63d3e2ff2d66f92122258c2
Closes-Bug: #1432582
This commit is contained in:
Sudipta Biswas 2015-03-18 23:35:57 +05:30
parent 071fb81d47
commit 2797efc39f
2 changed files with 70 additions and 4 deletions

View File

@ -29,7 +29,7 @@ from neutron.common import constants
from neutron.db import model_base
from neutron.db import models_v2
from neutron.extensions import agent as ext_agent
from neutron.i18n import _LW
from neutron.i18n import _LE, _LW
from neutron import manager
LOG = logging.getLogger(__name__)
@ -272,10 +272,41 @@ class AgentExtRpcCallback(object):
"""Report state from agent to server."""
time = kwargs['time']
time = timeutils.parse_strtime(time)
if self.START_TIME > time:
LOG.debug("Message with invalid timestamp received")
return
agent_state = kwargs['agent_state']['agent_state']
self._check_clock_sync_on_agent_start(agent_state, time)
if self.START_TIME > time:
time_agent = timeutils.isotime(time)
time_server = timeutils.isotime(self.START_TIME)
log_dict = {'agent_time': time_agent, 'server_time': time_server}
LOG.debug("Stale message received with timestamp: %(agent_time)s. "
"Skipping processing because it's older than the "
"server start timestamp: %(server_time)s", log_dict)
return
if not self.plugin:
self.plugin = manager.NeutronManager.get_plugin()
self.plugin.create_or_update_agent(context, agent_state)
def _check_clock_sync_on_agent_start(self, agent_state, agent_time):
"""Checks if the server and the agent times are in sync.
Method checks if the agent time is in sync with the server time
on start up. Ignores it, on subsequent re-connects.
"""
if agent_state.get('start_flag'):
time_server_now = timeutils.utcnow()
diff = abs((time_server_now - agent_time).seconds)
if diff > cfg.CONF.agent_down_time:
agent_name = agent_state['agent_type']
time_agent = timeutils.isotime(agent_time)
host = agent_state['host']
log_dict = {'host': host,
'agent_name': agent_name,
'agent_time': time_agent,
'threshold': cfg.CONF.agent_down_time,
'serv_time': timeutils.isotime(time_server_now)}
LOG.error(_LE("Message received from the host: %(host)s "
"during the registration of %(agent_name)s has "
"a timestamp: %(agent_time)s. This differs from "
"the current server timestamp: %(serv_time)s by "
"more than the threshold agent down"
"time: %(threshold)s."), log_dict)

View File

@ -186,6 +186,41 @@ class AgentSchedulerTestMixIn(object):
agent_state={'agent_state': agent_state},
time=timeutils.strtime())
def test_agent_registration_bad_timestamp(self):
dhcp_hosta = {
'binary': 'neutron-dhcp-agent',
'host': DHCP_HOSTA,
'start_flag': True,
'topic': 'DHCP_AGENT',
'configurations': {'dhcp_driver': 'dhcp_driver',
'use_namespaces': True,
},
'agent_type': constants.AGENT_TYPE_DHCP}
callback = agents_db.AgentExtRpcCallback()
delta_time = datetime.datetime.now() - datetime.timedelta(days=1)
str_time = delta_time.strftime('%Y-%m-%dT%H:%M:%S.%f')
callback.report_state(self.adminContext,
agent_state={'agent_state': dhcp_hosta},
time=str_time)
def test_agent_registration_invalid_timestamp_allowed(self):
dhcp_hosta = {
'binary': 'neutron-dhcp-agent',
'host': DHCP_HOSTA,
'start_flag': True,
'topic': 'DHCP_AGENT',
'configurations': {'dhcp_driver': 'dhcp_driver',
'use_namespaces': True,
},
'agent_type': constants.AGENT_TYPE_DHCP}
callback = agents_db.AgentExtRpcCallback()
utc_time = datetime.datetime.utcnow()
delta_time = utc_time - datetime.timedelta(seconds=10)
str_time = delta_time.strftime('%Y-%m-%dT%H:%M:%S.%f')
callback.report_state(self.adminContext,
agent_state={'agent_state': dhcp_hosta},
time=str_time)
def _disable_agent(self, agent_id, admin_state_up=False):
new_agent = {}
new_agent['agent'] = {}