Do not set the HA port down at regular l3-agent restart
If the l3-agent was restarted by a regular action, such as a config change, package upgrade, or manual service restart, we should not set the HA port down — unless the physical host was rebooted, i.e. the VRRP processes were all terminated. This patch adds a new RPC call during l3-agent init: it first retrieves the hosted HA router count, and then compares the VRRP process (keepalived) count and the 'neutron-keepalived-state-change' process count with that hosted-router count. If the counts match, the action that sets the HA ports to the 'DOWN' state is no longer triggered. Closes-Bug: #1798475 Change-Id: I5e2bb64df0aaab11a640a798963372c8d91a06a8
This commit is contained in:
parent
2f3cc51784
commit
5b7d444b31
|
@ -51,6 +51,7 @@ from neutron.agent.l3 import namespace_manager
|
|||
from neutron.agent.linux import external_process
|
||||
from neutron.agent.linux import ip_lib
|
||||
from neutron.agent.linux import pd
|
||||
from neutron.agent.linux import utils as linux_utils
|
||||
from neutron.agent.metadata import driver as metadata_driver
|
||||
from neutron.agent import rpc as agent_rpc
|
||||
from neutron.common import constants as l3_constants
|
||||
|
@ -109,6 +110,7 @@ class L3PluginApi(object):
|
|||
1.8 - Added address scope information
|
||||
1.9 - Added get_router_ids
|
||||
1.10 Added update_all_ha_network_port_statuses
|
||||
1.11 Added get_host_ha_router_count
|
||||
"""
|
||||
|
||||
def __init__(self, topic, host):
|
||||
|
@ -184,6 +186,11 @@ class L3PluginApi(object):
|
|||
return cctxt.call(context, 'delete_agent_gateway_port',
|
||||
host=self.host, network_id=fip_net)
|
||||
|
||||
def get_host_ha_router_count(self, context):
    """Query neutron-server for the number of HA routers on this host."""
    client = self.client.prepare(version='1.11')
    return client.call(context, 'get_host_ha_router_count',
                       host=self.host)
|
||||
|
||||
|
||||
@profiler.trace_cls("l3-agent")
|
||||
class L3NATAgent(ha.AgentMixin,
|
||||
|
@ -228,21 +235,22 @@ class L3NATAgent(ha.AgentMixin,
|
|||
self.fullsync = True
|
||||
self.sync_routers_chunk_size = SYNC_ROUTERS_MAX_CHUNK_SIZE
|
||||
|
||||
# Get the list of service plugins from Neutron Server
|
||||
# Get the HA router count from Neutron Server
|
||||
# This is the first place where we contact neutron-server on startup
|
||||
# so retry in case its not ready to respond.
|
||||
while True:
|
||||
try:
|
||||
self.neutron_service_plugins = (
|
||||
self.plugin_rpc.get_service_plugin_list(self.context))
|
||||
self.ha_router_count = int(
|
||||
self.plugin_rpc.get_host_ha_router_count(self.context))
|
||||
except oslo_messaging.MessagingTimeout as e:
|
||||
LOG.warning('l3-agent cannot contact neutron server '
|
||||
'to retrieve service plugins enabled. '
|
||||
'to retrieve HA router count. '
|
||||
'Check connectivity to neutron server. '
|
||||
'Retrying... '
|
||||
'Detailed message: %(msg)s.', {'msg': e})
|
||||
continue
|
||||
break
|
||||
LOG.info("Agent HA routers count %s", self.ha_router_count)
|
||||
|
||||
self.init_extension_manager(self.plugin_rpc)
|
||||
|
||||
|
@ -271,13 +279,49 @@ class L3NATAgent(ha.AgentMixin,
|
|||
consumers = [[topics.NETWORK, topics.UPDATE]]
|
||||
agent_rpc.create_consumers([self], topics.AGENT, consumers)
|
||||
|
||||
# We set HA network port status to DOWN to let l2 agent update it
|
||||
# to ACTIVE after wiring. This allows us to spawn keepalived only
|
||||
# when l2 agent finished wiring the port.
|
||||
try:
|
||||
self.plugin_rpc.update_all_ha_network_port_statuses(self.context)
|
||||
except Exception:
|
||||
LOG.exception('update_all_ha_network_port_statuses failed')
|
||||
self._check_ha_router_process_status()
|
||||
|
||||
def _check_ha_router_process_status(self):
    """Check HA router VRRP process status on this network node.

    Compare the count of running keepalived (VRRP) processes and
    'neutron-keepalived-state-change' monitor processes against the
    number of HA routers this agent hosts.  If the counts line up,
    l3-agent will not ask neutron-server to set the related HA ports
    to DOWN state, which prevents an unexpected VRRP re-election after
    a regular agent restart.  If they do not line up, the physical
    host may have gone down and just restarted (all VRRP processes
    terminated), so the HA network port statuses are reset to DOWN.
    """
    # HA routers are only hosted in legacy and dvr_snat agent modes;
    # nothing to check in the other modes.
    if (self.conf.agent_mode not in [lib_const.L3_AGENT_MODE_DVR_SNAT,
                                     lib_const.L3_AGENT_MODE_LEGACY]):
        return

    # No HA routers scheduled here (count retrieved at agent init);
    # nothing to compare against.
    if self.ha_router_count <= 0:
        return

    # HA routers VRRP (keepalived) process count
    vrrp_pcount = linux_utils.get_process_count_by_name("keepalived")
    LOG.debug("VRRP process count %s.", vrrp_pcount)
    # HA routers state change python monitor process count
    vrrp_st_pcount = linux_utils.get_process_count_by_name(
        "neutron-keepalived-state-change")
    LOG.debug("neutron-keepalived-state-change process count %s.",
              vrrp_st_pcount)

    # Due to the process structure design of keepalived and the current
    # config of l3-ha router, it will run one main 'keepalived' process
    # and a child 'VRRP' process. So in the following check, we divided
    # number of processes by 2 to match the ha router count.
    if (not (vrrp_pcount / 2 >= self.ha_router_count and
             vrrp_st_pcount >= self.ha_router_count)):
        LOG.debug("Call neutron server to set HA port to DOWN state.")
        try:
            # We set HA network port status to DOWN to let l2 agent
            # update it to ACTIVE after wiring. This allows us to spawn
            # keepalived only when l2 agent finished wiring the port.
            self.plugin_rpc.update_all_ha_network_port_statuses(
                self.context)
        except Exception:
            # Best-effort: a failed RPC here must not abort agent
            # startup; the error is logged for the operator.
            LOG.exception('update_all_ha_network_port_statuses failed')
|
||||
|
||||
def _check_config_params(self):
|
||||
"""Check items in configuration files.
|
||||
|
|
|
@ -198,6 +198,17 @@ def find_parent_pid(pid):
|
|||
return ppid.strip()
|
||||
|
||||
|
||||
def get_process_count_by_name(name):
    """Return the number of running processes whose command name is *name*.

    Counts the lines emitted by ``ps -C <name> -o comm=``.  When no
    matching process exists ``ps`` exits non-zero, which ``execute``
    surfaces as ProcessExecutionError; that case is reported as 0.
    """
    try:
        out = execute(['ps', '-C', name, '-o', 'comm='],
                      log_fail_as_error=False)
    except exceptions.ProcessExecutionError:
        # 'ps -C' failing simply means no process matched; this is an
        # expected outcome, not an error, so swallow it deliberately.
        # (The original save_and_reraise_exception(reraise=False)
        # wrapper was a no-op around this return.)
        return 0
    return len(out.strip('\n').split('\n'))
|
||||
|
||||
|
||||
def find_fork_top_parent(pid):
|
||||
"""Retrieve the pid of the top parent of the given pid through a fork.
|
||||
|
||||
|
|
|
@ -47,7 +47,8 @@ class L3RpcCallback(object):
|
|||
# 1.8 Added address scope information
|
||||
# 1.9 Added get_router_ids
|
||||
# 1.10 Added update_all_ha_network_port_statuses
|
||||
target = oslo_messaging.Target(version='1.10')
|
||||
# 1.11 Added get_host_ha_router_count
|
||||
target = oslo_messaging.Target(version='1.11')
|
||||
|
||||
@property
|
||||
def plugin(self):
|
||||
|
@ -250,6 +251,9 @@ class L3RpcCallback(object):
|
|||
def get_service_plugin_list(self, context, **kwargs):
    """Return the identifiers of the service plugins loaded on the server."""
    loaded_plugins = directory.get_plugins()
    return loaded_plugins.keys()
|
||||
|
||||
def get_host_ha_router_count(self, context, host):
    """Return the number of HA routers hosted on *host*."""
    # Thin RPC-facing wrapper: delegate straight to the l3 plugin.
    plugin = self.l3plugin
    return plugin.get_host_ha_router_count(context, host)
|
||||
|
||||
@db_api.retry_db_errors
|
||||
def update_floatingip_statuses(self, context, router_id, fip_statuses):
|
||||
"""Update operational status for a floating IP."""
|
||||
|
|
|
@ -313,6 +313,13 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
|
|||
return []
|
||||
return self._get_router_ids_for_agent(context, agent, router_ids)
|
||||
|
||||
def get_host_ha_router_count(self, context, host):
    """Return how many admin-state-up HA routers are scheduled to *host*."""
    # Routers currently bound to this host's l3 agent.
    hosted_ids = self.list_router_ids_on_host(context, host)
    # Keep only the routers that are administratively up.
    active_routers = l3_objs.Router.get_objects(context, id=hosted_ids,
                                                admin_state_up=True)
    active_ids = [router.id for router in active_routers]
    # Of those, count the ones flagged as HA via their extra attributes.
    ha_attrs = l3_objs.RouterExtraAttributes.get_objects(
        context, router_id=active_ids, ha=True)
    return len(ha_attrs)
|
||||
|
||||
def _get_router_ids_for_agent(self, context, agent, router_ids):
|
||||
"""Get IDs of routers that the agent should host
|
||||
|
||||
|
|
|
@ -25,7 +25,6 @@ from neutron_lib.api.definitions import portbindings
|
|||
from neutron_lib import constants as lib_constants
|
||||
from neutron_lib import exceptions as exc
|
||||
from neutron_lib.exceptions import l3 as l3_exc
|
||||
from neutron_lib.plugins import constants as plugin_constants
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log
|
||||
import oslo_messaging
|
||||
|
@ -2855,23 +2854,22 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
|
|||
ri.process_address_scope()
|
||||
self.assertEqual(2, mocked_func.call_count)
|
||||
|
||||
def test_get_host_ha_router_count(self):
    """Agent init retrieves and stores the hosted HA router count."""
    # The scraped diff interleaved the removed test_get_service_plugin_list
    # with this added test; this is the reconstructed post-patch test.
    self.plugin_api.get_host_ha_router_count.return_value = 1
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    self.assertEqual(1, agent.ha_router_count)
    self.assertTrue(self.plugin_api.get_host_ha_router_count.called)
|
||||
|
||||
def test_get_host_ha_router_count_retried(self):
    """Agent init retries the HA-router-count RPC on MessagingTimeout."""
    raise_timeout = oslo_messaging.MessagingTimeout()
    # Raise a timeout the first time it calls
    # get_host_ha_router_count then return 0
    self.plugin_api.get_host_ha_router_count.side_effect = (
        raise_timeout, 0
    )
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)

    self.assertEqual(0, agent.ha_router_count)
|
||||
|
||||
def test_external_gateway_removed_ext_gw_port_no_fip_ns(self):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
|
|
Loading…
Reference in New Issue