Do not set the HA port down at regular l3-agent restart

If the l3-agent is restarted by a regular action, such as a config
change, a package upgrade or a manual service restart, we should not
set the HA port down. The only exception is when the physical host was
rebooted, i.e. the VRRP processes were all terminated.

This patch adds a new RPC call during l3-agent init that first
retrieves the HA router count. The agent then compares the VRRP process
(keepalived) count and the 'neutron-keepalived-state-change' process
count with the count of HA routers it hosts. If the counts match, the
action that sets the HA ports to the 'DOWN' state is not triggered.

Closes-Bug: #1798475
Change-Id: I5e2bb64df0aaab11a640a798963372c8d91a06a8
This commit is contained in:
LIU Yulong 2018-12-25 17:45:05 +08:00
parent 2f3cc51784
commit 5b7d444b31
5 changed files with 87 additions and 23 deletions

View File

@ -51,6 +51,7 @@ from neutron.agent.l3 import namespace_manager
from neutron.agent.linux import external_process
from neutron.agent.linux import ip_lib
from neutron.agent.linux import pd
from neutron.agent.linux import utils as linux_utils
from neutron.agent.metadata import driver as metadata_driver
from neutron.agent import rpc as agent_rpc
from neutron.common import constants as l3_constants
@ -109,6 +110,7 @@ class L3PluginApi(object):
1.8 - Added address scope information
1.9 - Added get_router_ids
1.10 Added update_all_ha_network_port_statuses
1.11 Added get_host_ha_router_count
"""
def __init__(self, topic, host):
@ -184,6 +186,11 @@ class L3PluginApi(object):
return cctxt.call(context, 'delete_agent_gateway_port',
host=self.host, network_id=fip_net)
def get_host_ha_router_count(self, context):
    """Return the HA router count the server reports for this host.

    Makes a 'get_host_ha_router_count' call through a client prepared
    for RPC API version 1.11, passing this agent's host name.
    """
    versioned_client = self.client.prepare(version='1.11')
    rpc_kwargs = {'host': self.host}
    return versioned_client.call(
        context, 'get_host_ha_router_count', **rpc_kwargs)
@profiler.trace_cls("l3-agent")
class L3NATAgent(ha.AgentMixin,
@ -228,21 +235,22 @@ class L3NATAgent(ha.AgentMixin,
self.fullsync = True
self.sync_routers_chunk_size = SYNC_ROUTERS_MAX_CHUNK_SIZE
# Get the list of service plugins from Neutron Server
# Get the HA router count from Neutron Server
# This is the first place where we contact neutron-server on startup
# so retry in case its not ready to respond.
while True:
try:
self.neutron_service_plugins = (
self.plugin_rpc.get_service_plugin_list(self.context))
self.ha_router_count = int(
self.plugin_rpc.get_host_ha_router_count(self.context))
except oslo_messaging.MessagingTimeout as e:
LOG.warning('l3-agent cannot contact neutron server '
'to retrieve service plugins enabled. '
'to retrieve HA router count. '
'Check connectivity to neutron server. '
'Retrying... '
'Detailed message: %(msg)s.', {'msg': e})
continue
break
LOG.info("Agent HA routers count %s", self.ha_router_count)
self.init_extension_manager(self.plugin_rpc)
@ -271,13 +279,49 @@ class L3NATAgent(ha.AgentMixin,
consumers = [[topics.NETWORK, topics.UPDATE]]
agent_rpc.create_consumers([self], topics.AGENT, consumers)
# We set HA network port status to DOWN to let l2 agent update it
# to ACTIVE after wiring. This allows us to spawn keepalived only
# when l2 agent finished wiring the port.
try:
self.plugin_rpc.update_all_ha_network_port_statuses(self.context)
except Exception:
LOG.exception('update_all_ha_network_port_statuses failed')
self._check_ha_router_process_status()
def _check_ha_router_process_status(self):
    """Decide whether the HA network ports must be set to DOWN.

    Counts the running VRRP (keepalived) processes and the
    'neutron-keepalived-state-change' monitor processes on this node and
    compares both with the number of HA routers hosted by this agent.
    When enough of both are running, the agent does not ask neutron
    server to set the related HA ports to DOWN, which prevents an
    unexpected VRRP re-election. Otherwise the physical host may have
    gone down and just restarted, so the HA network port statuses are
    set to DOWN.

    Nothing is done when the agent mode is neither 'dvr_snat' nor
    'legacy', or when the agent hosts no HA router.
    """
    ha_capable_modes = [lib_const.L3_AGENT_MODE_DVR_SNAT,
                        lib_const.L3_AGENT_MODE_LEGACY]
    if self.conf.agent_mode not in ha_capable_modes:
        return

    expected_routers = self.ha_router_count
    if expected_routers <= 0:
        return

    # keepalived processes serving the HA routers
    keepalived_count = linux_utils.get_process_count_by_name("keepalived")
    LOG.debug("VRRP process count %s.", keepalived_count)

    # python state-change monitor processes serving the HA routers
    monitor_count = linux_utils.get_process_count_by_name(
        "neutron-keepalived-state-change")
    LOG.debug("neutron-keepalived-state-change process count %s.",
              monitor_count)

    # With the current l3-ha router config, keepalived runs one main
    # 'keepalived' process plus one child 'VRRP' process, so two
    # keepalived processes are expected for every HA router.
    enough_keepalived = keepalived_count >= 2 * expected_routers
    enough_monitors = monitor_count >= expected_routers
    if enough_keepalived and enough_monitors:
        return

    LOG.debug("Call neutron server to set HA port to DOWN state.")
    try:
        # Setting the HA network port status to DOWN lets the l2 agent
        # move it back to ACTIVE after wiring, so keepalived is only
        # spawned once the l2 agent has finished wiring the port.
        self.plugin_rpc.update_all_ha_network_port_statuses(self.context)
    except Exception:
        LOG.exception('update_all_ha_network_port_statuses failed')
def _check_config_params(self):
"""Check items in configuration files.

View File

@ -198,6 +198,17 @@ def find_parent_pid(pid):
return ppid.strip()
def get_process_count_by_name(name):
    """Return the number of running processes whose command name is *name*.

    Runs ``ps -C <name> -o comm=`` and counts the non-blank lines it
    prints, one per matching process.

    NOTE(review): ``ps -C`` matches on the command name, which some
    procps/kernel versions truncate to 15 characters; confirm that names
    longer than that are matched on the target platform.

    :param name: command name handed to ``ps -C``.
    :returns: the number of matching processes; 0 when ``ps`` fails with
              ProcessExecutionError or prints nothing.
    """
    try:
        out = execute(['ps', '-C', name, '-o', 'comm='],
                      log_fail_as_error=False)
    except exceptions.ProcessExecutionError:
        # Best effort: a failing ps is treated as "no such process".
        return 0
    # Count only non-blank lines so that empty output yields 0; splitting
    # an empty string on '\n' would otherwise report one process.
    return sum(1 for line in out.splitlines() if line.strip())
def find_fork_top_parent(pid):
"""Retrieve the pid of the top parent of the given pid through a fork.

View File

@ -47,7 +47,8 @@ class L3RpcCallback(object):
# 1.8 Added address scope information
# 1.9 Added get_router_ids
# 1.10 Added update_all_ha_network_port_statuses
target = oslo_messaging.Target(version='1.10')
# 1.11 Added get_host_ha_router_count
target = oslo_messaging.Target(version='1.11')
@property
def plugin(self):
@ -250,6 +251,9 @@ class L3RpcCallback(object):
def get_service_plugin_list(self, context, **kwargs):
return directory.get_plugins().keys()
def get_host_ha_router_count(self, context, host):
    """Return the l3 plugin's HA router count for *host*."""
    l3_plugin = self.l3plugin
    return l3_plugin.get_host_ha_router_count(context, host)
@db_api.retry_db_errors
def update_floatingip_statuses(self, context, router_id, fip_statuses):
"""Update operational status for a floating IP."""

View File

@ -313,6 +313,13 @@ class L3AgentSchedulerDbMixin(l3agentscheduler.L3AgentSchedulerPluginBase,
return []
return self._get_router_ids_for_agent(context, agent, router_ids)
def get_host_ha_router_count(self, context, host):
    """Return the number of admin-up HA routers hosted on *host*.

    :param context: request context passed to the object queries.
    :param host: host name passed to ``list_router_ids_on_host``.
    :returns: count of RouterExtraAttributes objects with ``ha=True``
              among the host's routers that have ``admin_state_up=True``;
              0 when the host has no routers or none of them is up.
    """
    router_ids = self.list_router_ids_on_host(context, host)
    if not router_ids:
        # Nothing is hosted here: skip both lookups rather than querying
        # with an empty id filter.
        return 0
    up_routers = l3_objs.Router.get_objects(
        context, id=router_ids, admin_state_up=True)
    up_router_ids = [router.id for router in up_routers]
    if not up_router_ids:
        # No admin-up router, so there can be no active HA router.
        return 0
    return len(l3_objs.RouterExtraAttributes.get_objects(
        context, router_id=up_router_ids, ha=True))
def _get_router_ids_for_agent(self, context, agent, router_ids):
"""Get IDs of routers that the agent should host

View File

@ -25,7 +25,6 @@ from neutron_lib.api.definitions import portbindings
from neutron_lib import constants as lib_constants
from neutron_lib import exceptions as exc
from neutron_lib.exceptions import l3 as l3_exc
from neutron_lib.plugins import constants as plugin_constants
from oslo_config import cfg
from oslo_log import log
import oslo_messaging
@ -2855,23 +2854,22 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
ri.process_address_scope()
self.assertEqual(2, mocked_func.call_count)
def test_get_service_plugin_list(self):
service_plugins = [plugin_constants.L3]
self.plugin_api.get_service_plugin_list.return_value = service_plugins
def test_get_host_ha_router_count(self):
self.plugin_api.get_host_ha_router_count.return_value = 1
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertEqual(service_plugins, agent.neutron_service_plugins)
self.assertTrue(self.plugin_api.get_service_plugin_list.called)
self.assertEqual(1, agent.ha_router_count)
self.assertTrue(self.plugin_api.get_host_ha_router_count.called)
def test_get_service_plugin_list_retried(self):
def test_get_host_ha_router_count_retried(self):
raise_timeout = oslo_messaging.MessagingTimeout()
# Raise a timeout the first 2 times it calls
# get_service_plugin_list then return a empty tuple
self.plugin_api.get_service_plugin_list.side_effect = (
raise_timeout, tuple()
# get_host_ha_router_count then return 0
self.plugin_api.get_host_ha_router_count.side_effect = (
raise_timeout, 0
)
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.assertEqual(tuple(), agent.neutron_service_plugins)
self.assertEqual(0, agent.ha_router_count)
def test_external_gateway_removed_ext_gw_port_no_fip_ns(self):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)