Don't set HA ports down during L3 agent restart.

Because of the fix for bug [1] and an issue with linux_utils
get_process_count_by_name(), the L3 agent puts all of its HA ports down
during the initialization phase. Unfortunately, such an operation can
break already working L3 communication. Rewiring an ha-* port from the
down state back to up can take a few seconds, and some VRRP packets may
be lost in the meantime. That triggers keepalived on the other node, so
a router HA state change may occur.

This change prevents putting HA ports down when, during the
initialization phase, the L3 agent finds its own already configured net
namespaces. The existence of such a net namespace is good evidence that
a network configuration already exists, meaning the host was not
rebooted and this is most probably just an agent restart.

[1] https://bugs.launchpad.net/neutron/+bug/1597461

Closes-Bug: #1959151
Change-Id: Id9c906b2d141c3bedd80fb5f868190f8a4b66f54
This commit is contained in:
labedz 2022-01-27 00:13:40 +01:00 committed by Krzysztof Tomaszewski
parent 473f4db1d6
commit f430cd0072
4 changed files with 48 additions and 25 deletions

View File

@ -51,9 +51,9 @@ from neutron.agent.l3 import l3_agent_extension_api as l3_ext_api
from neutron.agent.l3 import l3_agent_extensions_manager as l3_ext_manager
from neutron.agent.l3 import legacy_router
from neutron.agent.l3 import namespace_manager
from neutron.agent.l3 import namespaces as l3_namespaces
from neutron.agent.linux import external_process
from neutron.agent.linux import pd
from neutron.agent.linux import utils as linux_utils
from neutron.agent.metadata import driver as metadata_driver
from neutron.agent import rpc as agent_rpc
from neutron.common import utils
@ -370,30 +370,23 @@ class L3NATAgent(ha.AgentMixin,
if self.ha_router_count <= 0:
return
# HA routers VRRP (keepalived) process count
vrrp_pcount = linux_utils.get_process_count_by_name("keepalived")
LOG.debug("VRRP process count %s.", vrrp_pcount)
# HA routers state change python monitor process count
vrrp_st_pcount = linux_utils.get_process_count_by_name(
"neutron-keepalived-state-change")
LOG.debug("neutron-keepalived-state-change process count %s.",
vrrp_st_pcount)
# Only set HA ports down when host was rebooted so no net
# namespaces were still created.
if any(ns.startswith(l3_namespaces.NS_PREFIX) for ns in
self.namespaces_manager.list_all()):
LOG.debug("Network configuration already done. Skipping"
" set HA port to DOWN state.")
return
# Due to the process structure design of keepalived and the current
# config of l3-ha router, it will run one main 'keepalived' process
# and a child 'VRRP' process. So in the following check, we divided
# number of processes by 2 to match the ha router count.
if (not (vrrp_pcount / 2 >= self.ha_router_count and
vrrp_st_pcount >= self.ha_router_count)):
LOG.debug("Call neutron server to set HA port to DOWN state.")
try:
# We set HA network port status to DOWN to let l2 agent
# update it to ACTIVE after wiring. This allows us to spawn
# keepalived only when l2 agent finished wiring the port.
self.plugin_rpc.update_all_ha_network_port_statuses(
self.context)
except Exception:
LOG.exception('update_all_ha_network_port_statuses failed')
LOG.debug("Call neutron server to set HA port to DOWN state.")
try:
# We set HA network port status to DOWN to let l2 agent
# update it to ACTIVE after wiring. This allows us to spawn
# keepalived only when l2 agent finished wiring the port.
self.plugin_rpc.update_all_ha_network_port_statuses(
self.context)
except Exception:
LOG.exception('update_all_ha_network_port_statuses failed')
def _register_router_cls(self, factory):
factory.register([], legacy_router.LegacyRouter)

View File

@ -13,6 +13,7 @@
# License for the specific language governing permissions and limitations
# under the License.
import contextlib
import copy
from itertools import chain as iter_chain
from itertools import combinations as iter_combinations
@ -20,6 +21,7 @@ import os
import pwd
from unittest import mock
import ddt
import eventlet
import fixtures
import netaddr
@ -214,6 +216,7 @@ class IptablesFixture(fixtures.Fixture):
iptables_manager.IptablesManager.random_fully = self.random_fully
@ddt.ddt
class TestBasicRouterOperations(BasicRouterOperationsFramework):
def setUp(self):
super(TestBasicRouterOperations, self).setUp()
@ -4221,3 +4224,20 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
agent.stop()
self.assertTrue(router.delete.called)
self.assertTrue(agent._exiting)
@ddt.data(['fip-AAA', 'snat-BBB', 'qrouter-CCC'], [])
def test_check_ha_router_process_status(self, namespaces):
    """Verify HA ports are only reset to DOWN when no namespaces exist.

    Runs twice via ddt: once with pre-existing router namespaces
    (simulating a plain agent restart) and once with none (simulating
    a host reboot). Only in the reboot case should the agent ask the
    server to set HA network ports DOWN.
    """
    agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
    with contextlib.ExitStack() as stack:
        # Control what namespaces the agent believes already exist on
        # the host.
        list_all = stack.enter_context(mock.patch.object(
            agent.namespaces_manager, 'list_all'))
        # Intercept the RPC call that would flip HA ports to DOWN on
        # the server side.
        update = stack.enter_context(mock.patch.object(
            agent.plugin_rpc, 'update_all_ha_network_port_statuses'))
        list_all.return_value = namespaces
        agent._check_ha_router_process_status()
        if not namespaces:
            # No namespaces found: treated as a fresh boot, so the
            # ports must be reset.
            update.assert_called_once()
        else:
            # Namespaces already present: agent restart, the reset RPC
            # must be skipped.
            update.assert_not_called()

View File

@ -153,6 +153,10 @@ class TestDvrRouterOperations(base.BaseTestCase):
self.ri_kwargs = {'agent_conf': self.conf,
'interface_driver': self.mock_driver}
self.mock_list_all = mock.patch(
'neutron.agent.l3.namespace_manager.NamespaceManager'
'.list_all', return_value={}).start()
def _create_router(self, router=None, **kwargs):
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
self.router_id = _uuid()

View File

@ -100,6 +100,8 @@ class TestMetadataDriverProcess(base.BaseTestCase):
def test_after_router_updated_called_on_agent_process_update(self):
with mock.patch.object(metadata_driver, 'after_router_updated') as f,\
mock.patch('neutron.agent.l3.namespace_manager.'
'NamespaceManager.list_all', return_value={}),\
mock.patch.object(router_info.RouterInfo, 'process'):
agent = l3_agent.L3NATAgent('localhost')
router_id = _uuid()
@ -123,6 +125,8 @@ class TestMetadataDriverProcess(base.BaseTestCase):
'apply'),\
mock.patch.object(metadata_driver.MetadataDriver,
'spawn_monitored_metadata_proxy'),\
mock.patch('neutron.agent.l3.namespace_manager.'
'NamespaceManager.list_all', return_value={}),\
mock.patch.object(router_info.RouterInfo, 'process'):
agent = l3_agent.L3NATAgent('localhost')
router_id = _uuid()
@ -144,7 +148,6 @@ class TestMetadataDriverProcess(base.BaseTestCase):
cfg.CONF.set_override('metadata_proxy_socket', self.METADATA_SOCKET)
cfg.CONF.set_override('debug', True)
agent = l3_agent.L3NATAgent('localhost')
with mock.patch(ip_class_path) as ip_mock,\
mock.patch(
'neutron.agent.linux.external_process.'
@ -155,9 +158,12 @@ class TestMetadataDriverProcess(base.BaseTestCase):
mock.patch('grp.getgrnam',
return_value=test_utils.FakeGroup(self.EGNAME)),\
mock.patch('os.makedirs'),\
mock.patch('neutron.agent.l3.namespace_manager.'
'NamespaceManager.list_all', return_value={}),\
mock.patch(
'neutron.agent.linux.ip_lib.'
'IpAddrCommand.wait_until_address_ready') as mock_wait:
agent = l3_agent.L3NATAgent('localhost')
cfg_file = os.path.join(
metadata_driver.HaproxyConfigurator.get_config_path(
agent.conf.state_path),