Don't set HA ports down during L3 agent restart.
Because of the fix for bug [1] and an issue with linux_utils get_process_count_by_name(), the L3 agent puts all of its HA ports down during the initialization phase. Unfortunately, such an operation can break already-working L3 communication. Rewiring an ha-* port from the down state back to up can take a few seconds, and some VRRP packets may be lost in the meantime. That triggers keepalived on the other node, so an HA state change of the router may be triggered. This change prevents putting HA ports down when, during the initialization phase, the L3 agent finds its own network namespaces already configured. The existence of such a namespace is good evidence that a network configuration already exists and the host was not rebooted, so most probably this is just an agent restart. [1] https://bugs.launchpad.net/neutron/+bug/1597461 Closes-Bug: #1959151 Change-Id: Id9c906b2d141c3bedd80fb5f868190f8a4b66f54
This commit is contained in:
parent
473f4db1d6
commit
f430cd0072
@ -51,9 +51,9 @@ from neutron.agent.l3 import l3_agent_extension_api as l3_ext_api
|
||||
from neutron.agent.l3 import l3_agent_extensions_manager as l3_ext_manager
|
||||
from neutron.agent.l3 import legacy_router
|
||||
from neutron.agent.l3 import namespace_manager
|
||||
from neutron.agent.l3 import namespaces as l3_namespaces
|
||||
from neutron.agent.linux import external_process
|
||||
from neutron.agent.linux import pd
|
||||
from neutron.agent.linux import utils as linux_utils
|
||||
from neutron.agent.metadata import driver as metadata_driver
|
||||
from neutron.agent import rpc as agent_rpc
|
||||
from neutron.common import utils
|
||||
@ -370,30 +370,23 @@ class L3NATAgent(ha.AgentMixin,
|
||||
if self.ha_router_count <= 0:
|
||||
return
|
||||
|
||||
# HA routers VRRP (keepalived) process count
|
||||
vrrp_pcount = linux_utils.get_process_count_by_name("keepalived")
|
||||
LOG.debug("VRRP process count %s.", vrrp_pcount)
|
||||
# HA routers state change python monitor process count
|
||||
vrrp_st_pcount = linux_utils.get_process_count_by_name(
|
||||
"neutron-keepalived-state-change")
|
||||
LOG.debug("neutron-keepalived-state-change process count %s.",
|
||||
vrrp_st_pcount)
|
||||
# Only set HA ports down when host was rebooted so no net
|
||||
# namespaces were still created.
|
||||
if any(ns.startswith(l3_namespaces.NS_PREFIX) for ns in
|
||||
self.namespaces_manager.list_all()):
|
||||
LOG.debug("Network configuration already done. Skipping"
|
||||
" set HA port to DOWN state.")
|
||||
return
|
||||
|
||||
# Due to the process structure design of keepalived and the current
|
||||
# config of l3-ha router, it will run one main 'keepalived' process
|
||||
# and a child 'VRRP' process. So in the following check, we divided
|
||||
# number of processes by 2 to match the ha router count.
|
||||
if (not (vrrp_pcount / 2 >= self.ha_router_count and
|
||||
vrrp_st_pcount >= self.ha_router_count)):
|
||||
LOG.debug("Call neutron server to set HA port to DOWN state.")
|
||||
try:
|
||||
# We set HA network port status to DOWN to let l2 agent
|
||||
# update it to ACTIVE after wiring. This allows us to spawn
|
||||
# keepalived only when l2 agent finished wiring the port.
|
||||
self.plugin_rpc.update_all_ha_network_port_statuses(
|
||||
self.context)
|
||||
except Exception:
|
||||
LOG.exception('update_all_ha_network_port_statuses failed')
|
||||
LOG.debug("Call neutron server to set HA port to DOWN state.")
|
||||
try:
|
||||
# We set HA network port status to DOWN to let l2 agent
|
||||
# update it to ACTIVE after wiring. This allows us to spawn
|
||||
# keepalived only when l2 agent finished wiring the port.
|
||||
self.plugin_rpc.update_all_ha_network_port_statuses(
|
||||
self.context)
|
||||
except Exception:
|
||||
LOG.exception('update_all_ha_network_port_statuses failed')
|
||||
|
||||
    def _register_router_cls(self, factory):
        """Register the default router implementation on *factory*.

        Registers ``legacy_router.LegacyRouter`` with an empty feature
        list, i.e. the class used when no special router features apply.
        """
        factory.register([], legacy_router.LegacyRouter)
|
||||
|
@ -13,6 +13,7 @@
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import contextlib
|
||||
import copy
|
||||
from itertools import chain as iter_chain
|
||||
from itertools import combinations as iter_combinations
|
||||
@ -20,6 +21,7 @@ import os
|
||||
import pwd
|
||||
from unittest import mock
|
||||
|
||||
import ddt
|
||||
import eventlet
|
||||
import fixtures
|
||||
import netaddr
|
||||
@ -214,6 +216,7 @@ class IptablesFixture(fixtures.Fixture):
|
||||
iptables_manager.IptablesManager.random_fully = self.random_fully
|
||||
|
||||
|
||||
@ddt.ddt
|
||||
class TestBasicRouterOperations(BasicRouterOperationsFramework):
|
||||
def setUp(self):
|
||||
super(TestBasicRouterOperations, self).setUp()
|
||||
@ -4221,3 +4224,20 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
|
||||
agent.stop()
|
||||
self.assertTrue(router.delete.called)
|
||||
self.assertTrue(agent._exiting)
|
||||
|
||||
@ddt.data(['fip-AAA', 'snat-BBB', 'qrouter-CCC'], [])
|
||||
def test_check_ha_router_process_status(self, namespaces):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
with contextlib.ExitStack() as stack:
|
||||
list_all = stack.enter_context(mock.patch.object(
|
||||
agent.namespaces_manager, 'list_all'))
|
||||
update = stack.enter_context(mock.patch.object(
|
||||
agent.plugin_rpc, 'update_all_ha_network_port_statuses'))
|
||||
list_all.return_value = namespaces
|
||||
|
||||
agent._check_ha_router_process_status()
|
||||
|
||||
if not namespaces:
|
||||
update.assert_called_once()
|
||||
else:
|
||||
update.assert_not_called()
|
||||
|
@ -153,6 +153,10 @@ class TestDvrRouterOperations(base.BaseTestCase):
|
||||
self.ri_kwargs = {'agent_conf': self.conf,
|
||||
'interface_driver': self.mock_driver}
|
||||
|
||||
self.mock_list_all = mock.patch(
|
||||
'neutron.agent.l3.namespace_manager.NamespaceManager'
|
||||
'.list_all', return_value={}).start()
|
||||
|
||||
def _create_router(self, router=None, **kwargs):
|
||||
agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
|
||||
self.router_id = _uuid()
|
||||
|
@ -100,6 +100,8 @@ class TestMetadataDriverProcess(base.BaseTestCase):
|
||||
|
||||
def test_after_router_updated_called_on_agent_process_update(self):
|
||||
with mock.patch.object(metadata_driver, 'after_router_updated') as f,\
|
||||
mock.patch('neutron.agent.l3.namespace_manager.'
|
||||
'NamespaceManager.list_all', return_value={}),\
|
||||
mock.patch.object(router_info.RouterInfo, 'process'):
|
||||
agent = l3_agent.L3NATAgent('localhost')
|
||||
router_id = _uuid()
|
||||
@ -123,6 +125,8 @@ class TestMetadataDriverProcess(base.BaseTestCase):
|
||||
'apply'),\
|
||||
mock.patch.object(metadata_driver.MetadataDriver,
|
||||
'spawn_monitored_metadata_proxy'),\
|
||||
mock.patch('neutron.agent.l3.namespace_manager.'
|
||||
'NamespaceManager.list_all', return_value={}),\
|
||||
mock.patch.object(router_info.RouterInfo, 'process'):
|
||||
agent = l3_agent.L3NATAgent('localhost')
|
||||
router_id = _uuid()
|
||||
@ -144,7 +148,6 @@ class TestMetadataDriverProcess(base.BaseTestCase):
|
||||
cfg.CONF.set_override('metadata_proxy_socket', self.METADATA_SOCKET)
|
||||
cfg.CONF.set_override('debug', True)
|
||||
|
||||
agent = l3_agent.L3NATAgent('localhost')
|
||||
with mock.patch(ip_class_path) as ip_mock,\
|
||||
mock.patch(
|
||||
'neutron.agent.linux.external_process.'
|
||||
@ -155,9 +158,12 @@ class TestMetadataDriverProcess(base.BaseTestCase):
|
||||
mock.patch('grp.getgrnam',
|
||||
return_value=test_utils.FakeGroup(self.EGNAME)),\
|
||||
mock.patch('os.makedirs'),\
|
||||
mock.patch('neutron.agent.l3.namespace_manager.'
|
||||
'NamespaceManager.list_all', return_value={}),\
|
||||
mock.patch(
|
||||
'neutron.agent.linux.ip_lib.'
|
||||
'IpAddrCommand.wait_until_address_ready') as mock_wait:
|
||||
agent = l3_agent.L3NATAgent('localhost')
|
||||
cfg_file = os.path.join(
|
||||
metadata_driver.HaproxyConfigurator.get_config_path(
|
||||
agent.conf.state_path),
|
||||
|
Loading…
Reference in New Issue
Block a user