From 30f35e08f92e5262e7a9108684da048d11402b07 Mon Sep 17 00:00:00 2001 From: Swaminathan Vasudevan Date: Tue, 12 Feb 2019 11:27:51 -0800 Subject: [PATCH] Packets getting lost during SNAT with too many connections We have a problem with SNAT with too many connections using the same source and destination on the network nodes. In addition we can see in the conntrack table that the who "instert_failed" increases. This might be a generic problem with conntrack and linux. We suspect that we encounter the following "limitation / bug" in the kernel. There seems to be a workaround to alleviate this behavior by setting the -random-fully flag in iptables for port consumption. This patch fixes the problem by adding the --random-fully to the SNAT rules. Change-Id: I246c1f56df889bad9c7e140b56c3614124d80a19 Closes-Bug: #1814002 --- neutron/agent/l3/dvr_edge_router.py | 6 +- neutron/agent/l3/dvr_local_router.py | 7 ++- neutron/agent/l3/router_info.py | 26 ++++---- neutron/agent/linux/iptables_manager.py | 22 +++++++ neutron/common/_constants.py | 4 ++ neutron/common/utils.py | 7 +++ neutron/tests/unit/agent/l3/test_agent.py | 61 +++++++++++++++---- .../unit/agent/l3/test_dvr_local_router.py | 8 ++- neutron/tests/unit/common/test_utils.py | 11 ++++ 9 files changed, 122 insertions(+), 30 deletions(-) diff --git a/neutron/agent/l3/dvr_edge_router.py b/neutron/agent/l3/dvr_edge_router.py index a507e8268c2..20b59dc33eb 100644 --- a/neutron/agent/l3/dvr_edge_router.py +++ b/neutron/agent/l3/dvr_edge_router.py @@ -341,12 +341,14 @@ class DvrEdgeRouter(dvr_local_router.DvrLocalRouter): return lib_constants.FLOATINGIP_STATUS_ACTIVE def _centralized_floating_forward_rules(self, floating_ip, fixed_ip): + to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip) + if self.snat_iptables_manager.random_fully: + to_source += ' --random-fully' return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' % (floating_ip, fixed_ip)), ('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' % (floating_ip, fixed_ip)), - ('float-snat', '-s %s/32 -j SNAT --to-source %s' % - (fixed_ip, floating_ip))] + ('float-snat', to_source)] def _set_floating_ip_nat_rules_for_centralized_floatingip(self, fip): if fip.get(lib_constants.DVR_SNAT_BOUND): diff --git a/neutron/agent/l3/dvr_local_router.py b/neutron/agent/l3/dvr_local_router.py index 28557dd5962..59693025e77 100644 --- a/neutron/agent/l3/dvr_local_router.py +++ b/neutron/agent/l3/dvr_local_router.py @@ -74,9 +74,10 @@ class DvrLocalRouter(dvr_router_base.DvrRouterBase): dnat_from_floatingip_to_fixedip = ( 'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % ( floating_ip, rtr_2_fip_name, fixed_ip)) - snat_from_fixedip_to_floatingip = ( - 'float-snat', '-s %s/32 -j SNAT --to-source %s' % ( - fixed_ip, floating_ip)) + to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip) + if self.iptables_manager.random_fully: + to_source += ' --random-fully' + snat_from_fixedip_to_floatingip = ('float-snat', to_source) return [dnat_from_floatingip_to_fixedip, snat_from_fixedip_to_floatingip] diff --git a/neutron/agent/l3/router_info.py b/neutron/agent/l3/router_info.py index 9b44ff41262..9c260152bdc 100644 --- a/neutron/agent/l3/router_info.py +++ b/neutron/agent/l3/router_info.py @@ -172,12 +172,14 @@ class RouterInfo(object): def floating_forward_rules(self, fip): fixed_ip = fip['fixed_ip_address'] floating_ip = fip['floating_ip_address'] + to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip) + if self.iptables_manager.random_fully: + to_source += ' --random-fully' return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' % (floating_ip, fixed_ip)), ('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' % (floating_ip, fixed_ip)), - ('float-snat', '-s %s/32 -j SNAT --to-source %s' % - (fixed_ip, floating_ip))] + ('float-snat', to_source)] def floating_mangle_rules(self, floating_ip, fixed_ip, internal_mark): mark_traffic_to_floating_ip = ( @@ -854,19 +856,21 @@ class RouterInfo(object): self._prevent_snat_for_internal_traffic_rule(interface_name)) # Makes replies come back through the router to reverse DNAT ext_in_mark = self.agent_conf.external_ingress_mark - snat_internal_traffic_to_floating_ip = ( - 'snat', '-m mark ! --mark %s/%s ' - '-m conntrack --ctstate DNAT ' - '-j SNAT --to-source %s' - % (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip)) + to_source = ('-m mark ! --mark %s/%s ' + '-m conntrack --ctstate DNAT ' + '-j SNAT --to-source %s' + % (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip)) + if self.iptables_manager.random_fully: + to_source += ' --random-fully' + snat_internal_traffic_to_floating_ip = ('snat', to_source) return [dont_snat_traffic_to_internal_ports_if_not_to_floating_ip, snat_internal_traffic_to_floating_ip] def external_gateway_nat_snat_rules(self, ex_gw_ip, interface_name): - snat_normal_external_traffic = ( - 'snat', '-o %s -j SNAT --to-source %s' % - (interface_name, ex_gw_ip)) - return [snat_normal_external_traffic] + to_source = '-o %s -j SNAT --to-source %s' % (interface_name, ex_gw_ip) + if self.iptables_manager.random_fully: + to_source += ' --random-fully' + return [('snat', to_source)] def external_gateway_mangle_rules(self, interface_name): mark = self.agent_conf.external_ingress_mark diff --git a/neutron/agent/linux/iptables_manager.py b/neutron/agent/linux/iptables_manager.py index 45c021484cb..781bedf725e 100644 --- a/neutron/agent/linux/iptables_manager.py +++ b/neutron/agent/linux/iptables_manager.py @@ -38,6 +38,8 @@ from neutron._i18n import _ from neutron.agent.linux import ip_lib from neutron.agent.linux import iptables_comments as ic from neutron.agent.linux import utils as linux_utils +from neutron.common import _constants as n_const +from neutron.common import utils from neutron.conf.agent import common as config LOG = logging.getLogger(__name__) @@ -302,6 +304,9 @@ class IptablesManager(object): # run iptables-restore without it. use_table_lock = False + # Flag to denote iptables supports --random-fully argument + _random_fully = None + def __init__(self, _execute=None, state_less=False, use_ipv6=False, namespace=None, binary_name=binary_name): if _execute: @@ -473,6 +478,23 @@ class IptablesManager(object): args = ['ip', 'netns', 'exec', self.namespace] + args return self.execute(args, run_as_root=True).split('\n') + def _get_version(self): + # Output example is "iptables v1.6.2" + args = ['iptables', '--version'] + version = str(self.execute(args, run_as_root=True).split()[1][1:]) + LOG.debug("IPTables version installed: %s", version) + return version + + @property + def random_fully(self): + if self._random_fully is not None: + return self._random_fully + + version = self._get_version() + self.__class__._random_fully = utils.is_version_greater_equal( + version, n_const.IPTABLES_RANDOM_FULLY_VERSION) + return self._random_fully + @property def xlock_wait_time(self): # give agent some time to report back to server diff --git a/neutron/common/_constants.py b/neutron/common/_constants.py index d2d5337a5b0..13056a51288 100644 --- a/neutron/common/_constants.py +++ b/neutron/common/_constants.py @@ -56,3 +56,7 @@ AGENT_RES_PROCESSING_STEP = 100 # Number of resources for neutron to divide the large RPC # call data sets. RPC_RES_PROCESSING_STEP = 20 + +# IPtables version to support --random-fully option. +# Do not move this constant to neutron-lib, since it is temporary +IPTABLES_RANDOM_FULLY_VERSION = '1.6.2' diff --git a/neutron/common/utils.py b/neutron/common/utils.py index a8bad8fa71a..396f398d354 100644 --- a/neutron/common/utils.py +++ b/neutron/common/utils.py @@ -41,6 +41,7 @@ from oslo_config import cfg from oslo_db import exception as db_exc from oslo_log import log as logging from oslo_utils import excutils +import pkg_resources import six import neutron @@ -312,6 +313,12 @@ def get_socket_address_family(ip_version): else socket.AF_INET6)) +def is_version_greater_equal(version1, version2): + """Returns True if version1 is greater or equal than version2 else False""" + return (pkg_resources.parse_version(version1) >= + pkg_resources.parse_version(version2)) + + class DelayedStringRenderer(object): """Takes a callable and its args and calls when __str__ is called diff --git a/neutron/tests/unit/agent/l3/test_agent.py b/neutron/tests/unit/agent/l3/test_agent.py index b211b3c4429..e8cd8a3f1e0 100644 --- a/neutron/tests/unit/agent/l3/test_agent.py +++ b/neutron/tests/unit/agent/l3/test_agent.py @@ -18,6 +18,7 @@ from itertools import chain as iter_chain from itertools import combinations as iter_combinations import eventlet +import fixtures import mock import netaddr from neutron_lib.agent import constants as agent_consts @@ -192,10 +193,28 @@ class BasicRouterOperationsFramework(base.BaseTestCase): ri.process() +class IptablesFixture(fixtures.Fixture): + def _setUp(self): + # We MUST save and restore random_fully because it is a class + # attribute and could change state in some tests, which can cause + # the other router test cases to randomly fail due to race conditions. + self.random_fully = iptables_manager.IptablesManager.random_fully + iptables_manager.IptablesManager.random_fully = True + self.addCleanup(self._reset) + + def _reset(self): + iptables_manager.IptablesManager.random_fully = self.random_fully + + class TestBasicRouterOperations(BasicRouterOperationsFramework): + def setUp(self): + super(TestBasicRouterOperations, self).setUp() + self.useFixture(IptablesFixture()) + def test_request_id_changes(self): a = l3_agent.L3NATAgent(HOSTNAME, self.conf) self.assertNotEqual(a.context.request_id, a.context.request_id) + self.useFixture(IptablesFixture()) def test_init_ha_conf(self): with mock.patch('os.path.dirname', return_value='/etc/ha/'): @@ -1022,7 +1041,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): self._test_external_gateway_action('remove', router, dual_stack=True) def _verify_snat_mangle_rules(self, nat_rules, mangle_rules, router, - negate=False): + random_fully, negate=False): interfaces = router[lib_constants.INTERFACE_KEY] source_cidrs = [] for iface in interfaces: @@ -1033,13 +1052,18 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): source_cidrs.append(source_cidr) source_nat_ip = router['gw_port']['fixed_ips'][0]['ip_address'] interface_name = ('qg-%s' % router['gw_port']['id'])[:14] + mask_rule = ('-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT ' + '-j SNAT --to-source %s' % + (lib_constants.ROUTER_MARK_MASK, source_nat_ip)) + snat_rule = ('-o %s -j SNAT --to-source %s' % + (interface_name, source_nat_ip)) + if random_fully: + mask_rule += ' --random-fully' + snat_rule += ' --random-fully' expected_rules = [ '! -i %s ! -o %s -m conntrack ! --ctstate DNAT -j ACCEPT' % (interface_name, interface_name), - '-o %s -j SNAT --to-source %s' % (interface_name, source_nat_ip), - '-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT ' - '-j SNAT --to-source %s' % - (lib_constants.ROUTER_MARK_MASK, source_nat_ip)] + mask_rule, snat_rule] for r in nat_rules: if negate: self.assertNotIn(r.rule, expected_rules) @@ -1631,7 +1655,8 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): ri.get_external_device_name = mock.Mock(return_value='exgw') self._test_process_floating_ip_addresses_add(ri, agent) - def test_process_router_snat_disabled(self): + def _test_process_router_snat_disabled(self, random_fully): + iptables_manager.IptablesManager.random_fully = random_fully agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) router = l3_test_common.prepare_router_data(enable_snat=True) ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs) @@ -1655,10 +1680,17 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): if r not in ri.iptables_manager.ipv4['mangle'].rules] self.assertEqual(1, len(mangle_rules_delta)) self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta, - router) + router, random_fully) self.assertEqual(1, self.send_adv_notif.call_count) - def test_process_router_snat_enabled(self): + def test_process_router_snat_disabled_random_fully(self): + self._test_process_router_snat_disabled(True) + + def test_process_router_snat_disabled_random_fully_false(self): + self._test_process_router_snat_disabled(False) + + def _test_process_router_snat_enabled(self, random_fully): + iptables_manager.IptablesManager.random_fully = random_fully agent = l3_agent.L3NATAgent(HOSTNAME, self.conf) router = l3_test_common.prepare_router_data(enable_snat=False) ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs) @@ -1682,9 +1714,15 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): if r not in orig_mangle_rules] self.assertEqual(1, len(mangle_rules_delta)) self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta, - router) + router, random_fully) self.assertEqual(1, self.send_adv_notif.call_count) + def test_process_router_snat_enabled_random_fully(self): + self._test_process_router_snat_enabled(True) + + def test_process_router_snat_enabled_random_fully_false(self): + self._test_process_router_snat_enabled(False) + def _test_update_routing_table(self, is_snat_host=True): router = l3_test_common.prepare_router_data() uuid = router['id'] @@ -2292,11 +2330,12 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework): jump_float_rule = "-A %s-snat -j %s-float-snat" % (wrap_name, wrap_name) - snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s") % ( + snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s " + "--random-fully") % ( wrap_name, ex_gw_port['fixed_ips'][0]['ip_address']) snat_rule2 = ("-A %s-snat -m mark ! --mark 0x2/%s " "-m conntrack --ctstate DNAT " - "-j SNAT --to-source %s") % ( + "-j SNAT --to-source %s --random-fully") % ( wrap_name, lib_constants.ROUTER_MARK_MASK, ex_gw_port['fixed_ips'][0]['ip_address']) diff --git a/neutron/tests/unit/agent/l3/test_dvr_local_router.py b/neutron/tests/unit/agent/l3/test_dvr_local_router.py index bdb7cecc07c..d9ad4138432 100644 --- a/neutron/tests/unit/agent/l3/test_dvr_local_router.py +++ b/neutron/tests/unit/agent/l3/test_dvr_local_router.py @@ -259,9 +259,11 @@ class TestDvrRouterOperations(base.BaseTestCase): dnat_from_floatingip_to_fixedip = ( 'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % ( floating_ip, rtr_2_fip_name, fixed_ip)) - snat_from_fixedip_to_floatingip = ( - 'float-snat', '-s %s/32 -j SNAT --to-source %s' % ( - fixed_ip, floating_ip)) + to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip) + + if ri.iptables_manager.random_fully: + to_source += ' --random-fully' + snat_from_fixedip_to_floatingip = ('float-snat', to_source) actual = ri.floating_forward_rules(fip) expected = [dnat_from_floatingip_to_fixedip, snat_from_fixedip_to_floatingip] diff --git a/neutron/tests/unit/common/test_utils.py b/neutron/tests/unit/common/test_utils.py index 92c82522cac..c21ad669d3b 100644 --- a/neutron/tests/unit/common/test_utils.py +++ b/neutron/tests/unit/common/test_utils.py @@ -281,6 +281,17 @@ class TestIpVersionFromInt(base.BaseTestCase): 8) +class TestIsVersionGreaterEqual(base.BaseTestCase): + def test_is_version_greater_equal_greater(self): + self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.0')) + + def test_is_version_greater_equal_equal(self): + self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.2')) + + def test_is_version_greater_equal_less(self): + self.assertFalse(utils.is_version_greater_equal('1.6.0', '1.6.2')) + + class TestDelayedStringRenderer(base.BaseTestCase): def test_call_deferred_until_str(self): my_func = mock.MagicMock(return_value='Brie cheese!')