Browse Source

Packets getting lost during SNAT with too many connections

Packets are lost during SNAT on the network nodes when too many
connections use the same source and destination.

In addition, we can see in the conntrack table that the
"insert_failed" counter increases.

This might be a generic problem with conntrack and Linux.
We suspect that we are encountering the following "limitation / bug"
in the kernel.

There seems to be a workaround to alleviate this behavior by
setting the --random-fully flag in iptables for port consumption.

This patch fixes the problem by adding the --random-fully flag to
the SNAT rules when the installed iptables version supports it
(1.6.2 or later).

Change-Id: I246c1f56df889bad9c7e140b56c3614124d80a19
Closes-Bug: #1814002
changes/73/636473/32
Swaminathan Vasudevan 4 months ago
parent
commit
30f35e08f9

+ 4
- 2
neutron/agent/l3/dvr_edge_router.py View File

@@ -341,12 +341,14 @@ class DvrEdgeRouter(dvr_local_router.DvrLocalRouter):
341 341
         return lib_constants.FLOATINGIP_STATUS_ACTIVE
342 342
 
343 343
     def _centralized_floating_forward_rules(self, floating_ip, fixed_ip):
344
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
345
+        if self.snat_iptables_manager.random_fully:
346
+            to_source += ' --random-fully'
344 347
         return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' %
345 348
                  (floating_ip, fixed_ip)),
346 349
                 ('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' %
347 350
                  (floating_ip, fixed_ip)),
348
-                ('float-snat', '-s %s/32 -j SNAT --to-source %s' %
349
-                 (fixed_ip, floating_ip))]
351
+                ('float-snat', to_source)]
350 352
 
351 353
     def _set_floating_ip_nat_rules_for_centralized_floatingip(self, fip):
352 354
         if fip.get(lib_constants.DVR_SNAT_BOUND):

+ 4
- 3
neutron/agent/l3/dvr_local_router.py View File

@@ -74,9 +74,10 @@ class DvrLocalRouter(dvr_router_base.DvrRouterBase):
74 74
         dnat_from_floatingip_to_fixedip = (
75 75
             'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % (
76 76
                 floating_ip, rtr_2_fip_name, fixed_ip))
77
-        snat_from_fixedip_to_floatingip = (
78
-            'float-snat', '-s %s/32 -j SNAT --to-source %s' % (
79
-                fixed_ip, floating_ip))
77
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
78
+        if self.iptables_manager.random_fully:
79
+            to_source += ' --random-fully'
80
+        snat_from_fixedip_to_floatingip = ('float-snat', to_source)
80 81
         return [dnat_from_floatingip_to_fixedip,
81 82
                 snat_from_fixedip_to_floatingip]
82 83
 

+ 15
- 11
neutron/agent/l3/router_info.py View File

@@ -172,12 +172,14 @@ class RouterInfo(object):
172 172
     def floating_forward_rules(self, fip):
173 173
         fixed_ip = fip['fixed_ip_address']
174 174
         floating_ip = fip['floating_ip_address']
175
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
176
+        if self.iptables_manager.random_fully:
177
+            to_source += ' --random-fully'
175 178
         return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' %
176 179
                  (floating_ip, fixed_ip)),
177 180
                 ('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' %
178 181
                  (floating_ip, fixed_ip)),
179
-                ('float-snat', '-s %s/32 -j SNAT --to-source %s' %
180
-                 (fixed_ip, floating_ip))]
182
+                ('float-snat', to_source)]
181 183
 
182 184
     def floating_mangle_rules(self, floating_ip, fixed_ip, internal_mark):
183 185
         mark_traffic_to_floating_ip = (
@@ -854,19 +856,21 @@ class RouterInfo(object):
854 856
             self._prevent_snat_for_internal_traffic_rule(interface_name))
855 857
         # Makes replies come back through the router to reverse DNAT
856 858
         ext_in_mark = self.agent_conf.external_ingress_mark
857
-        snat_internal_traffic_to_floating_ip = (
858
-            'snat', '-m mark ! --mark %s/%s '
859
-                    '-m conntrack --ctstate DNAT '
860
-                    '-j SNAT --to-source %s'
861
-                    % (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip))
859
+        to_source = ('-m mark ! --mark %s/%s '
860
+                     '-m conntrack --ctstate DNAT '
861
+                     '-j SNAT --to-source %s'
862
+                     % (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip))
863
+        if self.iptables_manager.random_fully:
864
+            to_source += ' --random-fully'
865
+        snat_internal_traffic_to_floating_ip = ('snat', to_source)
862 866
         return [dont_snat_traffic_to_internal_ports_if_not_to_floating_ip,
863 867
                 snat_internal_traffic_to_floating_ip]
864 868
 
865 869
     def external_gateway_nat_snat_rules(self, ex_gw_ip, interface_name):
866
-        snat_normal_external_traffic = (
867
-            'snat', '-o %s -j SNAT --to-source %s' %
868
-                    (interface_name, ex_gw_ip))
869
-        return [snat_normal_external_traffic]
870
+        to_source = '-o %s -j SNAT --to-source %s' % (interface_name, ex_gw_ip)
871
+        if self.iptables_manager.random_fully:
872
+            to_source += ' --random-fully'
873
+        return [('snat', to_source)]
870 874
 
871 875
     def external_gateway_mangle_rules(self, interface_name):
872 876
         mark = self.agent_conf.external_ingress_mark

+ 22
- 0
neutron/agent/linux/iptables_manager.py View File

@@ -38,6 +38,8 @@ from neutron._i18n import _
38 38
 from neutron.agent.linux import ip_lib
39 39
 from neutron.agent.linux import iptables_comments as ic
40 40
 from neutron.agent.linux import utils as linux_utils
41
+from neutron.common import _constants as n_const
42
+from neutron.common import utils
41 43
 from neutron.conf.agent import common as config
42 44
 
43 45
 LOG = logging.getLogger(__name__)
@@ -302,6 +304,9 @@ class IptablesManager(object):
302 304
     # run iptables-restore without it.
303 305
     use_table_lock = False
304 306
 
307
+    # Flag to denote iptables supports --random-fully argument
308
+    _random_fully = None
309
+
305 310
     def __init__(self, _execute=None, state_less=False, use_ipv6=False,
306 311
                  namespace=None, binary_name=binary_name):
307 312
         if _execute:
@@ -473,6 +478,23 @@ class IptablesManager(object):
473 478
             args = ['ip', 'netns', 'exec', self.namespace] + args
474 479
         return self.execute(args, run_as_root=True).split('\n')
475 480
 
481
+    def _get_version(self):
482
+        # Output example is "iptables v1.6.2"
483
+        args = ['iptables', '--version']
484
+        version = str(self.execute(args, run_as_root=True).split()[1][1:])
485
+        LOG.debug("IPTables version installed: %s", version)
486
+        return version
487
+
488
+    @property
489
+    def random_fully(self):
490
+        if self._random_fully is not None:
491
+            return self._random_fully
492
+
493
+        version = self._get_version()
494
+        self.__class__._random_fully = utils.is_version_greater_equal(
495
+            version, n_const.IPTABLES_RANDOM_FULLY_VERSION)
496
+        return self._random_fully
497
+
476 498
     @property
477 499
     def xlock_wait_time(self):
478 500
         # give agent some time to report back to server

+ 4
- 0
neutron/common/_constants.py View File

@@ -56,3 +56,7 @@ AGENT_RES_PROCESSING_STEP = 100
56 56
 # Number of resources for neutron to divide the large RPC
57 57
 # call data sets.
58 58
 RPC_RES_PROCESSING_STEP = 20
59
+
60
+# IPtables version to support --random-fully option.
61
+# Do not move this constant to neutron-lib, since it is temporary
62
+IPTABLES_RANDOM_FULLY_VERSION = '1.6.2'

+ 7
- 0
neutron/common/utils.py View File

@@ -41,6 +41,7 @@ from oslo_config import cfg
41 41
 from oslo_db import exception as db_exc
42 42
 from oslo_log import log as logging
43 43
 from oslo_utils import excutils
44
+import pkg_resources
44 45
 import six
45 46
 
46 47
 import neutron
@@ -312,6 +313,12 @@ def get_socket_address_family(ip_version):
312 313
                 else socket.AF_INET6))
313 314
 
314 315
 
316
+def is_version_greater_equal(version1, version2):
317
+    """Returns True if version1 is greater or equal than version2 else False"""
318
+    return (pkg_resources.parse_version(version1) >=
319
+            pkg_resources.parse_version(version2))
320
+
321
+
315 322
 class DelayedStringRenderer(object):
316 323
     """Takes a callable and its args and calls when __str__ is called
317 324
 

+ 50
- 11
neutron/tests/unit/agent/l3/test_agent.py View File

@@ -18,6 +18,7 @@ from itertools import chain as iter_chain
18 18
 from itertools import combinations as iter_combinations
19 19
 
20 20
 import eventlet
21
+import fixtures
21 22
 import mock
22 23
 import netaddr
23 24
 from neutron_lib.agent import constants as agent_consts
@@ -192,10 +193,28 @@ class BasicRouterOperationsFramework(base.BaseTestCase):
192 193
         ri.process()
193 194
 
194 195
 
196
+class IptablesFixture(fixtures.Fixture):
197
+    def _setUp(self):
198
+        # We MUST save and restore random_fully because it is a class
199
+        # attribute and could change state in some tests, which can cause
200
+        # the other router test cases to randomly fail due to race conditions.
201
+        self.random_fully = iptables_manager.IptablesManager.random_fully
202
+        iptables_manager.IptablesManager.random_fully = True
203
+        self.addCleanup(self._reset)
204
+
205
+    def _reset(self):
206
+        iptables_manager.IptablesManager.random_fully = self.random_fully
207
+
208
+
195 209
 class TestBasicRouterOperations(BasicRouterOperationsFramework):
210
+    def setUp(self):
211
+        super(TestBasicRouterOperations, self).setUp()
212
+        self.useFixture(IptablesFixture())
213
+
196 214
     def test_request_id_changes(self):
197 215
         a = l3_agent.L3NATAgent(HOSTNAME, self.conf)
198 216
         self.assertNotEqual(a.context.request_id, a.context.request_id)
217
+        self.useFixture(IptablesFixture())
199 218
 
200 219
     def test_init_ha_conf(self):
201 220
         with mock.patch('os.path.dirname', return_value='/etc/ha/'):
@@ -1022,7 +1041,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1022 1041
         self._test_external_gateway_action('remove', router, dual_stack=True)
1023 1042
 
1024 1043
     def _verify_snat_mangle_rules(self, nat_rules, mangle_rules, router,
1025
-                                  negate=False):
1044
+                                  random_fully, negate=False):
1026 1045
         interfaces = router[lib_constants.INTERFACE_KEY]
1027 1046
         source_cidrs = []
1028 1047
         for iface in interfaces:
@@ -1033,13 +1052,18 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1033 1052
                 source_cidrs.append(source_cidr)
1034 1053
         source_nat_ip = router['gw_port']['fixed_ips'][0]['ip_address']
1035 1054
         interface_name = ('qg-%s' % router['gw_port']['id'])[:14]
1055
+        mask_rule = ('-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT '
1056
+                     '-j SNAT --to-source %s' %
1057
+                     (lib_constants.ROUTER_MARK_MASK, source_nat_ip))
1058
+        snat_rule = ('-o %s -j SNAT --to-source %s' %
1059
+                     (interface_name, source_nat_ip))
1060
+        if random_fully:
1061
+            mask_rule += ' --random-fully'
1062
+            snat_rule += ' --random-fully'
1036 1063
         expected_rules = [
1037 1064
             '! -i %s ! -o %s -m conntrack ! --ctstate DNAT -j ACCEPT' %
1038 1065
             (interface_name, interface_name),
1039
-            '-o %s -j SNAT --to-source %s' % (interface_name, source_nat_ip),
1040
-            '-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT '
1041
-            '-j SNAT --to-source %s' %
1042
-            (lib_constants.ROUTER_MARK_MASK, source_nat_ip)]
1066
+            mask_rule, snat_rule]
1043 1067
         for r in nat_rules:
1044 1068
             if negate:
1045 1069
                 self.assertNotIn(r.rule, expected_rules)
@@ -1631,7 +1655,8 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1631 1655
         ri.get_external_device_name = mock.Mock(return_value='exgw')
1632 1656
         self._test_process_floating_ip_addresses_add(ri, agent)
1633 1657
 
1634
-    def test_process_router_snat_disabled(self):
1658
+    def _test_process_router_snat_disabled(self, random_fully):
1659
+        iptables_manager.IptablesManager.random_fully = random_fully
1635 1660
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
1636 1661
         router = l3_test_common.prepare_router_data(enable_snat=True)
1637 1662
         ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs)
@@ -1655,10 +1680,17 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1655 1680
             if r not in ri.iptables_manager.ipv4['mangle'].rules]
1656 1681
         self.assertEqual(1, len(mangle_rules_delta))
1657 1682
         self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta,
1658
-                                       router)
1683
+                                       router, random_fully)
1659 1684
         self.assertEqual(1, self.send_adv_notif.call_count)
1660 1685
 
1661
-    def test_process_router_snat_enabled(self):
1686
+    def test_process_router_snat_disabled_random_fully(self):
1687
+        self._test_process_router_snat_disabled(True)
1688
+
1689
+    def test_process_router_snat_disabled_random_fully_false(self):
1690
+        self._test_process_router_snat_disabled(False)
1691
+
1692
+    def _test_process_router_snat_enabled(self, random_fully):
1693
+        iptables_manager.IptablesManager.random_fully = random_fully
1662 1694
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
1663 1695
         router = l3_test_common.prepare_router_data(enable_snat=False)
1664 1696
         ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs)
@@ -1682,9 +1714,15 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1682 1714
             if r not in orig_mangle_rules]
1683 1715
         self.assertEqual(1, len(mangle_rules_delta))
1684 1716
         self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta,
1685
-                                       router)
1717
+                                       router, random_fully)
1686 1718
         self.assertEqual(1, self.send_adv_notif.call_count)
1687 1719
 
1720
+    def test_process_router_snat_enabled_random_fully(self):
1721
+        self._test_process_router_snat_enabled(True)
1722
+
1723
+    def test_process_router_snat_enabled_random_fully_false(self):
1724
+        self._test_process_router_snat_enabled(False)
1725
+
1688 1726
     def _test_update_routing_table(self, is_snat_host=True):
1689 1727
         router = l3_test_common.prepare_router_data()
1690 1728
         uuid = router['id']
@@ -2292,11 +2330,12 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
2292 2330
 
2293 2331
         jump_float_rule = "-A %s-snat -j %s-float-snat" % (wrap_name,
2294 2332
                                                            wrap_name)
2295
-        snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s") % (
2333
+        snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s "
2334
+                      "--random-fully") % (
2296 2335
             wrap_name, ex_gw_port['fixed_ips'][0]['ip_address'])
2297 2336
         snat_rule2 = ("-A %s-snat -m mark ! --mark 0x2/%s "
2298 2337
                       "-m conntrack --ctstate DNAT "
2299
-                      "-j SNAT --to-source %s") % (
2338
+                      "-j SNAT --to-source %s --random-fully") % (
2300 2339
             wrap_name, lib_constants.ROUTER_MARK_MASK,
2301 2340
             ex_gw_port['fixed_ips'][0]['ip_address'])
2302 2341
 

+ 5
- 3
neutron/tests/unit/agent/l3/test_dvr_local_router.py View File

@@ -259,9 +259,11 @@ class TestDvrRouterOperations(base.BaseTestCase):
259 259
         dnat_from_floatingip_to_fixedip = (
260 260
             'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % (
261 261
                 floating_ip, rtr_2_fip_name, fixed_ip))
262
-        snat_from_fixedip_to_floatingip = (
263
-            'float-snat', '-s %s/32 -j SNAT --to-source %s' % (
264
-                fixed_ip, floating_ip))
262
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
263
+
264
+        if ri.iptables_manager.random_fully:
265
+            to_source += ' --random-fully'
266
+        snat_from_fixedip_to_floatingip = ('float-snat', to_source)
265 267
         actual = ri.floating_forward_rules(fip)
266 268
         expected = [dnat_from_floatingip_to_fixedip,
267 269
                     snat_from_fixedip_to_floatingip]

+ 11
- 0
neutron/tests/unit/common/test_utils.py View File

@@ -281,6 +281,17 @@ class TestIpVersionFromInt(base.BaseTestCase):
281 281
                           8)
282 282
 
283 283
 
284
+class TestIsVersionGreaterEqual(base.BaseTestCase):
285
+    def test_is_version_greater_equal_greater(self):
286
+        self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.0'))
287
+
288
+    def test_is_version_greater_equal_equal(self):
289
+        self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.2'))
290
+
291
+    def test_is_version_greater_equal_less(self):
292
+        self.assertFalse(utils.is_version_greater_equal('1.6.0', '1.6.2'))
293
+
294
+
284 295
 class TestDelayedStringRenderer(base.BaseTestCase):
285 296
     def test_call_deferred_until_str(self):
286 297
         my_func = mock.MagicMock(return_value='Brie cheese!')

Loading…
Cancel
Save