Browse Source

Packets getting lost during SNAT with too many connections

We have a problem with SNAT with too many connections using the
same source and destination on the network nodes.

In addition, we can see in the conntrack table that the
"insert_failed" counter increases.

This might be a generic problem with conntrack and linux.
We suspect that we are running into a known "limitation / bug"
in the kernel.

There seems to be a workaround to alleviate this behavior by
setting the --random-fully flag in iptables for port consumption.

This patch fixes the problem by adding the --random-fully flag to
the SNAT rules when the installed iptables version supports it.

Conflicts:
    neutron/agent/linux/iptables_manager.py
    neutron/common/constants.py
    neutron/tests/unit/agent/l3/test_agent.py

Change-Id: I246c1f56df889bad9c7e140b56c3614124d80a19
Closes-Bug: #1814002
(cherry picked from commit 30f35e08f9)
changes/90/655790/2
Swaminathan Vasudevan 4 months ago
parent
commit
eded5d2d6a

+ 4
- 2
neutron/agent/l3/dvr_edge_router.py View File

@@ -341,12 +341,14 @@ class DvrEdgeRouter(dvr_local_router.DvrLocalRouter):
341 341
         return lib_constants.FLOATINGIP_STATUS_ACTIVE
342 342
 
343 343
     def _centralized_floating_forward_rules(self, floating_ip, fixed_ip):
344
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
345
+        if self.snat_iptables_manager.random_fully:
346
+            to_source += ' --random-fully'
344 347
         return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' %
345 348
                  (floating_ip, fixed_ip)),
346 349
                 ('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' %
347 350
                  (floating_ip, fixed_ip)),
348
-                ('float-snat', '-s %s/32 -j SNAT --to-source %s' %
349
-                 (fixed_ip, floating_ip))]
351
+                ('float-snat', to_source)]
350 352
 
351 353
     def _set_floating_ip_nat_rules_for_centralized_floatingip(self, fip):
352 354
         if fip.get(lib_constants.DVR_SNAT_BOUND):

+ 4
- 3
neutron/agent/l3/dvr_local_router.py View File

@@ -75,9 +75,10 @@ class DvrLocalRouter(dvr_router_base.DvrRouterBase):
75 75
         dnat_from_floatingip_to_fixedip = (
76 76
             'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % (
77 77
                 floating_ip, rtr_2_fip_name, fixed_ip))
78
-        snat_from_fixedip_to_floatingip = (
79
-            'float-snat', '-s %s/32 -j SNAT --to-source %s' % (
80
-                fixed_ip, floating_ip))
78
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
79
+        if self.iptables_manager.random_fully:
80
+            to_source += ' --random-fully'
81
+        snat_from_fixedip_to_floatingip = ('float-snat', to_source)
81 82
         return [dnat_from_floatingip_to_fixedip,
82 83
                 snat_from_fixedip_to_floatingip]
83 84
 

+ 15
- 11
neutron/agent/l3/router_info.py View File

@@ -173,12 +173,14 @@ class RouterInfo(object):
173 173
     def floating_forward_rules(self, fip):
174 174
         fixed_ip = fip['fixed_ip_address']
175 175
         floating_ip = fip['floating_ip_address']
176
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
177
+        if self.iptables_manager.random_fully:
178
+            to_source += ' --random-fully'
176 179
         return [('PREROUTING', '-d %s/32 -j DNAT --to-destination %s' %
177 180
                  (floating_ip, fixed_ip)),
178 181
                 ('OUTPUT', '-d %s/32 -j DNAT --to-destination %s' %
179 182
                  (floating_ip, fixed_ip)),
180
-                ('float-snat', '-s %s/32 -j SNAT --to-source %s' %
181
-                 (fixed_ip, floating_ip))]
183
+                ('float-snat', to_source)]
182 184
 
183 185
     def floating_mangle_rules(self, floating_ip, fixed_ip, internal_mark):
184 186
         mark_traffic_to_floating_ip = (
@@ -854,19 +856,21 @@ class RouterInfo(object):
854 856
             self._prevent_snat_for_internal_traffic_rule(interface_name))
855 857
         # Makes replies come back through the router to reverse DNAT
856 858
         ext_in_mark = self.agent_conf.external_ingress_mark
857
-        snat_internal_traffic_to_floating_ip = (
858
-            'snat', '-m mark ! --mark %s/%s '
859
-                    '-m conntrack --ctstate DNAT '
860
-                    '-j SNAT --to-source %s'
861
-                    % (ext_in_mark, n_const.ROUTER_MARK_MASK, ex_gw_ip))
859
+        to_source = ('-m mark ! --mark %s/%s '
860
+                     '-m conntrack --ctstate DNAT '
861
+                     '-j SNAT --to-source %s'
862
+                     % (ext_in_mark, lib_constants.ROUTER_MARK_MASK, ex_gw_ip))
863
+        if self.iptables_manager.random_fully:
864
+            to_source += ' --random-fully'
865
+        snat_internal_traffic_to_floating_ip = ('snat', to_source)
862 866
         return [dont_snat_traffic_to_internal_ports_if_not_to_floating_ip,
863 867
                 snat_internal_traffic_to_floating_ip]
864 868
 
865 869
     def external_gateway_nat_snat_rules(self, ex_gw_ip, interface_name):
866
-        snat_normal_external_traffic = (
867
-            'snat', '-o %s -j SNAT --to-source %s' %
868
-                    (interface_name, ex_gw_ip))
869
-        return [snat_normal_external_traffic]
870
+        to_source = '-o %s -j SNAT --to-source %s' % (interface_name, ex_gw_ip)
871
+        if self.iptables_manager.random_fully:
872
+            to_source += ' --random-fully'
873
+        return [('snat', to_source)]
870 874
 
871 875
     def external_gateway_mangle_rules(self, interface_name):
872 876
         mark = self.agent_conf.external_ingress_mark

+ 21
- 0
neutron/agent/linux/iptables_manager.py View File

@@ -38,6 +38,7 @@ from neutron.agent.linux import ip_lib
38 38
 from neutron.agent.linux import iptables_comments as ic
39 39
 from neutron.agent.linux import utils as linux_utils
40 40
 from neutron.common import constants
41
+from neutron.common import utils
41 42
 from neutron.conf.agent import common as config
42 43
 
43 44
 LOG = logging.getLogger(__name__)
@@ -302,6 +303,9 @@ class IptablesManager(object):
302 303
     # run iptables-restore without it.
303 304
     use_table_lock = False
304 305
 
306
+    # Flag to denote iptables supports --random-fully argument
307
+    _random_fully = None
308
+
305 309
     def __init__(self, _execute=None, state_less=False, use_ipv6=False,
306 310
                  namespace=None, binary_name=binary_name):
307 311
         if _execute:
@@ -473,6 +477,23 @@ class IptablesManager(object):
473 477
             args = ['ip', 'netns', 'exec', self.namespace] + args
474 478
         return self.execute(args, run_as_root=True).split('\n')
475 479
 
480
+    def _get_version(self):
481
+        # Output example is "iptables v1.6.2"
482
+        args = ['iptables', '--version']
483
+        version = str(self.execute(args, run_as_root=True).split()[1][1:])
484
+        LOG.debug("IPTables version installed: %s", version)
485
+        return version
486
+
487
+    @property
488
+    def random_fully(self):
489
+        if self._random_fully is not None:
490
+            return self._random_fully
491
+
492
+        version = self._get_version()
493
+        self.__class__._random_fully = utils.is_version_greater_equal(
494
+            version, constants.IPTABLES_RANDOM_FULLY_VERSION)
495
+        return self._random_fully
496
+
476 497
     @property
477 498
     def xlock_wait_time(self):
478 499
         # give agent some time to report back to server

+ 4
- 0
neutron/common/constants.py View File

@@ -290,3 +290,7 @@ AGENT_RES_PROCESSING_STEP = 100
290 290
 # Number of resources for neutron to divide the large RPC
291 291
 # call data sets.
292 292
 RPC_RES_PROCESSING_STEP = 20
293
+
294
+# IPtables version to support --random-fully option.
295
+# Do not move this constant to neutron-lib, since it is temporary
296
+IPTABLES_RANDOM_FULLY_VERSION = '1.6.2'

+ 7
- 0
neutron/common/utils.py View File

@@ -41,6 +41,7 @@ from oslo_config import cfg
41 41
 from oslo_db import exception as db_exc
42 42
 from oslo_log import log as logging
43 43
 from oslo_utils import excutils
44
+import pkg_resources
44 45
 import six
45 46
 
46 47
 import neutron
@@ -312,6 +313,12 @@ def get_socket_address_family(ip_version):
312 313
                 else socket.AF_INET6))
313 314
 
314 315
 
316
+def is_version_greater_equal(version1, version2):
317
+    """Returns True if version1 is greater or equal than version2 else False"""
318
+    return (pkg_resources.parse_version(version1) >=
319
+            pkg_resources.parse_version(version2))
320
+
321
+
315 322
 class DelayedStringRenderer(object):
316 323
     """Takes a callable and its args and calls when __str__ is called
317 324
 

+ 50
- 11
neutron/tests/unit/agent/l3/test_agent.py View File

@@ -18,6 +18,7 @@ from itertools import chain as iter_chain
18 18
 from itertools import combinations as iter_combinations
19 19
 
20 20
 import eventlet
21
+import fixtures
21 22
 import mock
22 23
 import netaddr
23 24
 from neutron_lib.agent import constants as agent_consts
@@ -193,10 +194,28 @@ class BasicRouterOperationsFramework(base.BaseTestCase):
193 194
         ri.process()
194 195
 
195 196
 
197
+class IptablesFixture(fixtures.Fixture):
198
+    def _setUp(self):
199
+        # We MUST save and restore random_fully because it is a class
200
+        # attribute and could change state in some tests, which can cause
201
+        # the other router test cases to randomly fail due to race conditions.
202
+        self.random_fully = iptables_manager.IptablesManager.random_fully
203
+        iptables_manager.IptablesManager.random_fully = True
204
+        self.addCleanup(self._reset)
205
+
206
+    def _reset(self):
207
+        iptables_manager.IptablesManager.random_fully = self.random_fully
208
+
209
+
196 210
 class TestBasicRouterOperations(BasicRouterOperationsFramework):
211
+    def setUp(self):
212
+        super(TestBasicRouterOperations, self).setUp()
213
+        self.useFixture(IptablesFixture())
214
+
197 215
     def test_request_id_changes(self):
198 216
         a = l3_agent.L3NATAgent(HOSTNAME, self.conf)
199 217
         self.assertNotEqual(a.context.request_id, a.context.request_id)
218
+        self.useFixture(IptablesFixture())
200 219
 
201 220
     def test_init_ha_conf(self):
202 221
         with mock.patch('os.path.dirname', return_value='/etc/ha/'):
@@ -1023,7 +1042,7 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1023 1042
         self._test_external_gateway_action('remove', router, dual_stack=True)
1024 1043
 
1025 1044
     def _verify_snat_mangle_rules(self, nat_rules, mangle_rules, router,
1026
-                                  negate=False):
1045
+                                  random_fully, negate=False):
1027 1046
         interfaces = router[lib_constants.INTERFACE_KEY]
1028 1047
         source_cidrs = []
1029 1048
         for iface in interfaces:
@@ -1034,13 +1053,18 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1034 1053
                 source_cidrs.append(source_cidr)
1035 1054
         source_nat_ip = router['gw_port']['fixed_ips'][0]['ip_address']
1036 1055
         interface_name = ('qg-%s' % router['gw_port']['id'])[:14]
1056
+        mask_rule = ('-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT '
1057
+                     '-j SNAT --to-source %s' %
1058
+                     (lib_constants.ROUTER_MARK_MASK, source_nat_ip))
1059
+        snat_rule = ('-o %s -j SNAT --to-source %s' %
1060
+                     (interface_name, source_nat_ip))
1061
+        if random_fully:
1062
+            mask_rule += ' --random-fully'
1063
+            snat_rule += ' --random-fully'
1037 1064
         expected_rules = [
1038 1065
             '! -i %s ! -o %s -m conntrack ! --ctstate DNAT -j ACCEPT' %
1039 1066
             (interface_name, interface_name),
1040
-            '-o %s -j SNAT --to-source %s' % (interface_name, source_nat_ip),
1041
-            '-m mark ! --mark 0x2/%s -m conntrack --ctstate DNAT '
1042
-            '-j SNAT --to-source %s' %
1043
-            (n_const.ROUTER_MARK_MASK, source_nat_ip)]
1067
+            mask_rule, snat_rule]
1044 1068
         for r in nat_rules:
1045 1069
             if negate:
1046 1070
                 self.assertNotIn(r.rule, expected_rules)
@@ -1632,7 +1656,8 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1632 1656
         ri.get_external_device_name = mock.Mock(return_value='exgw')
1633 1657
         self._test_process_floating_ip_addresses_add(ri, agent)
1634 1658
 
1635
-    def test_process_router_snat_disabled(self):
1659
+    def _test_process_router_snat_disabled(self, random_fully):
1660
+        iptables_manager.IptablesManager.random_fully = random_fully
1636 1661
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
1637 1662
         router = l3_test_common.prepare_router_data(enable_snat=True)
1638 1663
         ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs)
@@ -1656,10 +1681,17 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1656 1681
             if r not in ri.iptables_manager.ipv4['mangle'].rules]
1657 1682
         self.assertEqual(1, len(mangle_rules_delta))
1658 1683
         self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta,
1659
-                                       router)
1684
+                                       router, random_fully)
1660 1685
         self.assertEqual(1, self.send_adv_notif.call_count)
1661 1686
 
1662
-    def test_process_router_snat_enabled(self):
1687
+    def test_process_router_snat_disabled_random_fully(self):
1688
+        self._test_process_router_snat_disabled(True)
1689
+
1690
+    def test_process_router_snat_disabled_random_fully_false(self):
1691
+        self._test_process_router_snat_disabled(False)
1692
+
1693
+    def _test_process_router_snat_enabled(self, random_fully):
1694
+        iptables_manager.IptablesManager.random_fully = random_fully
1663 1695
         agent = l3_agent.L3NATAgent(HOSTNAME, self.conf)
1664 1696
         router = l3_test_common.prepare_router_data(enable_snat=False)
1665 1697
         ri = l3router.RouterInfo(agent, router['id'], router, **self.ri_kwargs)
@@ -1683,9 +1715,15 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
1683 1715
             if r not in orig_mangle_rules]
1684 1716
         self.assertEqual(1, len(mangle_rules_delta))
1685 1717
         self._verify_snat_mangle_rules(nat_rules_delta, mangle_rules_delta,
1686
-                                       router)
1718
+                                       router, random_fully)
1687 1719
         self.assertEqual(1, self.send_adv_notif.call_count)
1688 1720
 
1721
+    def test_process_router_snat_enabled_random_fully(self):
1722
+        self._test_process_router_snat_enabled(True)
1723
+
1724
+    def test_process_router_snat_enabled_random_fully_false(self):
1725
+        self._test_process_router_snat_enabled(False)
1726
+
1689 1727
     def _test_update_routing_table(self, is_snat_host=True):
1690 1728
         router = l3_test_common.prepare_router_data()
1691 1729
         uuid = router['id']
@@ -2293,11 +2331,12 @@ class TestBasicRouterOperations(BasicRouterOperationsFramework):
2293 2331
 
2294 2332
         jump_float_rule = "-A %s-snat -j %s-float-snat" % (wrap_name,
2295 2333
                                                            wrap_name)
2296
-        snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s") % (
2334
+        snat_rule1 = ("-A %s-snat -o iface -j SNAT --to-source %s "
2335
+                      "--random-fully") % (
2297 2336
             wrap_name, ex_gw_port['fixed_ips'][0]['ip_address'])
2298 2337
         snat_rule2 = ("-A %s-snat -m mark ! --mark 0x2/%s "
2299 2338
                       "-m conntrack --ctstate DNAT "
2300
-                      "-j SNAT --to-source %s") % (
2339
+                      "-j SNAT --to-source %s --random-fully") % (
2301 2340
             wrap_name, n_const.ROUTER_MARK_MASK,
2302 2341
             ex_gw_port['fixed_ips'][0]['ip_address'])
2303 2342
 

+ 5
- 3
neutron/tests/unit/agent/l3/test_dvr_local_router.py View File

@@ -260,9 +260,11 @@ class TestDvrRouterOperations(base.BaseTestCase):
260 260
         dnat_from_floatingip_to_fixedip = (
261 261
             'PREROUTING', '-d %s/32 -i %s -j DNAT --to-destination %s' % (
262 262
                 floating_ip, rtr_2_fip_name, fixed_ip))
263
-        snat_from_fixedip_to_floatingip = (
264
-            'float-snat', '-s %s/32 -j SNAT --to-source %s' % (
265
-                fixed_ip, floating_ip))
263
+        to_source = '-s %s/32 -j SNAT --to-source %s' % (fixed_ip, floating_ip)
264
+
265
+        if ri.iptables_manager.random_fully:
266
+            to_source += ' --random-fully'
267
+        snat_from_fixedip_to_floatingip = ('float-snat', to_source)
266 268
         actual = ri.floating_forward_rules(fip)
267 269
         expected = [dnat_from_floatingip_to_fixedip,
268 270
                     snat_from_fixedip_to_floatingip]

+ 11
- 0
neutron/tests/unit/common/test_utils.py View File

@@ -282,6 +282,17 @@ class TestIpVersionFromInt(base.BaseTestCase):
282 282
                           8)
283 283
 
284 284
 
285
+class TestIsVersionGreaterEqual(base.BaseTestCase):
286
+    def test_is_version_greater_equal_greater(self):
287
+        self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.0'))
288
+
289
+    def test_is_version_greater_equal_equal(self):
290
+        self.assertTrue(utils.is_version_greater_equal('1.6.2', '1.6.2'))
291
+
292
+    def test_is_version_greater_equal_less(self):
293
+        self.assertFalse(utils.is_version_greater_equal('1.6.0', '1.6.2'))
294
+
295
+
285 296
 class TestDelayedStringRenderer(base.BaseTestCase):
286 297
     def test_call_deferred_until_str(self):
287 298
         my_func = mock.MagicMock(return_value='Brie cheese!')

Loading…
Cancel
Save