From 5d705468de1e495639f8b87266ccfc9391ce6135 Mon Sep 17 00:00:00 2001
From: LIU Yulong <liuyul@chinatelecom.cn>
Date: Thu, 21 Feb 2019 16:39:50 +0800
Subject: [PATCH] Do not call update_device_list in large sets

The ovs-agent can process ports in large sets, and all of
these ports then need their DB status or attributes updated.
But the neutron server is centralized: it may be busy with
other work, or the database processing itself can be
time-consuming. Because of this, the ovs-agent sometimes
gets an RPC timeout exception, and a full sync is triggered
in the next RPC loop. As a result, the restart time keeps
getting longer and longer.

Add a default step size so that update_device_list sends the
ports in smaller batches, reducing the probability of an RPC timeout.

Related-Bug: #1813703
Related-Bug: #1813704
Related-Bug: #1813706
Related-Bug: #1813707

Change-Id: Ie37f4a4869969e235ce16b73cdfcbdc98626823e
(cherry picked from commit 8408af4f173a0ffde354599e26c49bf9e17e8bef)
(cherry picked from commit d7d30ea950844f11348fa2827908622e3a8c7dfb)
---
 neutron/agent/rpc.py                       | 29 +++++++++++++++++++---
 neutron/common/constants.py                |  3 +++
 neutron/tests/unit/plugins/ml2/test_rpc.py | 10 +++++++-
 3 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/neutron/agent/rpc.py b/neutron/agent/rpc.py
index bf76aca0114..130b18e1d2f 100644
--- a/neutron/agent/rpc.py
+++ b/neutron/agent/rpc.py
@@ -151,10 +151,31 @@ class PluginApi(object):
     def update_device_list(self, context, devices_up, devices_down,
                            agent_id, host, agent_restarted=False):
         cctxt = self.client.prepare(version='1.5')
-        return cctxt.call(context, 'update_device_list',
-                          devices_up=devices_up, devices_down=devices_down,
-                          agent_id=agent_id, host=host,
-                          agent_restarted=agent_restarted)
+
+        ret_devices_up = []
+        failed_devices_up = []
+        ret_devices_down = []
+        failed_devices_down = []
+
+        step = n_const.RPC_RES_PROCESSING_STEP
+        devices_up = list(devices_up)
+        devices_down = list(devices_down)
+        for i in range(0, max(len(devices_up), len(devices_down)), step):
+            # Send devices in batches of `step` to avoid RPC timeouts
+            ret = cctxt.call(context, 'update_device_list',
+                             devices_up=devices_up[i:i + step],
+                             devices_down=devices_down[i:i + step],
+                             agent_id=agent_id, host=host,
+                             agent_restarted=agent_restarted)
+            ret_devices_up.extend(ret.get("devices_up", []))
+            failed_devices_up.extend(ret.get("failed_devices_up", []))
+            ret_devices_down.extend(ret.get("devices_down", []))
+            failed_devices_down.extend(ret.get("failed_devices_down", []))
+
+        return {'devices_up': ret_devices_up,
+                'failed_devices_up': failed_devices_up,
+                'devices_down': ret_devices_down,
+                'failed_devices_down': failed_devices_down}
 
     def tunnel_sync(self, context, tunnel_ip, tunnel_type=None, host=None):
         cctxt = self.client.prepare(version='1.4')
diff --git a/neutron/common/constants.py b/neutron/common/constants.py
index 8b98be69a78..07ca7caa905 100644
--- a/neutron/common/constants.py
+++ b/neutron/common/constants.py
@@ -265,3 +265,6 @@ EXT_PARENT_PREFIX = 'ext_parent'
 # for the restart success rate.
 # [1] http://paste.openstack.org/show/745685/
 AGENT_RES_PROCESSING_STEP = 100
+# Number of resources per chunk when neutron splits the data set
+# of a large RPC call into several smaller calls.
+RPC_RES_PROCESSING_STEP = 20
diff --git a/neutron/tests/unit/plugins/ml2/test_rpc.py b/neutron/tests/unit/plugins/ml2/test_rpc.py
index 850b29db305..48f5cedf05f 100644
--- a/neutron/tests/unit/plugins/ml2/test_rpc.py
+++ b/neutron/tests/unit/plugins/ml2/test_rpc.py
@@ -327,9 +327,17 @@ class RpcCallbacksTestCase(base.BaseTestCase):
 class RpcApiTestCase(base.BaseTestCase):
 
     def _test_rpc_api(self, rpcapi, topic, method, rpc_method, **kwargs):
+        if method == "update_device_list":
+            expected = {'devices_up': [],
+                        'failed_devices_up': [],
+                        'devices_down': [],
+                        'failed_devices_down': []}
+        else:
+            expected = 'foo'
+
         ctxt = oslo_context.RequestContext(user_id='fake_user',
                                            tenant='fake_project')
-        expected_retval = 'foo' if rpc_method == 'call' else None
+        expected_retval = expected if rpc_method == 'call' else None
         expected_version = kwargs.pop('version', None)
         fanout = kwargs.pop('fanout', False)