From 52322508db5a54219ecbdfc0667a0cfd639b57aa Mon Sep 17 00:00:00 2001 From: Anna Khmelnitsky Date: Wed, 26 Feb 2020 16:36:34 -0800 Subject: [PATCH] Disable keepalive if single backend is configured Keepalive can pose an extra load on the backend, especially when clients spawn multiple processes. In addition, some deployments use an external load balancer with its own monitoring mechanism, in which case nsxlib probing is redundant. This change avoids keepalive probing in case only one backend is configured. If cluster is DOWN, connection will always be retried upon API call. Change-Id: If6b5542f0444f5bb72c0d60e90942a7819c5d72e --- .../tests/unit/v3/nsxlib_testcase.py | 3 ++- vmware_nsxlib/tests/unit/v3/test_cluster.py | 1 + vmware_nsxlib/v3/cluster.py | 27 ++++++++++++++----- vmware_nsxlib/v3/config.py | 13 ++++++++- 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py index f63a1266..27c01fe0 100644 --- a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py +++ b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py @@ -239,7 +239,8 @@ class NsxClientTestCase(NsxLibTestCase): nsx_api_managers=nsx_api_managers or [NSX_MANAGER], plugin_scope=PLUGIN_SCOPE, plugin_tag=PLUGIN_TAG, - plugin_ver=PLUGIN_VER) + plugin_ver=PLUGIN_VER, + cluster_unavailable_retry=True) super(NsxClientTestCase.MockNSXClusteredAPI, self).__init__( nsxlib_config) diff --git a/vmware_nsxlib/tests/unit/v3/test_cluster.py b/vmware_nsxlib/tests/unit/v3/test_cluster.py index 15e4ea5c..0034146f 100644 --- a/vmware_nsxlib/tests/unit/v3/test_cluster.py +++ b/vmware_nsxlib/tests/unit/v3/test_cluster.py @@ -390,6 +390,7 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase): max_attempts = 3 api = self.mock_nsx_clustered_api(nsx_api_managers=conf_managers, max_attempts=max_attempts) + api.nsxlib_config.cluster_unavailable_retry = False api._validate = mock.Mock() eps = 
list(api._endpoints.values()) diff --git a/vmware_nsxlib/v3/cluster.py b/vmware_nsxlib/v3/cluster.py index ccf904ab..d8bc3b27 100644 --- a/vmware_nsxlib/v3/cluster.py +++ b/vmware_nsxlib/v3/cluster.py @@ -457,6 +457,7 @@ class ClusteredAPI(object): self._http_provider = http_provider self._keepalive_interval = keepalive_interval + self._print_keepalive = 0 def _init_cluster(*args, **kwargs): self._init_endpoints(providers, @@ -508,13 +509,18 @@ class ClusteredAPI(object): break eventlet.sleep(0.5) - for endpoint in self._endpoints.values(): - # dynamic loop for each endpoint to ensure connectivity - loop = loopingcall.DynamicLoopingCall( - self._endpoint_keepalive, endpoint) - loop.start(initial_delay=self._keepalive_interval, - periodic_interval_max=self._keepalive_interval, - stop_on_exception=False) + if len(self._endpoints) > 1: + # We don't monitor connectivity when one endpoint is available, + # since there is no alternative to querying this single backend + # If endpoint was down, we can tolerate extra roundtrip to + # validate connectivity + for endpoint in self._endpoints.values(): + # dynamic loop for each endpoint to ensure connectivity + loop = loopingcall.DynamicLoopingCall( + self._endpoint_keepalive, endpoint) + loop.start(initial_delay=self._keepalive_interval, + periodic_interval_max=self._keepalive_interval, + stop_on_exception=False) LOG.debug("Done initializing API endpoint(s). 
" "API cluster health: %s", self.health) @@ -523,6 +529,13 @@ class ClusteredAPI(object): delta = datetime.datetime.now() - endpoint.last_updated if delta.seconds >= self._keepalive_interval: # TODO(boden): backoff on validation failure + if self._print_keepalive % 10 == 0: + # Print keepalive debug message once every 10 probes + LOG.debug("Running keepalive probe for cluster endpoint " + "'%(ep)s' ", + {'ep': endpoint}) + self._print_keepalive += 1 + self._validate(endpoint) return self._keepalive_interval return self._keepalive_interval - delta.seconds diff --git a/vmware_nsxlib/v3/config.py b/vmware_nsxlib/v3/config.py index f0bc0b64..c6ba896e 100644 --- a/vmware_nsxlib/v3/config.py +++ b/vmware_nsxlib/v3/config.py @@ -85,7 +85,11 @@ class NsxLibConfig(object): :param cluster_unavailable_retry: If True, skip fatal errors when no endpoint in the NSX management cluster is available to serve a request, and retry - the request instead. + the request instead. This setting can + not be False if single endpoint is + configured in the cluster, since there + will be no keepalive probes in this + case. -- Additional parameters which are relevant only for the Policy manager: :param allow_passthrough: If True, use nsx manager api for cases which are @@ -152,6 +156,13 @@ class NsxLibConfig(object): self.realization_max_attempts = realization_max_attempts self.realization_wait_sec = realization_wait_sec + if len(nsx_api_managers) == 1 and not self.cluster_unavailable_retry: + LOG.warning("When only one endpoint is provided, keepalive probes " + " are disabled. For the system to be able to recover " + " from DOWN state, cluster_unavailable_retry is set " + " to True, overriding provided configuration") + self.cluster_unavailable_retry = True + if dhcp_profile_uuid: # this is deprecated, and never used. versionutils.report_deprecated_feature(