Merge "Disable keepalive if single backend is configured"

2020-02-29 20:31:00 +00:00 · 2020-02-29 20:31:00 +00:00 · 505996439f
parent a4451a7fe0 52322508db
commit 505996439f
4 changed files with 35 additions and 9 deletions
--- a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py
+++ b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py
@ -239,7 +239,8 @@ class NsxClientTestCase(NsxLibTestCase):
                nsx_api_managers=nsx_api_managers or [NSX_MANAGER],
                plugin_scope=PLUGIN_SCOPE,
                plugin_tag=PLUGIN_TAG,
-                plugin_ver=PLUGIN_VER)
+                plugin_ver=PLUGIN_VER,
+                cluster_unavailable_retry=True)

            super(NsxClientTestCase.MockNSXClusteredAPI, self).__init__(
                nsxlib_config)
--- a/vmware_nsxlib/tests/unit/v3/test_cluster.py
+++ b/vmware_nsxlib/tests/unit/v3/test_cluster.py
@ -392,6 +392,7 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase):
        max_attempts = 3
        api = self.mock_nsx_clustered_api(nsx_api_managers=conf_managers,
                                          max_attempts=max_attempts)
+        api.nsxlib_config.cluster_unavailable_retry = False
        api._validate = mock.Mock()
        eps = list(api._endpoints.values())

--- a/vmware_nsxlib/v3/cluster.py
+++ b/vmware_nsxlib/v3/cluster.py
@ -460,6 +460,7 @@ class ClusteredAPI(object):

        self._http_provider = http_provider
        self._keepalive_interval = keepalive_interval
+        self._print_keepalive = 0

        def _init_cluster(*args, **kwargs):
            self._init_endpoints(providers,
@ -511,13 +512,18 @@ class ClusteredAPI(object):
                break
            eventlet.sleep(0.5)

-        for endpoint in self._endpoints.values():
-            # dynamic loop for each endpoint to ensure connectivity
-            loop = loopingcall.DynamicLoopingCall(
-                self._endpoint_keepalive, endpoint)
-            loop.start(initial_delay=self._keepalive_interval,
-                       periodic_interval_max=self._keepalive_interval,
-                       stop_on_exception=False)
+        if len(self._endpoints) > 1:
+            # We don't monitor connectivity when only one endpoint is
+            # configured, since there is no alternative to querying this
+            # single backend. If the endpoint is down, we can tolerate an
+            # extra roundtrip to validate connectivity.
+            for endpoint in self._endpoints.values():
+                # dynamic loop for each endpoint to ensure connectivity
+                loop = loopingcall.DynamicLoopingCall(
+                    self._endpoint_keepalive, endpoint)
+                loop.start(initial_delay=self._keepalive_interval,
+                           periodic_interval_max=self._keepalive_interval,
+                           stop_on_exception=False)

        LOG.debug("Done initializing API endpoint(s). "
                  "API cluster health: %s", self.health)
@ -526,6 +532,13 @@ class ClusteredAPI(object):
        delta = datetime.datetime.now() - endpoint.last_updated
        if delta.seconds >= self._keepalive_interval:
            # TODO(boden): backoff on validation failure
+            if self._print_keepalive % 10 == 0:
+                # Print keepalive debug message once every 10 probes
+                LOG.debug("Running keepalive probe for cluster endpoint "
+                          "'%(ep)s' ",
+                          {'ep': endpoint})
+            self._print_keepalive += 1
+
            self._validate(endpoint)
            return self._keepalive_interval
        return self._keepalive_interval - delta.seconds
--- a/vmware_nsxlib/v3/config.py
+++ b/vmware_nsxlib/v3/config.py
@ -85,7 +85,11 @@ class NsxLibConfig(object):
    :param cluster_unavailable_retry: If True, skip fatal errors when no
                                      endpoint in the NSX management cluster is
                                      available to serve a request, and retry
-                                      the request instead.
+                                      the request instead. This setting
+                                      cannot be False if a single endpoint
+                                      is configured in the cluster, since
+                                      there will be no keepalive probes in
+                                      this case.

    -- Additional parameters which are relevant only for the Policy manager:
    :param allow_passthrough: If True, use nsx manager api for cases which are
@ -152,6 +156,13 @@ class NsxLibConfig(object):
        self.realization_max_attempts = realization_max_attempts
        self.realization_wait_sec = realization_wait_sec

+        if len(nsx_api_managers) == 1 and not self.cluster_unavailable_retry:
+            LOG.warning("When only one endpoint is provided, keepalive probes "
+                        " are disabled. For the system to be able to recover "
+                        " from DOWN state, cluster_unavailable_retry is set "
+                        " to True, overriding provided configuration")
+            self.cluster_unavailable_retry = True
+
        if dhcp_profile_uuid:
            # this is deprecated, and never used.
            versionutils.report_deprecated_feature(