From 52322508db5a54219ecbdfc0667a0cfd639b57aa Mon Sep 17 00:00:00 2001 From: Anna Khmelnitsky Date: Wed, 26 Feb 2020 16:36:34 -0800 Subject: [PATCH] Disable keepalive if single backend is configured Keepalive can pose an extra load on the backend, especially when clients spawn multiple processes. In addition, some deployments use an external load balancer with its own monitoring mechanism, in which case nsxlib probing is redundant. This change avoids keepalive probing in case only one backend is configured. If cluster is DOWN, connection will always be retried upon API call. Change-Id: If6b5542f0444f5bb72c0d60e90942a7819c5d72e --- .../tests/unit/v3/nsxlib_testcase.py | 3 ++- vmware_nsxlib/tests/unit/v3/test_cluster.py | 1 + vmware_nsxlib/v3/cluster.py | 27 ++++++++++++++----- vmware_nsxlib/v3/config.py | 13 ++++++++- 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py index f63a1266..27c01fe0 100644 --- a/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py +++ b/vmware_nsxlib/tests/unit/v3/nsxlib_testcase.py @@ -239,7 +239,8 @@ class NsxClientTestCase(NsxLibTestCase): nsx_api_managers=nsx_api_managers or [NSX_MANAGER], plugin_scope=PLUGIN_SCOPE, plugin_tag=PLUGIN_TAG, - plugin_ver=PLUGIN_VER) + plugin_ver=PLUGIN_VER, + cluster_unavailable_retry=True) super(NsxClientTestCase.MockNSXClusteredAPI, self).__init__( nsxlib_config) diff --git a/vmware_nsxlib/tests/unit/v3/test_cluster.py b/vmware_nsxlib/tests/unit/v3/test_cluster.py index 15e4ea5c..0034146f 100644 --- a/vmware_nsxlib/tests/unit/v3/test_cluster.py +++ b/vmware_nsxlib/tests/unit/v3/test_cluster.py @@ -390,6 +390,7 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase): max_attempts = 3 api = self.mock_nsx_clustered_api(nsx_api_managers=conf_managers, max_attempts=max_attempts) + api.nsxlib_config.cluster_unavailable_retry = False api._validate = mock.Mock() eps = 
list(api._endpoints.values()) diff --git a/vmware_nsxlib/v3/cluster.py b/vmware_nsxlib/v3/cluster.py index ccf904ab..d8bc3b27 100644 --- a/vmware_nsxlib/v3/cluster.py +++ b/vmware_nsxlib/v3/cluster.py @@ -457,6 +457,7 @@ class ClusteredAPI(object): self._http_provider = http_provider self._keepalive_interval = keepalive_interval + self._print_keepalive = 0 def _init_cluster(*args, **kwargs): self._init_endpoints(providers, @@ -508,13 +509,18 @@ class ClusteredAPI(object): break eventlet.sleep(0.5) - for endpoint in self._endpoints.values(): - # dynamic loop for each endpoint to ensure connectivity - loop = loopingcall.DynamicLoopingCall( - self._endpoint_keepalive, endpoint) - loop.start(initial_delay=self._keepalive_interval, - periodic_interval_max=self._keepalive_interval, - stop_on_exception=False) + if len(self._endpoints) > 1: + # We don't monitor connectivity when one endpoint is available, + # since there is no alternative to querying this single backend + # If endpoint was down, we can tolerate extra roundtrip to + # validate connectivity + for endpoint in self._endpoints.values(): + # dynamic loop for each endpoint to ensure connectivity + loop = loopingcall.DynamicLoopingCall( + self._endpoint_keepalive, endpoint) + loop.start(initial_delay=self._keepalive_interval, + periodic_interval_max=self._keepalive_interval, + stop_on_exception=False) LOG.debug("Done initializing API endpoint(s). 
" "API cluster health: %s", self.health) @@ -523,6 +529,13 @@ class ClusteredAPI(object): delta = datetime.datetime.now() - endpoint.last_updated if delta.seconds >= self._keepalive_interval: # TODO(boden): backoff on validation failure + if self._print_keepalive % 10 == 0: + # Print keepalive debug message once every 10 probes + LOG.debug("Running keepalive probe for cluster endpoint " + "'%(ep)s' ", + {'ep': endpoint}) + self._print_keepalive += 1 + self._validate(endpoint) return self._keepalive_interval return self._keepalive_interval - delta.seconds diff --git a/vmware_nsxlib/v3/config.py b/vmware_nsxlib/v3/config.py index f0bc0b64..c6ba896e 100644 --- a/vmware_nsxlib/v3/config.py +++ b/vmware_nsxlib/v3/config.py @@ -85,7 +85,11 @@ class NsxLibConfig(object): :param cluster_unavailable_retry: If True, skip fatal errors when no endpoint in the NSX management cluster is available to serve a request, and retry - the request instead. + the request instead. This setting can + not be False if single endpoint is + configured in the cluster, since there + will be no keepalive probes in this + case. -- Additional parameters which are relevant only for the Policy manager: :param allow_passthrough: If True, use nsx manager api for cases which are @@ -152,6 +156,13 @@ class NsxLibConfig(object): self.realization_max_attempts = realization_max_attempts self.realization_wait_sec = realization_wait_sec + if len(nsx_api_managers) == 1 and not self.cluster_unavailable_retry: + LOG.warning("When only one endpoint is provided, keepalive probes " + " are disabled. For the system to be able to recover " + " from DOWN state, cluster_unavailable_retry is set " + " to True, overriding provided configuration") + self.cluster_unavailable_retry = True + if dhcp_profile_uuid: # this is deprecated, and never used. versionutils.report_deprecated_feature(