Disable keepalive if a single backend is configured

Keepalive can put extra load on the backend, especially when
clients spawn multiple processes. In addition, some deployments
use an external load balancer with its own monitoring mechanism,
in which case nsxlib probing is redundant.
This change avoids keepalive probing when only one backend is
configured. If the cluster is DOWN, the connection will be
retried upon each API call.
Change-Id: If6b5542f0444f5bb72c0d60e90942a7819c5d72e
Author: Anna Khmelnitsky 2020-02-26 16:36:34 -08:00
parent 9812774970
commit 52322508db
4 changed files with 35 additions and 9 deletions
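
In outline, the change amounts to two guards, shown in the hunks below.
A minimal standalone sketch (the function names here are illustrative,
not the actual nsxlib code paths):

    def should_monitor(endpoints):
        # Keepalive probing is only useful when there is an alternative
        # endpoint to fail over to; with a single backend there is
        # nothing to switch to, so no monitoring loop is started.
        return len(endpoints) > 1

    def effective_cluster_unavailable_retry(nsx_api_managers, requested):
        # Without probes, a single-endpoint cluster can only recover from
        # DOWN state by retrying on the next API call, so the retry flag
        # is forced on regardless of the requested value.
        if len(nsx_api_managers) == 1:
            return True
        return requested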


@@ -239,7 +239,8 @@ class NsxClientTestCase(NsxLibTestCase):
                 nsx_api_managers=nsx_api_managers or [NSX_MANAGER],
                 plugin_scope=PLUGIN_SCOPE,
                 plugin_tag=PLUGIN_TAG,
-                plugin_ver=PLUGIN_VER)
+                plugin_ver=PLUGIN_VER,
+                cluster_unavailable_retry=True)
             super(NsxClientTestCase.MockNSXClusteredAPI, self).__init__(
                 nsxlib_config)


@@ -390,6 +390,7 @@ class ClusteredAPITestCase(nsxlib_testcase.NsxClientTestCase):
         max_attempts = 3
         api = self.mock_nsx_clustered_api(nsx_api_managers=conf_managers,
                                           max_attempts=max_attempts)
+        api.nsxlib_config.cluster_unavailable_retry = False
         api._validate = mock.Mock()
         eps = list(api._endpoints.values())


@@ -457,6 +457,7 @@ class ClusteredAPI(object):
         self._http_provider = http_provider
         self._keepalive_interval = keepalive_interval
+        self._print_keepalive = 0

         def _init_cluster(*args, **kwargs):
             self._init_endpoints(providers,
@@ -508,6 +509,11 @@ class ClusteredAPI(object):
                 break
             eventlet.sleep(0.5)

-        for endpoint in self._endpoints.values():
-            # dynamic loop for each endpoint to ensure connectivity
-            loop = loopingcall.DynamicLoopingCall(
+        if len(self._endpoints) > 1:
+            # We don't monitor connectivity when only one endpoint is
+            # available, since there is no alternative to querying this
+            # single backend. If the endpoint was down, we can tolerate
+            # an extra roundtrip to validate connectivity.
+            for endpoint in self._endpoints.values():
+                # dynamic loop for each endpoint to ensure connectivity
+                loop = loopingcall.DynamicLoopingCall(
@@ -523,6 +529,13 @@ class ClusteredAPI(object):
         delta = datetime.datetime.now() - endpoint.last_updated
         if delta.seconds >= self._keepalive_interval:
             # TODO(boden): backoff on validation failure
+            if self._print_keepalive % 10 == 0:
+                # Print a keepalive debug message once every 10 probes
+                LOG.debug("Running keepalive probe for cluster endpoint "
+                          "'%(ep)s'",
+                          {'ep': endpoint})
+            self._print_keepalive += 1
             self._validate(endpoint)
             return self._keepalive_interval
         return self._keepalive_interval - delta.seconds
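
For context, DynamicLoopingCall from oslo.service reschedules itself
using the number of seconds returned by its callback, which is how the
hunk above probes each endpoint at most once per keepalive interval
while crediting recent activity. A runnable sketch under that
assumption; Endpoint and validate are simplified stand-ins for the
nsxlib objects, not the library's API:

    import datetime
    import logging

    from oslo_service import loopingcall

    LOG = logging.getLogger(__name__)


    class Endpoint(object):
        """Stand-in for an nsxlib endpoint; tracks its last probe time."""
        def __init__(self, name):
            self.name = name
            self.last_updated = datetime.datetime.now()


    def validate(endpoint):
        # Stand-in for ClusteredAPI._validate: would issue a lightweight
        # request against the backend and mark the endpoint UP or DOWN.
        endpoint.last_updated = datetime.datetime.now()


    def start_keepalive(endpoint, keepalive_interval=30):
        state = {'probes': 0}  # mirrors self._print_keepalive above

        def _probe():
            delta = datetime.datetime.now() - endpoint.last_updated
            if delta.seconds >= keepalive_interval:
                if state['probes'] % 10 == 0:
                    # Throttle: one debug line per 10 probes.
                    LOG.debug("Probing endpoint %s", endpoint.name)
                state['probes'] += 1
                validate(endpoint)
                return keepalive_interval  # sleep a full interval
            # The endpoint was validated recently (e.g. by a regular API
            # call), so only sleep out the remainder of the interval.
            return keepalive_interval - delta.seconds

        loop = loopingcall.DynamicLoopingCall(_probe)
        loop.start(initial_delay=keepalive_interval)
        return loop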


@@ -85,7 +85,11 @@ class NsxLibConfig(object):
     :param cluster_unavailable_retry: If True, skip fatal errors when no
                                       endpoint in the NSX management cluster is
                                       available to serve a request, and retry
-                                      the request instead.
+                                      the request instead. This setting cannot
+                                      be False if a single endpoint is
+                                      configured in the cluster, since there
+                                      will be no keepalive probes in that
+                                      case.

     -- Additional parameters which are relevant only for the Policy manager:
     :param allow_passthrough: If True, use nsx manager api for cases which are
@@ -152,6 +156,13 @@ class NsxLibConfig(object):
         self.realization_max_attempts = realization_max_attempts
         self.realization_wait_sec = realization_wait_sec

+        if len(nsx_api_managers) == 1 and not self.cluster_unavailable_retry:
+            LOG.warning("When only one endpoint is provided, keepalive probes "
+                        "are disabled. For the system to be able to recover "
+                        "from DOWN state, cluster_unavailable_retry is set "
+                        "to True, overriding the provided configuration")
+            self.cluster_unavailable_retry = True
+
         if dhcp_profile_uuid:
             # this is deprecated, and never used.
             versionutils.report_deprecated_feature(
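
A hedged usage sketch of the new constructor guard (the manager address
is made up, and the remaining NsxLibConfig arguments are left at their
defaults):

    from vmware_nsxlib.v3 import config

    nsxlib_config = config.NsxLibConfig(
        nsx_api_managers=['192.168.0.10'],  # hypothetical single backend
        cluster_unavailable_retry=False)    # requested by the caller ...

    # ... but the guard above logs its warning and forces the flag back
    # on, since no keepalive probes will run for a single endpoint:
    assert nsxlib_config.cluster_unavailable_retry is True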