Fix backoff mechanism

Right now, the backoff mechanism is broken when the backoff is
set to something non-zero.  Basically, you go into this state where
you retry ad infinitum, leading to inconsistent behavior.

This change fixes the mechanism so that you only get a fixed number
of retries.  You can choose (through a new config parameter) to allow
backoff (or not).

To restore some of the old behavior, the default for the connect_retries
parameter has been increased from 2 to 4, and the max backoff time has
been decreased from 1024 to 512 seconds.  Its unlikely that we'd ever
reach that backoff time without a large number of retries, but 1024
seems too long.

And there is a new exception that is thrown when the connection
fails.  This will result in nice 500 errors in the novajoin-server,
and some log messages for the notifier.

Change-Id: I10547fbde8966c8694346ed8c054e627bee2ee51
This commit is contained in:
Ade Lee 2019-08-14 18:41:10 -04:00
parent 2c8c868a2c
commit 6ed30c9476
5 changed files with 50 additions and 20 deletions

View File

@ -32,9 +32,14 @@ service_opts = [
help='Kerberos client keytab file'),
cfg.StrOpt('domain', default=None,
help='Domain for new hosts'),
cfg.IntOpt('connect_retries', default=2,
cfg.IntOpt('connect_retries', default=4,
help='How many times to attempt to retry '
'the connection to IPA before giving up'),
cfg.IntOpt('connect_backoff', default=0,
help='Initial number of seconds to backoff before '
'retrying the connection to IPA. The backoff '
'time is doubled after each retry. A value '
'of 0 disables backoff'),
cfg.BoolOpt('project_subdomain', default=False,
help='Treat the project as a DNS subdomain '
'so a hostname would take the form: '

View File

@ -153,3 +153,7 @@ class NotificationVersionMismatch(JoinException):
message = ("Provided notification version "
"%(provided_maj)s.%(provided_min)s did not match expected "
"%(expected_maj)s.%(expected_min)s for %(type)s")
class IPAConnectionError(JoinException):
message = "Unable to connect to IPA after %(tries) tries"

View File

@ -41,6 +41,7 @@ if ipalib_imported:
except ImportError:
ipalib_imported = False
from novajoin import exception
from novajoin.util import get_domain
from oslo_config import cfg
from oslo_log import log as logging
@ -54,13 +55,12 @@ LOG = logging.getLogger(__name__)
class IPANovaJoinBase(object):
def __init__(self, backoff=0):
try:
self.ntries = CONF.connect_retries
except cfg.NoSuchOptError:
self.ntries = 1
def __init__(self):
if not ipalib_imported:
return
self.ntries = CONF.connect_retries
self.initial_backoff = CONF.connect_backoff
self.ccache = "MEMORY:" + str(uuid.uuid4())
os.environ['KRB5CCNAME'] = self.ccache
if self._ipa_client_configured() and not api.isdone('finalize'):
@ -70,7 +70,7 @@ class IPANovaJoinBase(object):
api.bootstrap(context='novajoin')
api.finalize()
self.batch_args = list()
self.backoff = backoff
self.backoff = self.initial_backoff
def split_principal(self, principal):
"""Split a principal into its components. Copied from IPA 4.0.0"""
@ -129,16 +129,20 @@ class IPANovaJoinBase(object):
def __backoff(self):
LOG.debug("Backing off %s seconds", self.backoff)
time.sleep(self.backoff)
if self.backoff < 1024:
if self.backoff < 512:
self.backoff = self.backoff * 2
def __reset_backoff(self):
if self.backoff > self.initial_backoff:
LOG.debug("Resetting backoff to %d", self.initial_backoff)
self.backoff = self.initial_backoff
def __get_connection(self):
"""Make a connection to IPA or raise an error."""
tries = 0
while (tries <= self.ntries) or (self.backoff > 0):
if self.backoff == 0:
LOG.debug("Attempt %d of %d", tries, self.ntries)
while (tries <= self.ntries):
LOG.debug("Attempt %d of %d", tries, self.ntries)
if api.Backend.rpcclient.isconnected():
api.Backend.rpcclient.disconnect()
try:
@ -149,6 +153,7 @@ class IPANovaJoinBase(object):
except (errors.CCacheError,
errors.TicketExpired,
errors.KerberosError) as e:
tries += 1
LOG.debug("kinit again: %s", e)
# pylint: disable=no-member
try:
@ -158,9 +163,8 @@ class IPANovaJoinBase(object):
self.ccache)
except GSSError as e:
LOG.debug("kinit failed: %s", e)
if tries > 0 and self.backoff:
self.__backoff()
tries += 1
if self.backoff:
self.__backoff()
except errors.NetworkError:
tries += 1
if self.backoff:
@ -173,8 +177,13 @@ class IPANovaJoinBase(object):
if self.backoff:
self.__backoff()
else:
# successful connection
self.__reset_backoff()
return
LOG.error("Failed to connect to IPA after %d attempts", self.ntries)
raise exception.IPAConnectionError(tries=self.ntries)
def start_batch_operation(self):
"""Start a batch operation.

View File

@ -41,11 +41,9 @@ CONF = config.CONF
LOG = logging.getLogger(__name__)
BACKOFF = 2
def ipaclient():
return IPAClient(backoff=BACKOFF)
return IPAClient()
def novaclient():
@ -194,9 +192,14 @@ class NotificationEndpoint(object):
return
LOG.info("Delete host %s (%s)", instance_id, hostname)
ipa = ipaclient()
ipa.delete_host(hostname, {})
self.delete_subhosts(ipa, hostname_short, payload_metadata)
try:
ipa = ipaclient()
ipa.delete_host(hostname, {})
self.delete_subhosts(ipa, hostname_short, payload_metadata)
except exception.IPAConnectionError:
LOG.error("IPA Connection Error when deleting host %s (%s). "
"Manual cleanup may be required in the IPA server.",
instance_id, hostname)
@event_handlers('network.floating_ip.associate')
def floaitng_ip_associate(self, payload):

View File

@ -0,0 +1,9 @@
---
fixes:
- Fix the retry backoff mechanism. With the current mechanism, if a backoff
is set, we will retry ad infinitum, leading to inconsistent results when
a deletion can happen after the instance as been recreated. With the new
mechanism, retries will occur a maximum number of times, with or without
backoff. A new parameter (connect_backoff, default=0) is added to specify
the initial backoff duration. Also, the default for the parameter for the
number of retries (connect_retries) has been increased from 2 to 4.