Fix backoff mechanism
Right now, the backoff mechanism is broken when the backoff is set to something non-zero. Basically, you go into this state where you retry ad infinitum, leading to inconsistent behavior. This change fixes the mechanism so that you only get a fixed number of retries. You can choose (through a new config parameter) to allow backoff (or not). To restore some of the old behavior, the default for the connect_retries parameter has been increased from 2 to 4, and the max backoff time has been decreased from 1024 to 512 seconds. Its unlikely that we'd ever reach that backoff time without a large number of retries, but 1024 seems too long. And there is a new exception that is thrown when the connection fails. This will result in nice 500 errors in the novajoin-server, and some log messages for the notifier. Change-Id: I10547fbde8966c8694346ed8c054e627bee2ee51
This commit is contained in:
parent
2c8c868a2c
commit
6ed30c9476
@ -32,9 +32,14 @@ service_opts = [
|
||||
help='Kerberos client keytab file'),
|
||||
cfg.StrOpt('domain', default=None,
|
||||
help='Domain for new hosts'),
|
||||
cfg.IntOpt('connect_retries', default=2,
|
||||
cfg.IntOpt('connect_retries', default=4,
|
||||
help='How many times to attempt to retry '
|
||||
'the connection to IPA before giving up'),
|
||||
cfg.IntOpt('connect_backoff', default=0,
|
||||
help='Initial number of seconds to backoff before '
|
||||
'retrying the connection to IPA. The backoff '
|
||||
'time is doubled after each retry. A value '
|
||||
'of 0 disables backoff'),
|
||||
cfg.BoolOpt('project_subdomain', default=False,
|
||||
help='Treat the project as a DNS subdomain '
|
||||
'so a hostname would take the form: '
|
||||
|
@ -153,3 +153,7 @@ class NotificationVersionMismatch(JoinException):
|
||||
message = ("Provided notification version "
|
||||
"%(provided_maj)s.%(provided_min)s did not match expected "
|
||||
"%(expected_maj)s.%(expected_min)s for %(type)s")
|
||||
|
||||
|
||||
class IPAConnectionError(JoinException):
|
||||
message = "Unable to connect to IPA after %(tries) tries"
|
||||
|
@ -41,6 +41,7 @@ if ipalib_imported:
|
||||
except ImportError:
|
||||
ipalib_imported = False
|
||||
|
||||
from novajoin import exception
|
||||
from novajoin.util import get_domain
|
||||
from oslo_config import cfg
|
||||
from oslo_log import log as logging
|
||||
@ -54,13 +55,12 @@ LOG = logging.getLogger(__name__)
|
||||
|
||||
class IPANovaJoinBase(object):
|
||||
|
||||
def __init__(self, backoff=0):
|
||||
try:
|
||||
self.ntries = CONF.connect_retries
|
||||
except cfg.NoSuchOptError:
|
||||
self.ntries = 1
|
||||
def __init__(self):
|
||||
if not ipalib_imported:
|
||||
return
|
||||
|
||||
self.ntries = CONF.connect_retries
|
||||
self.initial_backoff = CONF.connect_backoff
|
||||
self.ccache = "MEMORY:" + str(uuid.uuid4())
|
||||
os.environ['KRB5CCNAME'] = self.ccache
|
||||
if self._ipa_client_configured() and not api.isdone('finalize'):
|
||||
@ -70,7 +70,7 @@ class IPANovaJoinBase(object):
|
||||
api.bootstrap(context='novajoin')
|
||||
api.finalize()
|
||||
self.batch_args = list()
|
||||
self.backoff = backoff
|
||||
self.backoff = self.initial_backoff
|
||||
|
||||
def split_principal(self, principal):
|
||||
"""Split a principal into its components. Copied from IPA 4.0.0"""
|
||||
@ -129,15 +129,19 @@ class IPANovaJoinBase(object):
|
||||
def __backoff(self):
|
||||
LOG.debug("Backing off %s seconds", self.backoff)
|
||||
time.sleep(self.backoff)
|
||||
if self.backoff < 1024:
|
||||
if self.backoff < 512:
|
||||
self.backoff = self.backoff * 2
|
||||
|
||||
def __reset_backoff(self):
|
||||
if self.backoff > self.initial_backoff:
|
||||
LOG.debug("Resetting backoff to %d", self.initial_backoff)
|
||||
self.backoff = self.initial_backoff
|
||||
|
||||
def __get_connection(self):
|
||||
"""Make a connection to IPA or raise an error."""
|
||||
tries = 0
|
||||
|
||||
while (tries <= self.ntries) or (self.backoff > 0):
|
||||
if self.backoff == 0:
|
||||
while (tries <= self.ntries):
|
||||
LOG.debug("Attempt %d of %d", tries, self.ntries)
|
||||
if api.Backend.rpcclient.isconnected():
|
||||
api.Backend.rpcclient.disconnect()
|
||||
@ -149,6 +153,7 @@ class IPANovaJoinBase(object):
|
||||
except (errors.CCacheError,
|
||||
errors.TicketExpired,
|
||||
errors.KerberosError) as e:
|
||||
tries += 1
|
||||
LOG.debug("kinit again: %s", e)
|
||||
# pylint: disable=no-member
|
||||
try:
|
||||
@ -158,9 +163,8 @@ class IPANovaJoinBase(object):
|
||||
self.ccache)
|
||||
except GSSError as e:
|
||||
LOG.debug("kinit failed: %s", e)
|
||||
if tries > 0 and self.backoff:
|
||||
if self.backoff:
|
||||
self.__backoff()
|
||||
tries += 1
|
||||
except errors.NetworkError:
|
||||
tries += 1
|
||||
if self.backoff:
|
||||
@ -173,8 +177,13 @@ class IPANovaJoinBase(object):
|
||||
if self.backoff:
|
||||
self.__backoff()
|
||||
else:
|
||||
# successful connection
|
||||
self.__reset_backoff()
|
||||
return
|
||||
|
||||
LOG.error("Failed to connect to IPA after %d attempts", self.ntries)
|
||||
raise exception.IPAConnectionError(tries=self.ntries)
|
||||
|
||||
def start_batch_operation(self):
|
||||
"""Start a batch operation.
|
||||
|
||||
|
@ -41,11 +41,9 @@ CONF = config.CONF
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
BACKOFF = 2
|
||||
|
||||
|
||||
def ipaclient():
|
||||
return IPAClient(backoff=BACKOFF)
|
||||
return IPAClient()
|
||||
|
||||
|
||||
def novaclient():
|
||||
@ -194,9 +192,14 @@ class NotificationEndpoint(object):
|
||||
return
|
||||
|
||||
LOG.info("Delete host %s (%s)", instance_id, hostname)
|
||||
try:
|
||||
ipa = ipaclient()
|
||||
ipa.delete_host(hostname, {})
|
||||
self.delete_subhosts(ipa, hostname_short, payload_metadata)
|
||||
except exception.IPAConnectionError:
|
||||
LOG.error("IPA Connection Error when deleting host %s (%s). "
|
||||
"Manual cleanup may be required in the IPA server.",
|
||||
instance_id, hostname)
|
||||
|
||||
@event_handlers('network.floating_ip.associate')
|
||||
def floaitng_ip_associate(self, payload):
|
||||
|
@ -0,0 +1,9 @@
|
||||
---
|
||||
fixes:
|
||||
- Fix the retry backoff mechanism. With the current mechanism, if a backoff
|
||||
is set, we will retry ad infinitum, leading to inconsistent results when
|
||||
a deletion can happen after the instance as been recreated. With the new
|
||||
mechanism, retries will occur a maximum number of times, with or without
|
||||
backoff. A new parameter (connect_backoff, default=0) is added to specify
|
||||
the initial backoff duration. Also, the default for the parameter for the
|
||||
number of retries (connect_retries) has been increased from 2 to 4.
|
Loading…
Reference in New Issue
Block a user