Fix backoff mechanism
Right now, the backoff mechanism is broken when the backoff is set to something non-zero. Basically, you go into this state where you retry ad infinitum, leading to inconsistent behavior. This change fixes the mechanism so that you only get a fixed number of retries. You can choose (through a new config parameter) to allow backoff (or not). To restore some of the old behavior, the default for the connect_retries parameter has been increased from 2 to 4, and the max backoff time has been decreased from 1024 to 512 seconds. Its unlikely that we'd ever reach that backoff time without a large number of retries, but 1024 seems too long. And there is a new exception that is thrown when the connection fails. This will result in nice 500 errors in the novajoin-server, and some log messages for the notifier. Change-Id: I10547fbde8966c8694346ed8c054e627bee2ee51
This commit is contained in:
parent
2c8c868a2c
commit
6ed30c9476
@ -32,9 +32,14 @@ service_opts = [
|
|||||||
help='Kerberos client keytab file'),
|
help='Kerberos client keytab file'),
|
||||||
cfg.StrOpt('domain', default=None,
|
cfg.StrOpt('domain', default=None,
|
||||||
help='Domain for new hosts'),
|
help='Domain for new hosts'),
|
||||||
cfg.IntOpt('connect_retries', default=2,
|
cfg.IntOpt('connect_retries', default=4,
|
||||||
help='How many times to attempt to retry '
|
help='How many times to attempt to retry '
|
||||||
'the connection to IPA before giving up'),
|
'the connection to IPA before giving up'),
|
||||||
|
cfg.IntOpt('connect_backoff', default=0,
|
||||||
|
help='Initial number of seconds to backoff before '
|
||||||
|
'retrying the connection to IPA. The backoff '
|
||||||
|
'time is doubled after each retry. A value '
|
||||||
|
'of 0 disables backoff'),
|
||||||
cfg.BoolOpt('project_subdomain', default=False,
|
cfg.BoolOpt('project_subdomain', default=False,
|
||||||
help='Treat the project as a DNS subdomain '
|
help='Treat the project as a DNS subdomain '
|
||||||
'so a hostname would take the form: '
|
'so a hostname would take the form: '
|
||||||
|
@ -153,3 +153,7 @@ class NotificationVersionMismatch(JoinException):
|
|||||||
message = ("Provided notification version "
|
message = ("Provided notification version "
|
||||||
"%(provided_maj)s.%(provided_min)s did not match expected "
|
"%(provided_maj)s.%(provided_min)s did not match expected "
|
||||||
"%(expected_maj)s.%(expected_min)s for %(type)s")
|
"%(expected_maj)s.%(expected_min)s for %(type)s")
|
||||||
|
|
||||||
|
|
||||||
|
class IPAConnectionError(JoinException):
|
||||||
|
message = "Unable to connect to IPA after %(tries) tries"
|
||||||
|
@ -41,6 +41,7 @@ if ipalib_imported:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
ipalib_imported = False
|
ipalib_imported = False
|
||||||
|
|
||||||
|
from novajoin import exception
|
||||||
from novajoin.util import get_domain
|
from novajoin.util import get_domain
|
||||||
from oslo_config import cfg
|
from oslo_config import cfg
|
||||||
from oslo_log import log as logging
|
from oslo_log import log as logging
|
||||||
@ -54,13 +55,12 @@ LOG = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class IPANovaJoinBase(object):
|
class IPANovaJoinBase(object):
|
||||||
|
|
||||||
def __init__(self, backoff=0):
|
def __init__(self):
|
||||||
try:
|
|
||||||
self.ntries = CONF.connect_retries
|
|
||||||
except cfg.NoSuchOptError:
|
|
||||||
self.ntries = 1
|
|
||||||
if not ipalib_imported:
|
if not ipalib_imported:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
self.ntries = CONF.connect_retries
|
||||||
|
self.initial_backoff = CONF.connect_backoff
|
||||||
self.ccache = "MEMORY:" + str(uuid.uuid4())
|
self.ccache = "MEMORY:" + str(uuid.uuid4())
|
||||||
os.environ['KRB5CCNAME'] = self.ccache
|
os.environ['KRB5CCNAME'] = self.ccache
|
||||||
if self._ipa_client_configured() and not api.isdone('finalize'):
|
if self._ipa_client_configured() and not api.isdone('finalize'):
|
||||||
@ -70,7 +70,7 @@ class IPANovaJoinBase(object):
|
|||||||
api.bootstrap(context='novajoin')
|
api.bootstrap(context='novajoin')
|
||||||
api.finalize()
|
api.finalize()
|
||||||
self.batch_args = list()
|
self.batch_args = list()
|
||||||
self.backoff = backoff
|
self.backoff = self.initial_backoff
|
||||||
|
|
||||||
def split_principal(self, principal):
|
def split_principal(self, principal):
|
||||||
"""Split a principal into its components. Copied from IPA 4.0.0"""
|
"""Split a principal into its components. Copied from IPA 4.0.0"""
|
||||||
@ -129,16 +129,20 @@ class IPANovaJoinBase(object):
|
|||||||
def __backoff(self):
|
def __backoff(self):
|
||||||
LOG.debug("Backing off %s seconds", self.backoff)
|
LOG.debug("Backing off %s seconds", self.backoff)
|
||||||
time.sleep(self.backoff)
|
time.sleep(self.backoff)
|
||||||
if self.backoff < 1024:
|
if self.backoff < 512:
|
||||||
self.backoff = self.backoff * 2
|
self.backoff = self.backoff * 2
|
||||||
|
|
||||||
|
def __reset_backoff(self):
|
||||||
|
if self.backoff > self.initial_backoff:
|
||||||
|
LOG.debug("Resetting backoff to %d", self.initial_backoff)
|
||||||
|
self.backoff = self.initial_backoff
|
||||||
|
|
||||||
def __get_connection(self):
|
def __get_connection(self):
|
||||||
"""Make a connection to IPA or raise an error."""
|
"""Make a connection to IPA or raise an error."""
|
||||||
tries = 0
|
tries = 0
|
||||||
|
|
||||||
while (tries <= self.ntries) or (self.backoff > 0):
|
while (tries <= self.ntries):
|
||||||
if self.backoff == 0:
|
LOG.debug("Attempt %d of %d", tries, self.ntries)
|
||||||
LOG.debug("Attempt %d of %d", tries, self.ntries)
|
|
||||||
if api.Backend.rpcclient.isconnected():
|
if api.Backend.rpcclient.isconnected():
|
||||||
api.Backend.rpcclient.disconnect()
|
api.Backend.rpcclient.disconnect()
|
||||||
try:
|
try:
|
||||||
@ -149,6 +153,7 @@ class IPANovaJoinBase(object):
|
|||||||
except (errors.CCacheError,
|
except (errors.CCacheError,
|
||||||
errors.TicketExpired,
|
errors.TicketExpired,
|
||||||
errors.KerberosError) as e:
|
errors.KerberosError) as e:
|
||||||
|
tries += 1
|
||||||
LOG.debug("kinit again: %s", e)
|
LOG.debug("kinit again: %s", e)
|
||||||
# pylint: disable=no-member
|
# pylint: disable=no-member
|
||||||
try:
|
try:
|
||||||
@ -158,9 +163,8 @@ class IPANovaJoinBase(object):
|
|||||||
self.ccache)
|
self.ccache)
|
||||||
except GSSError as e:
|
except GSSError as e:
|
||||||
LOG.debug("kinit failed: %s", e)
|
LOG.debug("kinit failed: %s", e)
|
||||||
if tries > 0 and self.backoff:
|
if self.backoff:
|
||||||
self.__backoff()
|
self.__backoff()
|
||||||
tries += 1
|
|
||||||
except errors.NetworkError:
|
except errors.NetworkError:
|
||||||
tries += 1
|
tries += 1
|
||||||
if self.backoff:
|
if self.backoff:
|
||||||
@ -173,8 +177,13 @@ class IPANovaJoinBase(object):
|
|||||||
if self.backoff:
|
if self.backoff:
|
||||||
self.__backoff()
|
self.__backoff()
|
||||||
else:
|
else:
|
||||||
|
# successful connection
|
||||||
|
self.__reset_backoff()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
LOG.error("Failed to connect to IPA after %d attempts", self.ntries)
|
||||||
|
raise exception.IPAConnectionError(tries=self.ntries)
|
||||||
|
|
||||||
def start_batch_operation(self):
|
def start_batch_operation(self):
|
||||||
"""Start a batch operation.
|
"""Start a batch operation.
|
||||||
|
|
||||||
|
@ -41,11 +41,9 @@ CONF = config.CONF
|
|||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
BACKOFF = 2
|
|
||||||
|
|
||||||
|
|
||||||
def ipaclient():
|
def ipaclient():
|
||||||
return IPAClient(backoff=BACKOFF)
|
return IPAClient()
|
||||||
|
|
||||||
|
|
||||||
def novaclient():
|
def novaclient():
|
||||||
@ -194,9 +192,14 @@ class NotificationEndpoint(object):
|
|||||||
return
|
return
|
||||||
|
|
||||||
LOG.info("Delete host %s (%s)", instance_id, hostname)
|
LOG.info("Delete host %s (%s)", instance_id, hostname)
|
||||||
ipa = ipaclient()
|
try:
|
||||||
ipa.delete_host(hostname, {})
|
ipa = ipaclient()
|
||||||
self.delete_subhosts(ipa, hostname_short, payload_metadata)
|
ipa.delete_host(hostname, {})
|
||||||
|
self.delete_subhosts(ipa, hostname_short, payload_metadata)
|
||||||
|
except exception.IPAConnectionError:
|
||||||
|
LOG.error("IPA Connection Error when deleting host %s (%s). "
|
||||||
|
"Manual cleanup may be required in the IPA server.",
|
||||||
|
instance_id, hostname)
|
||||||
|
|
||||||
@event_handlers('network.floating_ip.associate')
|
@event_handlers('network.floating_ip.associate')
|
||||||
def floaitng_ip_associate(self, payload):
|
def floaitng_ip_associate(self, payload):
|
||||||
|
@ -0,0 +1,9 @@
|
|||||||
|
---
|
||||||
|
fixes:
|
||||||
|
- Fix the retry backoff mechanism. With the current mechanism, if a backoff
|
||||||
|
is set, we will retry ad infinitum, leading to inconsistent results when
|
||||||
|
a deletion can happen after the instance as been recreated. With the new
|
||||||
|
mechanism, retries will occur a maximum number of times, with or without
|
||||||
|
backoff. A new parameter (connect_backoff, default=0) is added to specify
|
||||||
|
the initial backoff duration. Also, the default for the parameter for the
|
||||||
|
number of retries (connect_retries) has been increased from 2 to 4.
|
Loading…
Reference in New Issue
Block a user