Browse Source

Fix backoff mechanism

Right now, the backoff mechanism is broken when the backoff is
set to something non-zero.  Basically, you go into this state where
you retry ad infinitum, leading to inconsistent behavior.

This change fixes the mechanism so that you only get a fixed number
of retries.  You can choose (through a new config parameter) to allow
backoff (or not).

To restore some of the old behavior, the default for the connect_retries
parameter has been increased from 2 to 4, and the max backoff time has
been decreased from 1024 to 512 seconds.  It's unlikely that we'd ever
reach that backoff time without a large number of retries, but 1024
seems too long.

And there is a new exception that is thrown when the connection
fails.  This will result in nice 500 errors in the novajoin-server,
and some log messages for the notifier.

Change-Id: I10547fbde8966c8694346ed8c054e627bee2ee51
tags/1.2.0
Ade Lee 1 month ago
parent
commit
6ed30c9476

+ 6
- 1
novajoin/config.py View File

@@ -32,9 +32,14 @@ service_opts = [
32 32
                help='Kerberos client keytab file'),
33 33
     cfg.StrOpt('domain', default=None,
34 34
                help='Domain for new hosts'),
35
-    cfg.IntOpt('connect_retries', default=2,
35
+    cfg.IntOpt('connect_retries', default=4,
36 36
                help='How many times to attempt to retry '
37 37
                'the connection to IPA before giving up'),
38
+    cfg.IntOpt('connect_backoff', default=0,
39
+               help='Initial number of seconds to backoff before '
40
+               'retrying the connection to IPA.  The backoff '
41
+               'time is doubled after each retry.  A value '
42
+               'of 0 disables backoff'),
38 43
     cfg.BoolOpt('project_subdomain', default=False,
39 44
                 help='Treat the project as a DNS subdomain '
40 45
                 'so a hostname would take the form: '

+ 4
- 0
novajoin/exception.py View File

@@ -153,3 +153,7 @@ class NotificationVersionMismatch(JoinException):
153 153
     message = ("Provided notification version "
154 154
                "%(provided_maj)s.%(provided_min)s did not match expected "
155 155
                "%(expected_maj)s.%(expected_min)s for %(type)s")
156
+
157
+
158
+class IPAConnectionError(JoinException):
159
+    message = "Unable to connect to IPA after %(tries)s tries"

+ 22
- 13
novajoin/ipa.py View File

@@ -41,6 +41,7 @@ if ipalib_imported:
41 41
         except ImportError:
42 42
             ipalib_imported = False
43 43
 
44
+from novajoin import exception
44 45
 from novajoin.util import get_domain
45 46
 from oslo_config import cfg
46 47
 from oslo_log import log as logging
@@ -54,13 +55,12 @@ LOG = logging.getLogger(__name__)
54 55
 
55 56
 class IPANovaJoinBase(object):
56 57
 
57
-    def __init__(self, backoff=0):
58
-        try:
59
-            self.ntries = CONF.connect_retries
60
-        except cfg.NoSuchOptError:
61
-            self.ntries = 1
58
+    def __init__(self):
62 59
         if not ipalib_imported:
63 60
             return
61
+
62
+        self.ntries = CONF.connect_retries
63
+        self.initial_backoff = CONF.connect_backoff
64 64
         self.ccache = "MEMORY:" + str(uuid.uuid4())
65 65
         os.environ['KRB5CCNAME'] = self.ccache
66 66
         if self._ipa_client_configured() and not api.isdone('finalize'):
@@ -70,7 +70,7 @@ class IPANovaJoinBase(object):
70 70
             api.bootstrap(context='novajoin')
71 71
             api.finalize()
72 72
         self.batch_args = list()
73
-        self.backoff = backoff
73
+        self.backoff = self.initial_backoff
74 74
 
75 75
     def split_principal(self, principal):
76 76
         """Split a principal into its components. Copied from IPA 4.0.0"""
@@ -129,16 +129,20 @@ class IPANovaJoinBase(object):
129 129
     def __backoff(self):
130 130
         LOG.debug("Backing off %s seconds", self.backoff)
131 131
         time.sleep(self.backoff)
132
-        if self.backoff < 1024:
132
+        if self.backoff < 512:
133 133
             self.backoff = self.backoff * 2
134 134
 
135
+    def __reset_backoff(self):
136
+        if self.backoff > self.initial_backoff:
137
+            LOG.debug("Resetting backoff to %d", self.initial_backoff)
138
+            self.backoff = self.initial_backoff
139
+
135 140
     def __get_connection(self):
136 141
         """Make a connection to IPA or raise an error."""
137 142
         tries = 0
138 143
 
139
-        while (tries <= self.ntries) or (self.backoff > 0):
140
-            if self.backoff == 0:
141
-                LOG.debug("Attempt %d of %d", tries, self.ntries)
144
+        while (tries <= self.ntries):
145
+            LOG.debug("Attempt %d of %d", tries, self.ntries)
142 146
             if api.Backend.rpcclient.isconnected():
143 147
                 api.Backend.rpcclient.disconnect()
144 148
             try:
@@ -149,6 +153,7 @@ class IPANovaJoinBase(object):
149 153
             except (errors.CCacheError,
150 154
                     errors.TicketExpired,
151 155
                     errors.KerberosError) as e:
156
+                tries += 1
152 157
                 LOG.debug("kinit again: %s", e)
153 158
                 # pylint: disable=no-member
154 159
                 try:
@@ -158,9 +163,8 @@ class IPANovaJoinBase(object):
158 163
                                  self.ccache)
159 164
                 except GSSError as e:
160 165
                     LOG.debug("kinit failed: %s", e)
161
-                if tries > 0 and self.backoff:
162
-                    self.__backoff()
163
-                tries += 1
166
+                    if self.backoff:
167
+                        self.__backoff()
164 168
             except errors.NetworkError:
165 169
                 tries += 1
166 170
                 if self.backoff:
@@ -173,8 +177,13 @@ class IPANovaJoinBase(object):
173 177
                 if self.backoff:
174 178
                     self.__backoff()
175 179
             else:
180
+                # successful connection
181
+                self.__reset_backoff()
176 182
                 return
177 183
 
184
+        LOG.error("Failed to connect to IPA after %d attempts", self.ntries)
185
+        raise exception.IPAConnectionError(tries=self.ntries)
186
+
178 187
     def start_batch_operation(self):
179 188
         """Start a batch operation.
180 189
 

+ 9
- 6
novajoin/notifications.py View File

@@ -41,11 +41,9 @@ CONF = config.CONF
41 41
 
42 42
 LOG = logging.getLogger(__name__)
43 43
 
44
-BACKOFF = 2
45
-
46 44
 
47 45
 def ipaclient():
48
-    return IPAClient(backoff=BACKOFF)
46
+    return IPAClient()
49 47
 
50 48
 
51 49
 def novaclient():
@@ -194,9 +192,14 @@ class NotificationEndpoint(object):
194 192
             return
195 193
 
196 194
         LOG.info("Delete host %s (%s)", instance_id, hostname)
197
-        ipa = ipaclient()
198
-        ipa.delete_host(hostname, {})
199
-        self.delete_subhosts(ipa, hostname_short, payload_metadata)
195
+        try:
196
+            ipa = ipaclient()
197
+            ipa.delete_host(hostname, {})
198
+            self.delete_subhosts(ipa, hostname_short, payload_metadata)
199
+        except exception.IPAConnectionError:
200
+            LOG.error("IPA Connection Error when deleting host %s (%s).  "
201
+                      "Manual cleanup may be required in the IPA server.",
202
+                      instance_id, hostname)
200 203
 
201 204
     @event_handlers('network.floating_ip.associate')
202 205
     def floaitng_ip_associate(self, payload):

+ 9
- 0
novajoin/releasenotes/notes/fix-backoff-mechanism-c681255215624413.yaml View File

@@ -0,0 +1,9 @@
1
+---
2
+fixes:
3
+  - Fix the retry backoff mechanism.  With the current mechanism, if a backoff
4
+    is set, we will retry ad infinitum, leading to inconsistent results when
5
+    a deletion can happen after the instance has been recreated.  With the new
6
+    mechanism, retries will occur a maximum number of times, with or without
7
+    backoff.  A new parameter (connect_backoff, default=0) is added to specify
8
+    the initial backoff duration.  Also, the default for the parameter for the
9
+    number of retries (connect_retries) has been increased from 2 to 4.

Loading…
Cancel
Save