Retool sqlite retries

... To not try instantly, but also not to wait forever to retry.

Also, the maximum delay is also now the proper setting to cause
the attempt to exit, and is only set to 10 seconds, with a fairly
tight interval for retries to occur within.

This change also doesn't abort retries for releasing a node lock
and updating a node, both actions if they halt due to the close
out of a task, can be catastrophic to the underlying operation
and state, because internal actions around locking can't be retried
with a long interval, otherwise things break in very bad ways.

Change-Id: I2041e90bb0f7f522bde4338eceda97f0ae8b2c35
This commit is contained in:
Julia Kreger 2023-08-14 09:12:17 -07:00
parent 8e2aab8291
commit 4ba418716e
3 changed files with 27 additions and 9 deletions

View File

@ -26,9 +26,12 @@ opts = [
help=_('If SQLite database operation retry logic is enabled '
'or not. Enabled by default.')),
cfg.IntOpt('sqlite_max_wait_for_retry',
default=30,
default=10,
help=_('Maximum number of seconds to retry SQLite database '
'locks.')),
'locks, after which the original exception will be '
'returned to the caller. This does not presently apply '
'to internal node lock release actions and DB actions '
'centered around the completion of tasks.')),
]

View File

@ -70,10 +70,22 @@ def wrap_sqlite_retry(f):
@functools.wraps(f)
def wrapper(*args, **kwargs):
if (CONF.database.sqlite_retries
and not utils.is_ironic_using_sqlite()):
if (not CONF.database.sqlite_retries
or not utils.is_ironic_using_sqlite()):
return f(*args, **kwargs)
else:
# NOTE(TheJulia): We likely need to see if we can separate
# update_node in API from the final actions of task manager
# actions, but that would also be an internal API change
# because we would likely need a special object method to
# call for update_node to delineate an internal save versus
# an external save.
if f.__name__ in ['update_node', 'release_node']:
stop = tenacity.stop_never
else:
stop = tenacity.stop_after_delay(
max_delay=CONF.database.sqlite_max_wait_for_retry
)
for attempt in tenacity.Retrying(
retry=(
tenacity.retry_if_exception_type(
@ -81,13 +93,16 @@ def wrap_sqlite_retry(f):
& tenacity.retry_if_exception(
lambda e: 'database is locked' in str(e))
),
wait=tenacity.wait_full_jitter(
multiplier=0.25,
max=CONF.database.sqlite_max_wait_for_retry),
wait=tenacity.wait_random(
min=0.1,
max=1,
),
before_sleep=(
tenacity.before_sleep_log(LOG, logging.DEBUG)
),
reraise=True):
stop=stop,
reraise=False,
retry_error_cls=exception.TemporaryFailure):
with attempt:
return f(*args, **kwargs)
return wrapper

View File

@ -40,7 +40,7 @@ jsonpatch!=1.20,>=1.16 # BSD
Jinja2>=3.0.0 # BSD License (3 clause)
keystonemiddleware>=9.5.0 # Apache-2.0
oslo.messaging>=14.1.0 # Apache-2.0
tenacity>=6.2.0 # Apache-2.0
tenacity>=6.3.1 # Apache-2.0
oslo.versionedobjects>=1.31.2 # Apache-2.0
jsonschema>=3.2.0 # MIT
psutil>=3.2.2 # BSD