From 4ba418716ed22bd5eeb3b23c73b9427534e5ee54 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Mon, 14 Aug 2023 09:12:17 -0700 Subject: [PATCH] Retool sqlite retries ... To not try instantly, but also not to wait forever to retry. Also, the maximum delay is also now the proper setting to cause the attempt to exit, and is only set to 10 seconds, with a fairly tight interval for retries to occur within. This change also doesn't abort retries for releasing a node lock and updating a node, both actions if they halt due to the close out of a task, can be catastrophic to the underlying operation and state, because internal actions around locking can't be retried with a long interval, otherwise things break in very bad ways. Change-Id: I2041e90bb0f7f522bde4338eceda97f0ae8b2c35 --- ironic/conf/database.py | 7 +++++-- ironic/db/sqlalchemy/api.py | 27 +++++++++++++++++++++------ requirements.txt | 2 +- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/ironic/conf/database.py b/ironic/conf/database.py index 37986e1cd3..6a4349e927 100644 --- a/ironic/conf/database.py +++ b/ironic/conf/database.py @@ -26,9 +26,12 @@ opts = [ help=_('If SQLite database operation retry logic is enabled ' 'or not. Enabled by default.')), cfg.IntOpt('sqlite_max_wait_for_retry', - default=30, + default=10, help=_('Maximum number of seconds to retry SQLite database ' - 'locks.')), + 'locks, after which the original exception will be ' + 'returned to the caller. This does not presently apply ' + 'to internal node lock release actions and DB actions ' + 'centered around the completion of tasks.')), ] diff --git a/ironic/db/sqlalchemy/api.py b/ironic/db/sqlalchemy/api.py index 510891e098..bf591ca3de 100644 --- a/ironic/db/sqlalchemy/api.py +++ b/ironic/db/sqlalchemy/api.py @@ -70,10 +70,22 @@ def wrap_sqlite_retry(f): @functools.wraps(f) def wrapper(*args, **kwargs): - if (CONF.database.sqlite_retries - and not utils.is_ironic_using_sqlite()): + if (not CONF.database.sqlite_retries + or not utils.is_ironic_using_sqlite()): return f(*args, **kwargs) else: + # NOTE(TheJulia): We likely need to see if we can separate + # update_node in API from the final actions of task manager + # actions, but that would also be an internal API change + # because we would likely need a special object method to + # call for update_node to delineate an internal save versus + # an external save. + if f.__name__ in ['update_node', 'release_node']: + stop = tenacity.stop_never + else: + stop = tenacity.stop_after_delay( + max_delay=CONF.database.sqlite_max_wait_for_retry + ) for attempt in tenacity.Retrying( retry=( tenacity.retry_if_exception_type( @@ -81,13 +93,16 @@ def wrap_sqlite_retry(f): & tenacity.retry_if_exception( lambda e: 'database is locked' in str(e)) ), - wait=tenacity.wait_full_jitter( - multiplier=0.25, - max=CONF.database.sqlite_max_wait_for_retry), + wait=tenacity.wait_random( + min=0.1, + max=1, + ), before_sleep=( tenacity.before_sleep_log(LOG, logging.DEBUG) ), - reraise=True): + stop=stop, + reraise=False, + retry_error_cls=exception.TemporaryFailure): with attempt: return f(*args, **kwargs) return wrapper diff --git a/requirements.txt b/requirements.txt index 2f4813baae..d6918b9fec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -40,7 +40,7 @@ jsonpatch!=1.20,>=1.16 # BSD Jinja2>=3.0.0 # BSD License (3 clause) keystonemiddleware>=9.5.0 # Apache-2.0 oslo.messaging>=14.1.0 # Apache-2.0 -tenacity>=6.2.0 # Apache-2.0 +tenacity>=6.3.1 # Apache-2.0 oslo.versionedobjects>=1.31.2 # Apache-2.0 jsonschema>=3.2.0 # MIT psutil>=3.2.2 # BSD