From 0c3f52ec9a12c5854d326bc95fab78e6e87b4dcb Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Mon, 28 Sep 2020 14:37:47 +0200 Subject: [PATCH] Do not silently ignore exceptions when running next steps Currently if do_next_deploy/clean_step fails, the failure is swallowed by eventlet since they're run in a new thread. This 1) is incorrect, 2) leads to nodes stuck in DEPLOYING/CLEANING. Also update logging in agent_base to be able to easier spot similar problems. Change-Id: I0282c9e06c54a173efc666cd8df25cf573afb394 --- ironic/conductor/cleaning.py | 2 ++ ironic/conductor/deployments.py | 2 ++ ironic/conductor/utils.py | 16 ++++++++++++++++ .../notes/spawn-error-2249f94606388fbd.yaml | 5 +++++ 4 files changed, 25 insertions(+) create mode 100644 releasenotes/notes/spawn-error-2249f94606388fbd.yaml diff --git a/ironic/conductor/cleaning.py b/ironic/conductor/cleaning.py index 9a923abf7b..8a1b7be5d1 100644 --- a/ironic/conductor/cleaning.py +++ b/ironic/conductor/cleaning.py @@ -126,6 +126,8 @@ def do_node_clean(task, clean_steps=None): do_next_clean_step(task, step_index) +@utils.fail_on_error(utils.deploying_error_handler, + _("Unexpected error when processing next clean step")) @task_manager.require_exclusive_lock def do_next_clean_step(task, step_index): """Do cleaning, starting from the specified clean step. diff --git a/ironic/conductor/deployments.py b/ironic/conductor/deployments.py index 780b302c59..54e05a62cd 100644 --- a/ironic/conductor/deployments.py +++ b/ironic/conductor/deployments.py @@ -204,6 +204,8 @@ def do_node_deploy(task, conductor_id=None, configdrive=None): do_next_deploy_step(task, 0, conductor_id) +@utils.fail_on_error(utils.deploying_error_handler, + _("Unexpected error when processing next deploy step")) @task_manager.require_exclusive_lock def do_next_deploy_step(task, step_index, conductor_id): """Do deployment, starting from the specified deploy step. diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index 26c590f4d5..8c5dd8e3f9 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -16,6 +16,7 @@ import contextlib import crypt import datetime from distutils.version import StrictVersion +import functools import os import secrets import time @@ -563,6 +564,21 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False, task.process_event('fail') +def fail_on_error(error_callback, msg, *error_args, **error_kwargs): + """A decorator for failing operation on failure.""" + def wrapper(func): + @functools.wraps(func) + def wrapped(task, *args, **kwargs): + try: + return func(task, *args, **kwargs) + except Exception as exc: + errmsg = "%s. %s: %s" % (msg, exc.__class__.__name__, exc) + error_callback(task, errmsg, *error_args, **error_kwargs) + + return wrapped + return wrapper + + @task_manager.require_exclusive_lock def abort_on_conductor_take_over(task): """Set node's state when a task was aborted due to conductor take over. diff --git a/releasenotes/notes/spawn-error-2249f94606388fbd.yaml b/releasenotes/notes/spawn-error-2249f94606388fbd.yaml new file mode 100644 index 0000000000..51e4d6795a --- /dev/null +++ b/releasenotes/notes/spawn-error-2249f94606388fbd.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + No longer silently ignores exceptions that happen when trying to run the + next clean or deploy step.