Do not silently ignore exceptions when running next steps

Currently, if do_next_deploy_step or do_next_clean_step fails, the failure
is swallowed by eventlet because these functions run in a new thread. This
is 1) incorrect and 2) leaves nodes stuck in DEPLOYING/CLEANING.

Also update logging in agent_base to make similar problems easier to
spot.

Change-Id: I0282c9e06c54a173efc666cd8df25cf573afb394
This commit is contained in:
Dmitry Tantsur 2020-09-28 14:37:47 +02:00
parent ac19e6050d
commit 0c3f52ec9a
4 changed files with 25 additions and 0 deletions

View File

@ -126,6 +126,8 @@ def do_node_clean(task, clean_steps=None):
do_next_clean_step(task, step_index)
@utils.fail_on_error(utils.deploying_error_handler,
_("Unexpected error when processing next clean step"))
@task_manager.require_exclusive_lock
def do_next_clean_step(task, step_index):
"""Do cleaning, starting from the specified clean step.

View File

@ -204,6 +204,8 @@ def do_node_deploy(task, conductor_id=None, configdrive=None):
do_next_deploy_step(task, 0, conductor_id)
@utils.fail_on_error(utils.deploying_error_handler,
_("Unexpected error when processing next deploy step"))
@task_manager.require_exclusive_lock
def do_next_deploy_step(task, step_index, conductor_id):
"""Do deployment, starting from the specified deploy step.

View File

@ -16,6 +16,7 @@ import contextlib
import crypt
import datetime
from distutils.version import StrictVersion
import functools
import os
import secrets
import time
@ -563,6 +564,21 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
task.process_event('fail')
def fail_on_error(error_callback, msg, *error_args, **error_kwargs):
    """Build a decorator that reports exceptions instead of propagating them.

    The decorated callable must accept ``task`` as its first positional
    argument. Any ``Exception`` it raises is caught, rendered as
    ``"<msg>. <ExceptionClass>: <exception>"`` and handed to
    ``error_callback``; the decorated call then returns ``None``.

    :param error_callback: callable invoked as
        ``error_callback(task, errmsg, *error_args, **error_kwargs)`` when
        the decorated callable raises.
    :param msg: prefix of the error message passed to the callback.
    :param error_args: extra positional arguments for ``error_callback``.
    :param error_kwargs: extra keyword arguments for ``error_callback``.
    :returns: a decorator to apply to the callable being guarded.
    """
    def decorator(func):
        def guarded(task, *args, **kwargs):
            try:
                outcome = func(task, *args, **kwargs)
            except Exception as exc:
                # The callback's own return value is deliberately discarded.
                exc_name = exc.__class__.__name__
                errmsg = "%s. %s: %s" % (msg, exc_name, exc)
                error_callback(task, errmsg, *error_args, **error_kwargs)
                return None
            return outcome

        # Same effect as decorating ``guarded`` with functools.wraps(func).
        return functools.update_wrapper(guarded, func)

    return decorator
@task_manager.require_exclusive_lock
def abort_on_conductor_take_over(task):
"""Set node's state when a task was aborted due to conductor take over.

View File

@ -0,0 +1,5 @@
---
fixes:
- |
No longer silently ignores exceptions that happen when trying to run the
next clean or deploy step.