Record node history and manage events in db

* Adds a periodic task to purge node_history entries based upon the
  provided configuration.
* Adds recording of node history entries for errors in the
  core conductor code.
* Also changes the rescue abort behavior so that the abort notice is
  no longer recorded as an error, since recording it as an error
  would likely trigger buggy behavior in any process or service
  evaluating the node last_error field.
* Makes use of a semi-free-form event_type field to provide some
  additional context about what is going on and why (see the sketch
  below). For example, if deployments are repeatedly failing, then
  perhaps it is a configuration issue as opposed to a general
  failure. If a conductor has no resources, then the failure, in
  theory, points back to the conductor itself.
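
The helper added by this change is used roughly as sketched below.
This is an illustration only: the task object, error text, and failure
path are hypothetical, while the helper, its keyword arguments, and
the event type constants are the ones introduced here.

    # Hypothetical failure path inside a conductor routine. With
    # error=True the helper mirrors the event into node.last_error
    # in addition to writing a node_history entry.
    from ironic.common import states
    from ironic.conductor import utils

    utils.node_history_record(task.node,
                              event='Deploy step failed: example',
                              event_type=states.DEPLOYING,
                              error=True,
                              user=task.context.user_id)
    task.node.save()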

Story: 2002980
Task: 42960

Change-Id: Ibfa8ac4878cacd98a43dd4424f6d53021ad91166
Julia Kreger 2021-08-02 16:07:46 -07:00
parent fbaad948d8
commit d17749249c
12 changed files with 622 additions and 39 deletions

View File

@ -297,6 +297,25 @@ ALLOCATING = 'allocating'
# States ERROR and ACTIVE are reused.
###########################
# History Event State Types
###########################
PROVISIONING = "provisioning"
CLEANING = "cleaning"
DEPLOYING = "deploying"
TAKEOVER = "takeover"
INTROSPECTION = "introspection"
RESCUE = "rescue"
CONDUCTOR = "conductor"
TRANSITION = "transition"
STARTFAIL = "startup failure"
UNPROVISION = "unprovision"
ADOPTION = "adoption"
CONSOLE = "console"
MONITORING = "monitoring"
VERIFY = "verify"
#####################
# State machine model

View File

@ -37,6 +37,7 @@ from ironic.common import states
from ironic.conductor import allocations
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.conductor import utils
from ironic.conf import CONF
from ironic.db import api as dbapi
from ironic.drivers import base as driver_base
@ -536,7 +537,10 @@ class BaseConductorManager(object):
err_handler=err_handler,
target_state=target_state)
else:
task.node.last_error = last_error
utils.node_history_record(
task.node, event=last_error,
error=True,
event_type=states.TRANSITION)
task.process_event('fail', target_state=target_state)
except exception.NoFreeConductorWorker:
break
@ -580,7 +584,9 @@ class BaseConductorManager(object):
LOG.error(msg)
# If starting console failed, set node console_enabled
# back to False and set node's last error.
task.node.last_error = msg
utils.node_history_record(task.node, event=msg,
error=True,
event_type=states.STARTFAIL)
task.node.console_enabled = False
task.node.save()
notify_utils.emit_console_notification(

View File

@ -712,7 +712,11 @@ class ConductorManager(base_manager.BaseConductorManager):
def handle_failure(e, errmsg, log_func=LOG.error):
utils.remove_node_rescue_password(node, save=False)
node.last_error = errmsg % e
error = errmsg % e
utils.node_history_record(task.node, event=error,
event_type=states.RESCUE,
error=True,
user=task.context.user_id)
task.process_event('fail')
log_func('Error while performing rescue operation for node '
'%(node)s with instance %(instance)s: %(err)s',
@ -801,7 +805,11 @@ class ConductorManager(base_manager.BaseConductorManager):
node = task.node
def handle_failure(e, errmsg, log_func=LOG.error):
node.last_error = errmsg % e
error = errmsg % e
utils.node_history_record(task.node, event=error,
event_type=states.RESCUE,
error=True,
user=task.context.user_id)
task.process_event('fail')
log_func('Error while performing unrescue operation for node '
'%(node)s with instance %(instance)s: %(err)s',
@ -845,7 +853,9 @@ class ConductorManager(base_manager.BaseConductorManager):
error_msg = _('Failed to clean up rescue after aborting '
'the operation')
node.refresh()
node.last_error = error_msg
utils.node_history_record(node, event=error_msg,
event_type=states.RESCUE, error=True,
user=task.context.user_id)
node.maintenance = True
node.maintenance_reason = error_msg
node.fault = faults.RESCUE_ABORT_FAILURE
@ -853,10 +863,15 @@ class ConductorManager(base_manager.BaseConductorManager):
return
info_message = _('Rescue operation aborted for node %s.') % node.uuid
last_error = _('By request, the rescue operation was aborted.')
# NOTE(TheJulia): This "error" is not an actual error, the operation
# has been aborted and the node returned to normal operation.
error = _('By request, the rescue operation was aborted.')
utils.node_history_record(task.node, event=error,
event_type=states.RESCUE,
error=False,
user=task.context.user_id)
node.refresh()
utils.remove_agent_url(node)
node.last_error = last_error
node.save()
LOG.info(info_message)
@ -1053,7 +1068,11 @@ class ConductorManager(base_manager.BaseConductorManager):
with excutils.save_and_reraise_exception():
LOG.exception('Error in tear_down of node %(node)s: %(err)s',
{'node': node.uuid, 'err': e})
node.last_error = _("Failed to tear down. Error: %s") % e
error = _("Failed to tear down. Error: %s") % e
utils.node_history_record(task.node, event=error,
event_type=states.UNPROVISION,
error=True,
user=task.context.user_id)
task.process_event('fail')
else:
# NOTE(tenbrae): When tear_down finishes, the deletion is done,
@ -1339,10 +1358,18 @@ class ConductorManager(base_manager.BaseConductorManager):
with excutils.save_and_reraise_exception():
LOG.exception('Error in aborting the inspection of '
'node %(node)s', {'node': node.uuid})
node.last_error = _('Failed to abort inspection. '
'Error: %s') % e
error = _('Failed to abort inspection. '
'Error: %s') % e
utils.node_history_record(task.node, event=error,
event_type=states.INTROSPECTION,
error=True,
user=task.context.user_id)
node.save()
node.last_error = _('Inspection was aborted by request.')
error = _('Inspection was aborted by request.')
utils.node_history_record(task.node, event=error,
event_type=states.INTROSPECTION,
error=True,
user=task.context.user_id)
utils.wipe_token_and_url(task)
task.process_event('abort')
LOG.info('Successfully aborted inspection of node %(node)s',
@ -1659,9 +1686,14 @@ class ConductorManager(base_manager.BaseConductorManager):
if not task.node.maintenance and task.node.target_power_state:
old_state = task.node.target_power_state
task.node.target_power_state = None
task.node.last_error = _('Pending power operation was '
'aborted due to conductor take '
'over')
error = _('Pending power operation was '
'aborted due to conductor take '
'over')
utils.node_history_record(task.node, event=error,
event_type=states.TAKEOVER,
error=True,
user=task.context.user_id)
task.node.save()
LOG.warning('Aborted pending power operation %(op)s '
'on node %(node)s due to conductor take over',
@ -1725,7 +1757,10 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error(msg)
# Wipe power state from being preserved as it is likely invalid.
node.power_state = states.NOSTATE
node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.ADOPTION,
error=True,
user=task.context.user_id)
task.process_event('fail')
@METRICS.timer('ConductorManager._do_takeover')
@ -1764,7 +1799,10 @@ class ConductorManager(base_manager.BaseConductorManager):
LOG.error(msg)
# If taking over console failed, set node's console_enabled
# back to False and set node's last error.
task.node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.TAKEOVER,
error=True,
user=task.context.user_id)
task.node.console_enabled = False
console_error = True
else:
@ -2231,7 +2269,10 @@ class ConductorManager(base_manager.BaseConductorManager):
'Reason: %(error)s') % {'op': op,
'node': node.uuid,
'error': e})
node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.CONSOLE,
error=True,
user=task.context.user_id)
LOG.error(msg)
node.save()
notify_utils.emit_console_notification(
@ -3495,6 +3536,55 @@ class ConductorManager(base_manager.BaseConductorManager):
task.node.save()
return task.node
@METRICS.timer('ConductorManager.manage_node_history')
@periodics.periodic(
spacing=CONF.conductor.node_history_cleanup_interval,
enabled=(
CONF.conductor.node_history_cleanup_batch_count > 0
and CONF.conductor.node_history_max_entries != 0
)
)
def manage_node_history(self, context):
try:
self._manage_node_history(context)
except Exception as e:
LOG.error('Encountered error while cleaning node '
'history records: %s', e)
def _manage_node_history(self, context):
"""Periodic task to keep the node history tidy."""
max_batch = CONF.conductor.node_history_cleanup_batch_count
# NOTE(TheJulia): Asks the db for the list. Presently this just
# returns the node id and the associated history entry ids. If we
# incorporate a by-date constraint or similar handling, then the
# method will need to identify the explicit ID values to delete,
# and the deletion process will need to erase them in logical
# chunks.
entries_to_clean = self.dbapi.query_node_history_records_for_purge(
conductor_id=self.conductor.id)
count = 0
for node_id in entries_to_clean:
if count < max_batch:
# If we have not hit our total limit, proceed
if entries_to_clean[node_id]:
# if we have work to do on this node, proceed.
self.dbapi.bulk_delete_node_history_records(
entries_to_clean[node_id])
else:
LOG.warning('While cleaning up node history records, '
'we reached the maximum number of records '
'permitted in a single batch. If this warning '
'repeats, consider tuning the node history '
'configuration options to be more aggressive '
'by increasing the cleanup frequency and '
'lowering the number of entries deleted per '
'batch so performance is not negatively '
'impacted.')
break
count = count + len(entries_to_clean[node_id])
# Yield to other threads, since we also don't want to be
# looping tightly deleting rows as that will negatively
# impact DB access if done in excess.
eventlet.sleep(0)
@METRICS.timer('get_vendor_passthru_metadata')
def get_vendor_passthru_metadata(route_dict):
@ -3534,7 +3624,9 @@ def handle_sync_power_state_max_retries_exceeded(task, actual_power_state,
old_power_state = node.power_state
node.power_state = actual_power_state
node.last_error = msg
utils.node_history_record(task.node, event=msg,
event_type=states.MONITORING,
error=True)
node.maintenance = True
node.maintenance_reason = msg
node.fault = faults.POWER_FAILURE
@ -3688,7 +3780,9 @@ def _do_inspect_hardware(task):
node = task.node
def handle_failure(e, log_func=LOG.error):
node.last_error = e
utils.node_history_record(task.node, event=e,
event_type=states.INTROSPECTION,
error=True, user=task.context.user_id)
task.process_event('fail')
log_func("Failed to inspect node %(node)s: %(err)s",
{'node': node.uuid, 'err': e})

View File

@ -39,6 +39,7 @@ from ironic.common import states
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.objects import fields
from ironic.objects import node_history
LOG = log.getLogger(__name__)
CONF = cfg.CONF
@ -245,9 +246,10 @@ def _can_skip_state_change(task, new_state):
curr_state = task.driver.power.get_power_state(task)
except Exception as e:
with excutils.save_and_reraise_exception():
node['last_error'] = _(
error = _(
"Failed to change power state to '%(target)s'. "
"Error: %(error)s") % {'target': new_state, 'error': e}
node_history_record(node, event=error, error=True)
node['target_power_state'] = states.NOSTATE
node.save()
notify_utils.emit_power_set_notification(
@ -329,12 +331,13 @@ def node_power_action(task, new_state, timeout=None):
except Exception as e:
with excutils.save_and_reraise_exception():
node['target_power_state'] = states.NOSTATE
node['last_error'] = _(
error = _(
"Failed to change power state to '%(target_state)s' "
"by '%(new_state)s'. Error: %(error)s") % {
'target_state': target_state,
'new_state': new_state,
'error': e}
node_history_record(node, event=error, error=True)
node.save()
notify_utils.emit_power_set_notification(
task, fields.NotificationLevel.ERROR,
@ -399,7 +402,9 @@ def provisioning_error_handler(e, node, provision_state,
# because it isn't updated on a failed deploy
node.provision_state = provision_state
node.target_provision_state = target_provision_state
node.last_error = (_("No free conductor workers available"))
error = (_("No free conductor workers available"))
node_history_record(node, event=error, event_type=states.PROVISIONING,
error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"an action on node %(node)s, setting node's "
@ -483,7 +488,8 @@ def cleaning_error_handler(task, logmsg, errmsg=None, traceback=False,
# For manual cleaning, the target provision state is MANAGEABLE, whereas
# for automated cleaning, it is AVAILABLE.
manual_clean = node.target_provision_state == states.MANAGEABLE
node.last_error = errmsg
node_history_record(node, event=errmsg, event_type=states.CLEANING,
error=True)
# NOTE(dtantsur): avoid overwriting existing maintenance_reason
if not node.maintenance_reason and set_maintenance:
node.maintenance_reason = errmsg
@ -570,7 +576,8 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
errmsg = errmsg or logmsg
node = task.node
LOG.error(logmsg, exc_info=traceback)
node.last_error = errmsg
node_history_record(node, event=errmsg, event_type=states.DEPLOYING,
error=True)
node.save()
cleanup_err = None
@ -600,7 +607,9 @@ def deploying_error_handler(task, logmsg, errmsg=None, traceback=False,
wipe_deploy_internal_info(task)
if cleanup_err:
node.last_error = cleanup_err
node_history_record(node, event=cleanup_err,
event_type=states.DEPLOYING,
error=True)
node.save()
# NOTE(tenbrae): there is no need to clear conductor_affinity
@ -636,7 +645,8 @@ def abort_on_conductor_take_over(task):
else:
# For aborted deployment (and potentially other operations), just set
# the last_error accordingly.
task.node.last_error = msg
node_history_record(task.node, event=msg, event_type=states.TAKEOVER,
error=True)
task.node.save()
LOG.warning('Aborted the current operation on node %s due to '
@ -657,17 +667,22 @@ def rescuing_error_handler(task, msg, set_fail_state=True):
node_power_action(task, states.POWER_OFF)
task.driver.rescue.clean_up(task)
remove_agent_url(node)
node.last_error = msg
node_history_record(task.node, event=msg, event_type=states.RESCUE,
error=True)
except exception.IronicException as e:
node.last_error = (_('Rescue operation was unsuccessful, clean up '
'failed for node: %(error)s') % {'error': e})
error = (_('Rescue operation was unsuccessful, clean up '
'failed for node: %(error)s') % {'error': e})
node_history_record(task.node, event=error, event_type=states.RESCUE,
error=True)
LOG.error(('Rescue operation was unsuccessful, clean up failed for '
'node %(node)s: %(error)s'),
{'node': node.uuid, 'error': e})
except Exception as e:
node.last_error = (_('Rescue failed, but an unhandled exception was '
'encountered while aborting: %(error)s') %
{'error': e})
error = (_('Rescue failed, but an unhandled exception was '
'encountered while aborting: %(error)s') %
{'error': e})
node_history_record(task.node, event=error, event_type=states.RESCUE,
error=True)
LOG.exception('Rescue failed for node %(node)s, an exception was '
'encountered while aborting.', {'node': node.uuid})
finally:
@ -708,7 +723,9 @@ def _spawn_error_handler(e, node, operation):
:param operation: the operation being performed on the node.
"""
if isinstance(e, exception.NoFreeConductorWorker):
node.last_error = (_("No free conductor workers available"))
error = (_("No free conductor workers available"))
node_history_record(node, event=error, event_type=states.CONDUCTOR,
error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"%(operation)s on node %(node)s",
@ -749,7 +766,9 @@ def power_state_error_handler(e, node, power_state):
if isinstance(e, exception.NoFreeConductorWorker):
node.power_state = power_state
node.target_power_state = states.NOSTATE
node.last_error = (_("No free conductor workers available"))
error = (_("No free conductor workers available"))
node_history_record(node, event=error, event_type=states.CONDUCTOR,
error=True)
node.save()
LOG.warning("No free conductor workers available to perform "
"an action on node %(node)s, setting node's "
@ -1573,3 +1592,58 @@ def node_change_secure_boot(task, secure_boot_target):
{'state': secure_boot_target, 'node': task.node.uuid})
task.node.secure_boot = secure_boot_target
task.node.save()
def node_history_record(node, conductor=None, event=None,
event_type=None, user=None,
error=False):
"""Records a node history record
Adds an entry to the node history table with the appropriate fields
populated to ensure consistent experience by also updating the
node ``last_error`` field. Please note the event is only recorded
if the ``[conductor]node_history_max_size`` parameter is set to a
value greater than ``0``.
:param node: A node object from a task object. Required.
:param conductor: The hostname of the conductor. If not specified
this value is populated with the conductor FQDN.
:param event: The text to record to the node history table.
If no value is supplied, the method silently returns
to the caller.
:param event_type: The type of activity during which the event was
encountered, such as "provisioning", "monitoring",
"cleaning", or whatever text a driver author wishes
to supply based upon the activity. The purpose is to
help an API consumer/operator gain better contextual
understanding of what was going on *when* the "event"
occurred.
:param user: The user_id value which triggered the request,
if available.
:param error: Boolean value, default false, to signify if the event
is an error which should be recorded in the node
``last_error`` field.
:returns: None. No value is returned by this method.
"""
if not event:
# No error has occurred, apparently.
return
if error:
# When the task exits or is saved, the event
# or error is saved, but that is separate from creating an
# entry in the history table.
node.last_error = event
if not conductor:
conductor = CONF.host
if CONF.conductor.node_history:
# If node history recording is enabled, then we should
# record the entry.
# NOTE(TheJulia): DB API automatically adds in a uuid.
# TODO(TheJulia): At some point, we should allow custom severity.
node_history.NodeHistory(
node_id=node.id,
conductor=conductor,
user=user,
severity=error and "ERROR" or "INFO",
event=event,
event_type=event_type or "UNKNOWN").create()
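
For reference, a short sketch of how a recorded entry can be read back,
for instance while debugging or in a test. The node object and request
context are assumed to already exist, and the event text is purely
illustrative; the helper and NodeHistory.list_by_node_id are used the
same way in the tests further below.

    # Records an entry (and, with error=True, sets node.last_error),
    # then reads it back via the NodeHistory object API.
    from ironic.conductor import utils as conductor_utils
    from ironic import objects

    conductor_utils.node_history_record(node, event='agent lost',
                                        error=True)
    entries = objects.NodeHistory.list_by_node_id(context, node.id)
    assert entries[0]['event'] == 'agent lost'
    assert entries[0]['severity'] == 'ERROR'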

View File

@ -16,6 +16,7 @@ from oslo_log import log
from ironic.common import exception
from ironic.common.i18n import _
from ironic.common import states
from ironic.conductor import notification_utils as notify_utils
from ironic.conductor import task_manager
from ironic.conductor import utils
@ -67,5 +68,8 @@ def do_node_verify(task):
else:
task.process_event('done')
else:
node.last_error = error
utils.node_history_record(task.node, event=error,
event_type=states.VERIFY,
error=True,
user=task.context.user_id)
task.process_event('fail')

View File

@ -283,6 +283,57 @@ opts = [
'will not run during cleaning. If unset for an '
'inband clean step, will use the priority set in the '
'ramdisk.')),
cfg.BoolOpt('node_history',
default=True,
mutable=True,
help=_('Boolean value, default True, controlling whether node '
'event history is recorded. Errors and other noteworthy '
'events in relation to a node are journaled to a '
'database table, which incurs some additional load. '
'A periodic task periodically removes entries '
'from the database. Please note, if this is disabled, '
'the conductor will continue to purge entries as '
'long as [conductor]node_history_cleanup_batch_count '
'is not 0.')),
cfg.IntOpt('node_history_max_entries',
default=300,
min=0,
mutable=True,
help=_('Maximum number of history entries which will be stored '
'in the database per node. Default is 300. This setting '
'excludes the minimum number of days retained using the '
'[conductor]node_history_minimum_days setting.')),
cfg.IntOpt('node_history_cleanup_interval',
min=0,
default=86400,
mutable=False,
help=_('Interval in seconds at which node history entries '
'can be cleaned up in the database. Setting to 0 '
'disables the periodic task. Defaults to once a day, '
'or 86400 seconds.')),
cfg.IntOpt('node_history_cleanup_batch_count',
min=0,
default=1000,
mutable=False,
help=_('The target number of node history records to purge '
'from the database when performing clean-up. '
'Deletes are performed by node, and a node with excess '
'records will still have all of those records deleted. '
'Defaults to 1000. Operators who find node history '
'building up may wish to '
'lower this threshold and decrease the time between '
'cleanup operations using the '
'``node_history_cleanup_interval`` setting.')),
cfg.IntOpt('node_history_minimum_days',
min=0,
default=0,
mutable=True,
help=_('The minimum number of days of history entries to '
'explicitly keep on hand in the database for each node. '
'Entries retained by this setting do not count toward '
'the [conductor]node_history_max_entries limit, as users '
'of this setting are anticipated to need to retain '
'history by policy.')),
]
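
For illustration only, the new options could be tuned together in the
[conductor] section of ironic.conf; the values below are arbitrary
examples rather than recommendations.

    [conductor]
    # Keep recording node events (the default).
    node_history = True
    # Retain at most 100 history entries per node beyond the
    # minimum-days window.
    node_history_max_entries = 100
    # Run the cleanup periodic task every 12 hours.
    node_history_cleanup_interval = 43200
    # Purge at most 500 records per cleanup pass.
    node_history_cleanup_batch_count = 500
    # Always keep at least 7 days of history regardless of the
    # per-node entry limit.
    node_history_minimum_days = 7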

View File

@ -1380,3 +1380,21 @@ class Connection(object, metaclass=abc.ABCMeta):
(asc, desc)
:returns: A list of histories.
"""
@abc.abstractmethod
def query_node_history_records_for_purge(self, conductor_id):
"""Utility method to identify nodes to clean history records for.
:param conductor_id: Id value for the conductor to perform this
query on behalf of.
:returns: A dictionary mapping node database ID values to lists
of node history entry IDs which are candidates for
removal for that node.
"""
@abc.abstractmethod
def bulk_delete_node_history_records(self, entries):
"""Utility method to bulk delete node history entries.
:param entries: A list of node history entry IDs to be
queried for deletion.
"""

View File

@ -2329,3 +2329,74 @@ class Connection(api.Connection):
query = query.filter_by(node_id=node_id)
return _paginate_query(models.NodeHistory, limit, marker,
sort_key, sort_dir, query)
def query_node_history_records_for_purge(self, conductor_id):
min_days = CONF.conductor.node_history_minimum_days
max_num = CONF.conductor.node_history_max_entries
with _session_for_read() as session:
# First, figure out our nodes.
nodes = session.query(
models.Node.id,
).filter(
models.Node.conductor_affinity == conductor_id
)
# Build our query to get the node_id and record id.
query = session.query(
models.NodeHistory.node_id,
models.NodeHistory.id,
)
# Filter by the nodes
query = query.filter(
models.NodeHistory.node_id.in_(nodes)
).order_by(
# Order in an ascending order as older is always first.
models.NodeHistory.created_at.asc()
)
# Filter by minimum days
if min_days > 0:
before = datetime.datetime.now() - datetime.timedelta(
days=min_days)
query = query.filter(
models.NodeHistory.created_at < before
)
# Build our result set
result_set = {}
for entry in query.all():
if entry[0] not in result_set:
result_set[entry[0]] = []
result_set[entry[0]].append(entry[1])
final_set = {}
# Generate our final set of entries which should be removed
# by accounting for the number of permitted entries.
for entry in result_set:
final_set[entry] = []
set_len = len(result_set[entry])
# Any count <= the maximum number is okay
if set_len > max_num:
# figure out how many entries need to be removed
num_to_remove = set_len - max_num
for i in range(0, num_to_remove):
final_set[entry].append(result_set[entry][i])
# The entries at the end of the list are kept, as they
# are the more recent items since we ordered ascending
# originally.
return final_set
def bulk_delete_node_history_records(self, entries):
with _session_for_write() as session:
# Uses input entry list, selects entries matching those ids
# then deletes them and does not synchronize the session so
# sqlalchemy doesn't do extra un-necessary work.
session.query(
models.NodeHistory
).filter(
models.NodeHistory.id.in_(entries)
).delete(synchronize_session=False)
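
Putting the two methods together, the purge flow boils down to the
sketch below. The dictionary contents are made-up values and ``dbapi``
stands for a Connection instance; the call pattern mirrors the
_manage_node_history loop in the conductor manager above.

    # Hypothetical result of query_node_history_records_for_purge():
    # node database ids mapped to the history record ids eligible for
    # deletion, ordered oldest first.
    entries_to_clean = {
        10: [101, 102],   # two excess records for node id 10
        11: [],           # nothing to remove for node id 11
    }
    for node_id, record_ids in entries_to_clean.items():
        if record_ids:
            dbapi.bulk_delete_node_history_records(record_ids)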

View File

@ -542,6 +542,12 @@ class StartConsolesTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
fields.NotificationStatus.START),
mock.call(mock.ANY, 'console_restore',
fields.NotificationStatus.ERROR)])
history = objects.NodeHistory.list_by_node_id(self.context,
test_node.id)
entry = history[0]
self.assertEqual('startup failure', entry['event_type'])
self.assertEqual('ERROR', entry['severity'])
self.assertIsNotNone(entry['event'])
@mock.patch.object(base_manager, 'LOG', autospec=True)
def test__start_consoles_node_locked(self, log_mock, mock_notify,
@ -599,3 +605,9 @@ class MiscTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.service._fail_transient_state(states.DEPLOYING, 'unknown err')
node.refresh()
self.assertEqual(states.DEPLOYFAIL, node.provision_state)
history = objects.NodeHistory.list_by_node_id(self.context,
node.id)
entry = history[0]
self.assertEqual('transition', entry['event_type'])
self.assertEqual('ERROR', entry['severity'])
self.assertEqual('unknown err', entry['event'])

View File

@ -3361,7 +3361,7 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin,
with task_manager.acquire(self.context, node.uuid) as task:
self.service._do_node_rescue_abort(task)
clean_up_mock.assert_called_once_with(task.driver.rescue, task)
self.assertIsNotNone(task.node.last_error)
self.assertIsNone(task.node.last_error)
self.assertFalse(task.node.maintenance)
self.assertNotIn('agent_url', task.node.driver_internal_info)
@ -8153,3 +8153,166 @@ class DoNodeInspectAbortTestCase(mgr_utils.CommonMixIn,
self.assertIn('Inspection was aborted', node.last_error)
self.assertNotIn('agent_url', node.driver_internal_info)
self.assertNotIn('agent_secret_token', node.driver_internal_info)
class NodeHistoryRecordCleanupTestCase(mgr_utils.ServiceSetUpMixin,
db_base.DbTestCase):
def setUp(self):
super(NodeHistoryRecordCleanupTestCase, self).setUp()
self.node1 = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=10,
uuid=uuidutils.generate_uuid(),
conductor_affinity=1)
self.node2 = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=11,
uuid=uuidutils.generate_uuid(),
conductor_affinity=1)
self.node3 = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=12,
uuid=uuidutils.generate_uuid(),
conductor_affinity=1)
self.nodes = [self.node1, self.node2, self.node3]
# Create the nodes, as the tasks need to operate across tables.
self.node1.create()
self.node2.create()
self.node3.create()
CONF.set_override('node_history_max_entries', 2, group='conductor')
CONF.set_override('node_history_cleanup_batch_count', 2,
group='conductor')
self._start_service()
def test_history_is_pruned_to_config(self):
for node in self.nodes:
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
conductor_utils.node_history_record(self.node1, event="final")
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
self.assertEqual(8, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 10)
self.assertEqual('three', events[0].event)
self.assertEqual('final', events[1].event)
self.assertEqual(2, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 11)
self.assertEqual(3, len(events))
self.assertEqual('one', events[0].event)
self.assertEqual('two', events[1].event)
self.assertEqual('three', events[2].event)
events = objects.NodeHistory.list_by_node_id(self.context, 12)
self.assertEqual(3, len(events))
self.assertEqual('one', events[0].event)
self.assertEqual('two', events[1].event)
self.assertEqual('three', events[2].event)
def test_history_is_pruned_to_config_two_pass(self):
for node in self.nodes:
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
conductor_utils.node_history_record(self.node1, event="final")
with mock.patch.object(manager.LOG, 'warning',
autospec=True) as mock_log:
self.service._manage_node_history(self.context)
# assert we logged a warning
self.assertEqual(1, mock_log.call_count)
events = objects.NodeHistory.list(self.context)
self.assertEqual(8, len(events))
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
self.assertEqual(6, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 10)
self.assertEqual(2, len(events))
self.assertEqual('three', events[0].event)
self.assertEqual('final', events[1].event)
events = objects.NodeHistory.list_by_node_id(self.context, 11)
self.assertEqual(2, len(events))
self.assertEqual('two', events[0].event)
self.assertEqual('three', events[1].event)
events = objects.NodeHistory.list_by_node_id(self.context, 12)
self.assertEqual(2, len(events))
self.assertEqual('two', events[0].event)
self.assertEqual('three', events[1].event)
def test_history_is_pruned_from_all_nodes_one_pass(self):
CONF.set_override('node_history_cleanup_batch_count', 15,
group='conductor')
for node in self.nodes:
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
self.assertEqual(6, len(events))
def test_history_pruning_no_work(self):
conductor_utils.node_history_record(self.node1, event='meow')
with mock.patch.object(self.dbapi,
'bulk_delete_node_history_records',
autospec=True) as mock_delete:
self.service._manage_node_history(self.context)
mock_delete.assert_not_called()
events = objects.NodeHistory.list(self.context)
self.assertEqual(1, len(events))
def test_history_pruning_not_other_conductor(self):
node = obj_utils.get_test_node(self.context,
driver='fake-hardware',
id=33,
uuid=uuidutils.generate_uuid(),
conductor_affinity=2)
# create node so it can be queried
node.create()
for i in range(0, 3):
conductor_utils.node_history_record(node, event='meow%s' % i)
with mock.patch.object(self.dbapi,
'bulk_delete_node_history_records',
autospec=True) as mock_delete:
self.service._manage_node_history(self.context)
mock_delete.assert_not_called()
events = objects.NodeHistory.list(self.context)
self.assertEqual(3, len(events))
self.assertEqual('meow0', events[0].event)
def test_history_is_pruned_to_config_with_days(self):
CONF.set_override('node_history_cleanup_batch_count', 15,
group='conductor')
CONF.set_override('node_history_minimum_days', 1,
group='conductor')
CONF.set_override('node_history_max_entries', 1, group='conductor')
old_date = datetime.datetime.now() - datetime.timedelta(days=7)
# Creates 18 entries
for node in self.nodes:
for event in ['oldone', 'oldtwo', 'oldthree']:
objects.NodeHistory(created_at=old_date,
event=event,
node_id=node.id).create()
for event in ['one', 'two', 'three']:
conductor_utils.node_history_record(node, event=event)
# 9 retained due to days, 3 to config
self.service._manage_node_history(self.context)
events = objects.NodeHistory.list(self.context)
self.assertEqual(12, len(events))
events = objects.NodeHistory.list_by_node_id(self.context, 10)
self.assertEqual(4, len(events))
self.assertEqual('oldthree', events[0].event)
self.assertEqual('one', events[1].event)
self.assertEqual('two', events[2].event)
self.assertEqual('three', events[3].event)
events = objects.NodeHistory.list_by_node_id(self.context, 11)
self.assertEqual(4, len(events))
self.assertEqual('oldthree', events[0].event)
self.assertEqual('one', events[1].event)
self.assertEqual('two', events[2].event)
self.assertEqual('three', events[3].event)
events = objects.NodeHistory.list_by_node_id(self.context, 12)
self.assertEqual(4, len(events))
self.assertEqual('oldthree', events[0].event)
self.assertEqual('one', events[1].event)
self.assertEqual('two', events[2].event)
self.assertEqual('three', events[3].event)

View File

@ -959,7 +959,7 @@ class NodeSoftPowerActionTestCase(db_base.DbTestCase):
self.assertIsNone(node['last_error'])
class DeployingErrorHandlerTestCase(tests_base.TestCase):
class DeployingErrorHandlerTestCase(db_base.DbTestCase):
def setUp(self):
super(DeployingErrorHandlerTestCase, self).setUp()
self.task = mock.Mock(spec=task_manager.TaskManager)
@ -972,6 +972,8 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.node.last_error = None
self.node.deploy_step = None
self.node.driver_internal_info = {}
self.node.id = obj_utils.create_test_node(self.context,
driver='fake-hardware').id
self.logmsg = "log message"
self.errmsg = "err message"
@ -1023,6 +1025,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.assertEqual({}, self.node.deploy_step)
self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
self.task.process_event.assert_called_once_with('fail')
self.assertIsNotNone(self.node.last_error)
def test_deploying_error_handler_cleanup_ironic_exception(self):
self._test_deploying_error_handler_cleanup(
@ -1058,7 +1061,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
self.task.process_event.assert_called_once_with('fail')
class ErrorHandlersTestCase(tests_base.TestCase):
class ErrorHandlersTestCase(db_base.DbTestCase):
def setUp(self):
super(ErrorHandlersTestCase, self).setUp()
self.task = mock.Mock(spec=task_manager.TaskManager)
@ -1070,9 +1073,13 @@ class ErrorHandlersTestCase(tests_base.TestCase):
# strict typing of the node power state fields and would fail if passed
# a Mock object in constructors. A task context is also required for
# notifications.
fake_node = obj_utils.create_test_node(self.context,
driver='fake-hardware')
self.node.configure_mock(power_state=states.POWER_OFF,
target_power_state=states.POWER_ON,
maintenance=False, maintenance_reason=None)
maintenance=False, maintenance_reason=None,
id=fake_node.id)
self.task.context = self.context
@mock.patch.object(conductor_utils, 'LOG', autospec=True)
@ -1401,6 +1408,7 @@ class ErrorHandlersTestCase(tests_base.TestCase):
'%(node)s, an exception was '
'encountered while aborting.',
{'node': self.node.uuid})
self.assertIsNotNone(self.node.last_error)
self.node.save.assert_called_once_with()
@mock.patch.object(conductor_utils.LOG, 'error', autospec=True)
@ -2562,3 +2570,60 @@ class GetConfigDriveImageTestCase(db_base.DbTestCase):
network_data=None,
user_data=b'{"user": "data"}',
vendor_data=None)
class NodeHistoryRecordTestCase(db_base.DbTestCase):
def setUp(self):
super(NodeHistoryRecordTestCase, self).setUp()
self.node = obj_utils.create_test_node(
self.context,
uuid=uuidutils.generate_uuid())
def test_record_node_history(self):
conductor_utils.node_history_record(self.node, event='meow')
entries = objects.NodeHistory.list_by_node_id(self.context,
self.node.id)
entry = entries[0]
self.assertEqual('meow', entry['event'])
self.assertEqual(CONF.host, entry['conductor'])
self.assertEqual('INFO', entry['severity'])
self.assertIsNone(entry['user'])
def test_record_node_history_with_user(self):
conductor_utils.node_history_record(self.node, event='meow',
user='peachesthecat')
entries = objects.NodeHistory.list_by_node_id(self.context,
self.node.id)
entry = entries[0]
self.assertEqual('meow', entry['event'])
self.assertEqual(CONF.host, entry['conductor'])
self.assertEqual('peachesthecat', entry['user'])
def test_record_node_history_with_error_severity(self):
conductor_utils.node_history_record(self.node, event='meowmeow',
error=True,
event_type='catwantfood')
entries = objects.NodeHistory.list_by_node_id(self.context,
self.node.id)
entry = entries[0]
self.assertEqual('meowmeow', entry['event'])
self.assertEqual(CONF.host, entry['conductor'])
self.assertEqual('ERROR', entry['severity'])
self.assertEqual('catwantfood', entry['event_type'])
@mock.patch.object(objects, 'NodeHistory', autospec=True)
def test_record_node_history_noop(self, mock_history):
CONF.set_override('node_history', False, group='conductor')
self.assertIsNone(conductor_utils.node_history_record(self.node))
mock_history.assert_not_called()
@mock.patch.object(objects, 'NodeHistory', autospec=True)
def test_record_node_history_disabled(self, mock_history):
mock_create = mock.Mock()
conductor_utils.node_history_record(self.node, event='meow',
error=True)
self.assertEqual('meow', self.node.last_error)
mock_history.create = mock_create
mock_history.assert_not_called()
mock_create.assert_not_called()

View File

@ -27,6 +27,7 @@ from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as conductor_utils
from ironic.conductor import verify
from ironic import objects
from ironic.tests.unit.conductor import mgr_utils
from ironic.tests.unit.db import base as db_base
from ironic.tests.unit.objects import utils as obj_utils
@ -138,6 +139,11 @@ class DoNodeVerifyTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
self.assertEqual(states.ENROLL, node.provision_state)
self.assertIsNone(node.target_provision_state)
self.assertTrue(node.last_error)
history = objects.NodeHistory.list_by_node_id(self.context,
node.id)
entry = history[0]
self.assertEqual('verify', entry['event_type'])
self.assertEqual('ERROR', entry['severity'])
@mock.patch.object(conductor_utils, 'LOG', autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakePower.validate',