Refactor async helper methods in conductor/manager.py
This patch performs multiple refactorings of conductor/manager.py:
- move a few class methods closer to where they are invoked,
to improve readability
- move and rename async helper methods out of the ConductorManager class:
_do_node_deploy, _do_node_tear_down,
_provisioning_error_handler, _power_state_error_handler,
_do_sync_power_state
- do_node_deploy now sets conductor affinity after error checking,
though this should have no functional impact
- Moving _do_sync_power_state out of the main class required
returning the count instead of updating the counter directly (see the
sketch after this list).
- Consolidated a LOG.warning and LOG.error message within
_do_sync_power_state into a single LOG.error. Both were being logged
within the same exception block.
- Refactored _do_sync_power_state to improve readability; in the
process of that refactoring, a bug in the unit tests was found:
fail_change=True was not properly tested.
- Corrected an issue where max_retries allowed one extra sync attempt.
A future patch should move these methods into conductor/utils.py.
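For context, the new counter contract between the periodic task and the
module-level helper looks roughly like the following minimal sketch. This
is not the Ironic code itself; do_sync_power_state_sketch, sync_ok and
record_sync are illustrative stand-ins, and only the caller-side
bookkeeping mirrors what _sync_power_states does in the diff below:

    import collections

    def do_sync_power_state_sketch(task, count, sync_ok):
        # Mirrors the refactored helper's contract: the per-node failure
        # count is passed in and returned, rather than read from and
        # written to the manager's dict directly.
        count += 1
        if sync_ok(task):
            return 0
        return count

    # Caller-side bookkeeping, as the periodic task does in the diff below.
    power_state_sync_count = collections.defaultdict(int)

    def record_sync(node_uuid, task, sync_ok):
        count = do_sync_power_state_sketch(
            task, power_state_sync_count[node_uuid], sync_ok)
        if count:
            power_state_sync_count[node_uuid] = count
        else:
            # don't bloat the dict with non-failing nodes
            del power_state_sync_count[node_uuid]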
Related-To: blueprint new-ironic-state-machine
Change-Id: I07497228cafab73df12253a5e652069a0a9918cb
@@ -254,6 +254,32 @@ class ConductorManager(periodic_task.PeriodicTasks):
        """Periodic tasks are run at pre-specified interval."""
        return self.run_periodic_tasks(context, raise_on_error=raise_on_error)

    @lockutils.synchronized(WORKER_SPAWN_lOCK, 'ironic-')
    def _spawn_worker(self, func, *args, **kwargs):

        """Create a greenthread to run func(*args, **kwargs).

        Spawns a greenthread if there are free slots in pool, otherwise raises
        exception. Execution control returns immediately to the caller.

        :returns: GreenThread object.
        :raises: NoFreeConductorWorker if worker pool is currently full.

        """
        if self._worker_pool.free():
            return self._worker_pool.spawn(func, *args, **kwargs)
        else:
            raise exception.NoFreeConductorWorker()

    def _conductor_service_record_keepalive(self):
        while not self._keepalive_evt.is_set():
            try:
                self.dbapi.touch_conductor(self.host)
            except db_exception.DBConnectionError:
                LOG.warning(_LW('Conductor could not connect to database '
                                'while heartbeating.'))
            self._keepalive_evt.wait(CONF.conductor.heartbeat_interval)

    @messaging.expected_exceptions(exception.InvalidParameterValue,
                                   exception.MissingParameterValue,
                                   exception.NodeLocked,
@@ -303,27 +329,6 @@ class ConductorManager(periodic_task.PeriodicTasks):
        return node_obj

    def _power_state_error_handler(self, e, node, power_state):
        """Set the node's power states if error occurs.

        This hook gets called upon an execption being raised when spawning
        the worker thread to change the power state of a node.

        :param e: the exception object that was raised.
        :param node: an Ironic node object.
        :param power_state: the power state to set on the node.

        """
        if isinstance(e, exception.NoFreeConductorWorker):
            node.power_state = power_state
            node.target_power_state = states.NOSTATE
            node.last_error = (_("No free conductor workers available"))
            node.save()
            LOG.warning(_LW("No free conductor workers available to perform "
                            "an action on node %(node)s, setting node's "
                            "power state back to %(power_state)s."),
                        {'node': node.uuid, 'power_state': power_state})

    @messaging.expected_exceptions(exception.InvalidParameterValue,
                                   exception.MissingParameterValue,
                                   exception.NoFreeConductorWorker,
@@ -359,7 +364,7 @@ class ConductorManager(periodic_task.PeriodicTasks):
            task.node.target_power_state = new_state
            task.node.last_error = None
            task.node.save()
            task.set_spawn_error_hook(self._power_state_error_handler,
            task.set_spawn_error_hook(power_state_error_handler,
                                      task.node, task.node.power_state)
            task.spawn_after(self._spawn_worker, utils.node_power_action,
                             task, new_state)
@@ -548,13 +553,6 @@ class ConductorManager(periodic_task.PeriodicTasks):
        return (ret, is_async)

    def _get_vendor_passthru_metadata(self, route_dict):
        d = {}
        for method, metadata in route_dict.iteritems():
            # 'func' is the vendor method reference, ignore it
            d[method] = {k: metadata[k] for k in metadata if k != 'func'}
        return d

    @messaging.expected_exceptions(exception.UnsupportedDriverExtension)
    def get_node_vendor_passthru_methods(self, context, node_id):
        """Retrieve information about vendor methods of the given node.
@@ -572,8 +570,8 @@ class ConductorManager(periodic_task.PeriodicTasks):
                    driver=task.node.driver,
                    extension='vendor interface')

            return self._get_vendor_passthru_metadata(
                task.driver.vendor.vendor_routes)
            return get_vendor_passthru_metadata(
                task.driver.vendor.vendor_routes)

    @messaging.expected_exceptions(exception.UnsupportedDriverExtension,
                                   exception.DriverNotFound)
@@ -595,36 +593,7 @@ class ConductorManager(periodic_task.PeriodicTasks):
                    driver=driver_name,
                    extension='vendor interface')

        return self._get_vendor_passthru_metadata(driver.vendor.driver_routes)

    def _provisioning_error_handler(self, e, node, provision_state,
                                    target_provision_state):
        """Set the node's provisioning states if error occurs.

        This hook gets called upon an exception being raised when spawning
        the worker to do the deployment or tear down of a node.

        :param e: the exception object that was raised.
        :param node: an Ironic node object.
        :param provision_state: the provision state to be set on
                                the node.
        :param target_provision_state: the target provision state to be
                                       set on the node.

        """
        if isinstance(e, exception.NoFreeConductorWorker):
            # NOTE(deva): there is no need to clear conductor_affinity
            # because it isn't updated on a failed deploy
            node.provision_state = provision_state
            node.target_provision_state = target_provision_state
            node.last_error = (_("No free conductor workers available"))
            node.save()
            LOG.warning(_LW("No free conductor workers available to perform "
                            "an action on node %(node)s, setting node's "
                            "provision_state back to %(prov_state)s and "
                            "target_provision_state to %(tgt_prov_state)s."),
                        {'node': node.uuid, 'prov_state': provision_state,
                         'tgt_prov_state': target_provision_state})
        return get_vendor_passthru_metadata(driver.vendor.driver_routes)

    @messaging.expected_exceptions(exception.NoFreeConductorWorker,
                                   exception.NodeLocked,
@@ -694,45 +663,11 @@ class ConductorManager(periodic_task.PeriodicTasks):
                    'node': node.uuid,
                    'state': node.provision_state})
            else:
                task.set_spawn_error_hook(self._provisioning_error_handler,
                task.set_spawn_error_hook(provisioning_error_handler,
                                          node, previous_prov_state,
                                          previous_tgt_provision_state)
                task.spawn_after(self._spawn_worker,
                                 self._do_node_deploy, task)

    def _do_node_deploy(self, task):
        """Prepare the environment and deploy a node."""
        node = task.node
        try:
            task.driver.deploy.prepare(task)
            new_state = task.driver.deploy.deploy(task)

            # Update conductor_affinity to reference this conductor's ID
            # since there may be local persistent state
            node.conductor_affinity = self.conductor.id
        except Exception as e:
            # NOTE(deva): there is no need to clear conductor_affinity
            with excutils.save_and_reraise_exception():
                task.process_event('fail')
                LOG.warning(_LW('Error in deploy of node %(node)s: %(err)s'),
                            {'node': task.node.uuid, 'err': e})
                node.last_error = _("Failed to deploy. Error: %s") % e
        else:
            # NOTE(deva): Some drivers may return states.DEPLOYWAIT
            # eg. if they are waiting for a callback
            if new_state == states.DEPLOYDONE:
                task.process_event('done')
                LOG.info(_LI('Successfully deployed node %(node)s with '
                             'instance %(instance)s.'),
                         {'node': node.uuid, 'instance': node.instance_uuid})
            elif new_state == states.DEPLOYWAIT:
                task.process_event('wait')
            else:
                LOG.error(_LE('Unexpected state %(state)s returned while '
                              'deploying node %(node)s.'),
                          {'state': new_state, 'node': node.uuid})
        finally:
            node.save()
                                 do_node_deploy, task, self.conductor.id)

    @messaging.expected_exceptions(exception.NoFreeConductorWorker,
                                   exception.NodeLocked,
@@ -783,153 +718,11 @@ class ConductorManager(periodic_task.PeriodicTasks):
                    "not allowed for node %(node)s in state %(state)s")
                    % {'node': node_id, 'state': node.provision_state})
            else:
                task.set_spawn_error_hook(self._provisioning_error_handler,
                task.set_spawn_error_hook(provisioning_error_handler,
                                          node, previous_prov_state,
                                          previous_tgt_provision_state)
                task.spawn_after(self._spawn_worker,
                                 self._do_node_tear_down, task)

    def _do_node_tear_down(self, task):
        """Internal RPC method to tear down an existing node deployment."""
        node = task.node
        try:
            task.driver.deploy.clean_up(task)
            task.driver.deploy.tear_down(task)
        except Exception as e:
            with excutils.save_and_reraise_exception():
                LOG.warning(_LW('Error in tear_down of node %(node)s: '
                                '%(err)s'),
                            {'node': task.node.uuid, 'err': e})
                node.last_error = _("Failed to tear down. Error: %s") % e
                task.process_event('error')
        else:
            # NOTE(deva): When tear_down finishes, the deletion is done
            task.process_event('done')
            LOG.info(_LI('Successfully unprovisioned node %(node)s with '
                         'instance %(instance)s.'),
                     {'node': node.uuid, 'instance': node.instance_uuid})
            # NOTE(deva): Currently, NOSTATE is represented as None
            # However, FSM class treats a target_state of None as
            # the lack of a target state -- not a target of NOSTATE
            # Thus, until we migrate to an explicit AVAILABLE state
            # we need to clear the target_state here manually.
            node.target_provision_state = None
        finally:
            # NOTE(deva): there is no need to unset conductor_affinity
            # because it is a reference to the most recent conductor which
            # deployed a node, and does not limit any future actions.
            # But we do need to clear the instance_info
            node.instance_info = {}
            node.save()

    def _conductor_service_record_keepalive(self):
        while not self._keepalive_evt.is_set():
            try:
                self.dbapi.touch_conductor(self.host)
            except db_exception.DBConnectionError:
                LOG.warning(_LW('Conductor could not connect to database '
                                'while heartbeating.'))
            self._keepalive_evt.wait(CONF.conductor.heartbeat_interval)

    def _handle_sync_power_state_max_retries_exceeded(self, task,
                                                      actual_power_state):
        node = task.node
        msg = (_("During sync_power_state, max retries exceeded "
                 "for node %(node)s, node state %(actual)s "
                 "does not match expected state '%(state)s'. "
                 "Updating DB state to '%(actual)s' "
                 "Switching node to maintenance mode.") %
               {'node': node.uuid, 'actual': actual_power_state,
                'state': node.power_state})
        node.power_state = actual_power_state
        node.last_error = msg
        node.maintenance = True
        node.maintenance_reason = msg
        node.save()
        LOG.error(msg)

    def _do_sync_power_state(self, task):
        node = task.node
        power_state = None

        # Power driver info should be set properly for new node, otherwise
        # prevent node from switching to maintenance mode.
        if node.power_state is None:
            try:
                task.driver.power.validate(task)
            except (exception.InvalidParameterValue,
                    exception.MissingParameterValue):
                return

        try:
            power_state = task.driver.power.get_power_state(task)
            if power_state == states.ERROR:
                raise exception.PowerStateFailure(_("Driver returns ERROR"
                                                    " state."))
        except Exception as e:
            LOG.warning(_LW("During sync_power_state, could not get power "
                            "state for node %(node)s. Error: %(err)s."),
                        {'node': node.uuid, 'err': e})
            self.power_state_sync_count[node.uuid] += 1

            if (self.power_state_sync_count[node.uuid] >=
                    CONF.conductor.power_state_sync_max_retries):
                self._handle_sync_power_state_max_retries_exceeded(task,
                                                                   power_state)
            return

        if node.power_state is None:
            LOG.info(_LI("During sync_power_state, node %(node)s has no "
                         "previous known state. Recording current state "
                         "'%(state)s'."),
                     {'node': node.uuid, 'state': power_state})
            node.power_state = power_state
            node.save()

        if power_state == node.power_state:
            if node.uuid in self.power_state_sync_count:
                del self.power_state_sync_count[node.uuid]
            return

        if not CONF.conductor.force_power_state_during_sync:
            LOG.warning(_LW("During sync_power_state, node %(node)s state "
                            "does not match expected state '%(state)s'. "
                            "Updating recorded state to '%(actual)s'."),
                        {'node': node.uuid, 'actual': power_state,
                         'state': node.power_state})
            node.power_state = power_state
            node.save()
            return

        if (self.power_state_sync_count[node.uuid] >=
                CONF.conductor.power_state_sync_max_retries):
            self._handle_sync_power_state_max_retries_exceeded(task,
                                                               power_state)
            return

        # Force actual power_state of node equal to DB power_state of node
        LOG.warning(_LW("During sync_power_state, node %(node)s state "
                        "'%(actual)s' does not match expected state. "
                        "Changing hardware state to '%(state)s'."),
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        try:
            # node_power_action will update the node record
            # so don't do that again here.
            utils.node_power_action(task, node.power_state)
        except Exception as e:
            LOG.error(_LE("Failed to change power state of node %(node)s "
                          "to '%(state)s'."), {'node': node.uuid,
                                               'state': node.power_state})
            attempts_left = (CONF.conductor.power_state_sync_max_retries -
                             self.power_state_sync_count[node.uuid]) - 1
            LOG.warning(_LW("%(left)s attempts remaining to "
                            "sync_power_state for node %(node)s"),
                        {'left': attempts_left,
                         'node': node.uuid})
        finally:
            # Update power state sync count for current node
            self.power_state_sync_count[node.uuid] += 1
                                 do_node_tear_down, task)

    @periodic_task.periodic_task(
        spacing=CONF.conductor.sync_power_state_interval)
@@ -971,14 +764,27 @@ class ConductorManager(periodic_task.PeriodicTasks):
            try:
                if not self._mapped_to_this_conductor(node_uuid, driver):
                    continue
                # NOTE(deva): we should not acquire a lock on a node in
                # DEPLOYWAIT, as this could cause an error within
                # a deploy ramdisk POSTing back at the same time.
                # TODO(deva): refactor this check, because it needs to be done
                # in every periodic task, not just this one.
                node = objects.Node.get_by_id(context, node_id)
                if (node.provision_state == states.DEPLOYWAIT or
                        node.maintenance or node.reservation is not None):
                    continue

                with task_manager.acquire(context, node_id) as task:
                    if (task.node.provision_state != states.DEPLOYWAIT and
                            not task.node.maintenance):
                        self._do_sync_power_state(task)
                    if (task.node.provision_state == states.DEPLOYWAIT or
                            task.node.maintenance):
                        continue
                    count = do_sync_power_state(
                        task, self.power_state_sync_count[node_uuid])
                    if count:
                        self.power_state_sync_count[node_uuid] = count
                    else:
                        # don't bloat the dict with non-failing nodes
                        del self.power_state_sync_count[node_uuid]
            except exception.NodeNotFound:
                LOG.info(_LI("During sync_power_state, node %(node)s was not "
                             "found and presumed deleted by another process."),
@@ -1151,23 +957,6 @@ class ConductorManager(periodic_task.PeriodicTasks):
                ret_dict[iface_name]['reason'] = reason
        return ret_dict

    @lockutils.synchronized(WORKER_SPAWN_lOCK, 'ironic-')
    def _spawn_worker(self, func, *args, **kwargs):

        """Create a greenthread to run func(*args, **kwargs).

        Spawns a greenthread if there are free slots in pool, otherwise raises
        exception. Execution control returns immediately to the caller.

        :returns: GreenThread object.
        :raises: NoFreeConductorWorker if worker pool is currently full.

        """
        if self._worker_pool.free():
            return self._worker_pool.spawn(func, *args, **kwargs)
        else:
            raise exception.NoFreeConductorWorker()

    @messaging.expected_exceptions(exception.NodeLocked,
                                   exception.NodeAssociated,
                                   exception.NodeInWrongPowerState)
@@ -1511,3 +1300,241 @@ class ConductorManager(periodic_task.PeriodicTasks):
                raise exception.UnsupportedDriverExtension(
                    driver=task.node.driver, extension='management')
            return task.driver.management.get_supported_boot_devices()


def get_vendor_passthru_metadata(route_dict):
    d = {}
    for method, metadata in route_dict.iteritems():
        # 'func' is the vendor method reference, ignore it
        d[method] = {k: metadata[k] for k in metadata if k != 'func'}
    return d


def power_state_error_handler(e, node, power_state):
    """Set the node's power states if error occurs.

    This hook gets called upon an execption being raised when spawning
    the worker thread to change the power state of a node.

    :param e: the exception object that was raised.
    :param node: an Ironic node object.
    :param power_state: the power state to set on the node.

    """
    if isinstance(e, exception.NoFreeConductorWorker):
        node.power_state = power_state
        node.target_power_state = states.NOSTATE
        node.last_error = (_("No free conductor workers available"))
        node.save()
        LOG.warning(_LW("No free conductor workers available to perform "
                        "an action on node %(node)s, setting node's "
                        "power state back to %(power_state)s."),
                    {'node': node.uuid, 'power_state': power_state})


def provisioning_error_handler(e, node, provision_state,
                               target_provision_state):
    """Set the node's provisioning states if error occurs.

    This hook gets called upon an exception being raised when spawning
    the worker to do the deployment or tear down of a node.

    :param e: the exception object that was raised.
    :param node: an Ironic node object.
    :param provision_state: the provision state to be set on
                            the node.
    :param target_provision_state: the target provision state to be
                                   set on the node.

    """
    if isinstance(e, exception.NoFreeConductorWorker):
        # NOTE(deva): there is no need to clear conductor_affinity
        # because it isn't updated on a failed deploy
        node.provision_state = provision_state
        node.target_provision_state = target_provision_state
        node.last_error = (_("No free conductor workers available"))
        node.save()
        LOG.warning(_LW("No free conductor workers available to perform "
                        "an action on node %(node)s, setting node's "
                        "provision_state back to %(prov_state)s and "
                        "target_provision_state to %(tgt_prov_state)s."),
                    {'node': node.uuid, 'prov_state': provision_state,
                     'tgt_prov_state': target_provision_state})


def do_node_deploy(task, conductor_id):
    """Prepare the environment and deploy a node."""
    node = task.node
    try:
        task.driver.deploy.prepare(task)
        new_state = task.driver.deploy.deploy(task)
    except Exception as e:
        # NOTE(deva): there is no need to clear conductor_affinity
        with excutils.save_and_reraise_exception():
            task.process_event('fail')
            LOG.warning(_LW('Error in deploy of node %(node)s: %(err)s'),
                        {'node': task.node.uuid, 'err': e})
            node.last_error = _("Failed to deploy. Error: %s") % e
    else:
        # Update conductor_affinity to reference this conductor's ID
        # since there may be local persistent state
        node.conductor_affinity = conductor_id

        # NOTE(deva): Some drivers may return states.DEPLOYWAIT
        # eg. if they are waiting for a callback
        if new_state == states.DEPLOYDONE:
            task.process_event('done')
            LOG.info(_LI('Successfully deployed node %(node)s with '
                         'instance %(instance)s.'),
                     {'node': node.uuid, 'instance': node.instance_uuid})
        elif new_state == states.DEPLOYWAIT:
            task.process_event('wait')
        else:
            LOG.error(_LE('Unexpected state %(state)s returned while '
                          'deploying node %(node)s.'),
                      {'state': new_state, 'node': node.uuid})
    finally:
        node.save()


def do_node_tear_down(task):
    """Internal RPC method to tear down an existing node deployment."""
    node = task.node
    try:
        task.driver.deploy.clean_up(task)
        task.driver.deploy.tear_down(task)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            LOG.warning(_LW('Error in tear_down of node %(node)s: '
                            '%(err)s'),
                        {'node': task.node.uuid, 'err': e})
            node.last_error = _("Failed to tear down. Error: %s") % e
            task.process_event('error')
    else:
        # NOTE(deva): When tear_down finishes, the deletion is done
        task.process_event('done')
        LOG.info(_LI('Successfully unprovisioned node %(node)s with '
                     'instance %(instance)s.'),
                 {'node': node.uuid, 'instance': node.instance_uuid})
        # NOTE(deva): Currently, NOSTATE is represented as None
        # However, FSM class treats a target_state of None as
        # the lack of a target state -- not a target of NOSTATE
        # Thus, until we migrate to an explicit AVAILABLE state
        # we need to clear the target_state here manually.
        node.target_provision_state = None
    finally:
        # NOTE(deva): there is no need to unset conductor_affinity
        # because it is a reference to the most recent conductor which
        # deployed a node, and does not limit any future actions.
        # But we do need to clear the instance_info
        node.instance_info = {}
        node.save()


def handle_sync_power_state_max_retries_exceeded(task,
                                                 actual_power_state):
    node = task.node
    msg = (_("During sync_power_state, max retries exceeded "
             "for node %(node)s, node state %(actual)s "
             "does not match expected state '%(state)s'. "
             "Updating DB state to '%(actual)s' "
             "Switching node to maintenance mode.") %
           {'node': node.uuid, 'actual': actual_power_state,
            'state': node.power_state})
    node.power_state = actual_power_state
    node.last_error = msg
    node.maintenance = True
    node.maintenance_reason = msg
    node.save()
    LOG.error(msg)


def do_sync_power_state(task, count):
    """Sync the power state for this node, incrementing the counter on failure.

    When the limit of power_state_sync_max_retries is reached, the node is put
    into maintenance mode and the error recorded.

    :param task: a TaskManager instance with an exclusive lock
    :param count: number of times this node has previously failed a sync
    :returns: Count of failed attempts.
              On success, the counter is set to 0.
              On failure, the count is incremented by one
    """
    node = task.node
    power_state = None
    count += 1

    # If power driver info can not be validated, and node has no prior state,
    # do not attempt to sync the node's power state.
    if node.power_state is None:
        try:
            task.driver.power.validate(task)
        except (exception.InvalidParameterValue,
                exception.MissingParameterValue):
            return 0

    try:
        # The driver may raise an exception, or may return ERROR.
        # Handle both the same way.
        power_state = task.driver.power.get_power_state(task)
        if power_state == states.ERROR:
            raise exception.PowerStateFailure(
                _("Power driver returned ERROR state "
                  "while trying to sync power state."))
    except Exception as e:
        # Stop if any exception is raised when getting the power state
        LOG.warning(_LW("During sync_power_state, could not get power "
                        "state for node %(node)s. Error: %(err)s."),
                    {'node': node.uuid, 'err': e})
        if count > CONF.conductor.power_state_sync_max_retries:
            handle_sync_power_state_max_retries_exceeded(task, power_state)
        return count
    else:
        # If node has no prior state AND we successfully got a state,
        # simply record that.
        if node.power_state is None:
            LOG.info(_LI("During sync_power_state, node %(node)s has no "
                         "previous known state. Recording current state "
                         "'%(state)s'."),
                     {'node': node.uuid, 'state': power_state})
            node.power_state = power_state
            node.save()

    # If the node is now in the expected state, reset the counter
    # otherwise, if we've exceeded the retry limit, stop here
    if node.power_state == power_state:
        return 0
    else:
        if count > CONF.conductor.power_state_sync_max_retries:
            handle_sync_power_state_max_retries_exceeded(task, power_state)
            return count

    if CONF.conductor.force_power_state_during_sync:
        LOG.warning(_LW("During sync_power_state, node %(node)s state "
                        "'%(actual)s' does not match expected state. "
                        "Changing hardware state to '%(state)s'."),
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        try:
            # node_power_action will update the node record
            # so don't do that again here.
            utils.node_power_action(task, node.power_state)
        except Exception as e:
            attempts_left = (CONF.conductor.power_state_sync_max_retries -
                             count)
            LOG.error(_LE("Failed to change power state of node %(node)s "
                          "to '%(state)s'. Attempts left: %(left)s."),
                      {'node': node.uuid,
                       'state': node.power_state,
                       'left': attempts_left})
    else:
        LOG.warning(_LW("During sync_power_state, node %(node)s state "
                        "does not match expected state '%(state)s'. "
                        "Updating recorded state to '%(actual)s'."),
                    {'node': node.uuid, 'actual': power_state,
                     'state': node.power_state})
        node.power_state = power_state
        node.save()

    return count
@@ -922,6 +922,7 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.deploy')
    def test__do_node_deploy_driver_raises_error(self, mock_deploy):
        self._start_service()
        # test when driver.deploy.deploy raises an exception
        mock_deploy.side_effect = exception.InstanceDeployFailure('test')
        node = obj_utils.create_test_node(self.context, driver='fake',
@@ -929,7 +930,8 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
        task = task_manager.TaskManager(self.context, node.uuid)

        self.assertRaises(exception.InstanceDeployFailure,
                          self.service._do_node_deploy, task)
                          manager.do_node_deploy, task,
                          self.service.conductor.id)
        node.refresh()
        self.assertEqual(states.DEPLOYFAIL, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
@@ -945,7 +947,8 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
                                          provision_state=states.DEPLOYING)
        task = task_manager.TaskManager(self.context, node.uuid)

        self.service._do_node_deploy(task)
        manager.do_node_deploy(task,
                               self.service.conductor.id)
        node.refresh()
        self.assertEqual(states.ACTIVE, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
@@ -970,7 +973,7 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
        self.assertIsNone(node.last_error)
        # Verify reservation has been cleared.
        self.assertIsNone(node.reservation)
        mock_spawn.assert_called_once_with(mock.ANY, mock.ANY)
        mock_spawn.assert_called_once_with(mock.ANY, mock.ANY, mock.ANY)

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.deploy')
    def test_do_node_deploy_rebuild_active_state(self, mock_deploy):
@@ -1099,7 +1102,7 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
        self._start_service()
        mock_tear_down.side_effect = exception.InstanceDeployFailure('test')
        self.assertRaises(exception.InstanceDeployFailure,
                          self.service._do_node_tear_down, task)
                          manager.do_node_tear_down, task)
        node.refresh()
        self.assertEqual(states.ERROR, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
@@ -1118,7 +1121,7 @@ class DoNodeDeployTearDownTestCase(_ServiceSetUpMixin,
        task = task_manager.TaskManager(self.context, node.uuid)
        self._start_service()
        mock_tear_down.return_value = states.DELETED
        self.service._do_node_tear_down(task)
        manager.do_node_tear_down(task)
        node.refresh()
        self.assertEqual(states.NOSTATE, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
@@ -1698,22 +1701,22 @@ class ManagerDoSyncPowerStateTestCase(tests_db_base.DbTestCase):
        self.config(force_power_state_during_sync=False, group='conductor')

    def _do_sync_power_state(self, old_power_state, new_power_states,
                             fail_validate=False, fail_change=False):
                             fail_validate=False):
        self.node.power_state = old_power_state
        if not isinstance(new_power_states, (list, tuple)):
            new_power_states = [new_power_states]
        if fail_validate:
            exc = exception.InvalidParameterValue('error')
            self.power.validate.side_effect = exc
        if fail_change:
            exc = exception.IronicException('test')
            self.power.node_power_action.side_effect = exc
        for new_power_state in new_power_states:
            self.node.power_state = old_power_state
            if isinstance(new_power_state, Exception):
                self.power.get_power_state.side_effect = new_power_state
            else:
                self.power.get_power_state.return_value = new_power_state
            self.service._do_sync_power_state(self.task)
            count = manager.do_sync_power_state(self.task,
                self.service.power_state_sync_count[self.node.uuid])
            self.service.power_state_sync_count[self.node.uuid] = count

    def test_state_unchanged(self, node_power_action):
        self._do_sync_power_state('fake-power', 'fake-power')
@@ -1789,8 +1792,8 @@ class ManagerDoSyncPowerStateTestCase(tests_db_base.DbTestCase):
    def test_state_changed_sync_failed(self, node_power_action):
        self.config(force_power_state_during_sync=True, group='conductor')

        self._do_sync_power_state(states.POWER_ON, states.POWER_OFF,
                                  fail_change=True)
        node_power_action.side_effect = exception.IronicException('test')
        self._do_sync_power_state(states.POWER_ON, states.POWER_OFF)

        # Just testing that this test doesn't raise.
        self.assertFalse(self.power.validate.called)
@@ -1815,7 +1818,7 @@ class ManagerDoSyncPowerStateTestCase(tests_db_base.DbTestCase):
        self.node.save.assert_called_once_with()
        node_power_action.assert_called_once_with(self.task, states.POWER_ON)
        self.assertEqual(states.POWER_OFF, self.node.power_state)
        self.assertEqual(1,
        self.assertEqual(2,
                         self.service.power_state_sync_count[self.node.uuid])
        self.assertTrue(self.node.maintenance)
        self.assertIsNotNone(self.node.maintenance_reason)
@@ -1836,7 +1839,7 @@ class ManagerDoSyncPowerStateTestCase(tests_db_base.DbTestCase):
        npa_exp_calls = [mock.call(self.task, states.POWER_ON)] * 2
        self.assertEqual(npa_exp_calls, node_power_action.call_args_list)
        self.assertEqual(states.POWER_OFF, self.node.power_state)
        self.assertEqual(2,
        self.assertEqual(3,
                         self.service.power_state_sync_count[self.node.uuid])
        self.assertTrue(self.node.maintenance)
@@ -1856,16 +1859,17 @@ class ManagerDoSyncPowerStateTestCase(tests_db_base.DbTestCase):
        npa_exp_calls = [mock.call(self.task, states.POWER_ON)] * 2
        self.assertEqual(npa_exp_calls, node_power_action.call_args_list)
        self.assertEqual(states.POWER_ON, self.node.power_state)
        self.assertNotIn(self.node.uuid, self.service.power_state_sync_count)
        self.assertEqual(0,
                         self.service.power_state_sync_count[self.node.uuid])

    def test_power_state_sync_max_retries_gps_exception(self,
                                                        node_power_action):
        self.config(power_state_sync_max_retries=2, group='conductor')
        self.service.power_state_sync_count[self.node.uuid] = 2

        node_power_action.side_effect = exception.IronicException('test')
        self._do_sync_power_state('fake',
                                  exception.IronicException('foo'),
                                  fail_change=True)
                                  exception.IronicException('foo'))

        self.assertFalse(self.power.validate.called)
        self.power.get_power_state.assert_called_once_with(self.task)
@@ -1877,7 +1881,7 @@ class ManagerDoSyncPowerStateTestCase(tests_db_base.DbTestCase):
        self.assertFalse(node_power_action.called)


@mock.patch.object(manager.ConductorManager, '_do_sync_power_state')
@mock.patch.object(manager, 'do_sync_power_state')
@mock.patch.object(task_manager, 'acquire')
@mock.patch.object(manager.ConductorManager, '_mapped_to_this_conductor')
@mock.patch.object(objects.Node, 'get_by_id')
@@ -2066,7 +2070,7 @@ class ManagerSyncPowerStatesTestCase(_CommonMixIn, tests_db_base.DbTestCase):
                                            self.node.driver)
        get_node_mock.assert_called_once_with(self.context, self.node.id)
        acquire_mock.assert_called_once_with(self.context, self.node.id)
        sync_mock.assert_called_once_with(task)
        sync_mock.assert_called_once_with(task, mock.ANY)

    def test__sync_power_state_multiple_nodes(self, get_nodeinfo_mock,
                                              get_node_mock, mapped_mock,
@@ -2139,7 +2143,8 @@ class ManagerSyncPowerStatesTestCase(_CommonMixIn, tests_db_base.DbTestCase):
        acquire_calls = [mock.call(self.context, x.id)
                         for x in nodes[:1] + nodes[6:]]
        self.assertEqual(acquire_calls, acquire_mock.call_args_list)
        sync_calls = [mock.call(tasks[0]), mock.call(tasks[5])]
        sync_calls = [mock.call(tasks[0], mock.ANY),
                      mock.call(tasks[5], mock.ANY)]
        self.assertEqual(sync_calls, sync_mock.call_args_list)