Improve IPA logging and exception handling

Debugging the agent is a huge pain point. Tracebacks are rarely logged,
error messages are often only returned via the API, and lack of
info logging makes it hard to determine where some failures occur.
Some errors only return a 500 with no error message or logs.

Change-Id: I0a127de6e4abf62e20d5c5ad583ba46738604d2d
This commit is contained in:
Josh Gachnang 2015-06-10 10:46:39 -07:00
parent c641371ace
commit 59281ecda8
5 changed files with 67 additions and 15 deletions
ironic_python_agent

@ -26,12 +26,12 @@ class ExceptionType(wtypes.UserType):
return value
def tobasetype(self, value):
"""Turn a RESTError into a dict."""
"""Turn an Exception into a dict."""
return {
'type': value.__class__.__name__,
'code': value.status_code,
'message': value.message,
'details': value.details,
'code': getattr(value, 'status_code', 500),
'message': str(value),
'details': getattr(value, 'details', ''),
}
frombasetype = tobasetype

@ -24,6 +24,8 @@ from ironic_python_agent import encoding
from ironic_python_agent import errors
from ironic_python_agent import utils
LOG = log.getLogger()
class AgentCommandStatus(object):
"""Mapping of agent command statuses."""
@ -150,7 +152,8 @@ class AsyncCommandResult(BaseCommandResult):
if isinstance(result, (bytes, six.text_type)):
result = {'result': '{}: {}'.format(self.command_name, result)}
LOG.info('Command: %(name)s, result: %(result)s',
{'name': self.command_name, 'result': result})
with self.command_state_lock:
self.command_result = result
self.command_status = AgentCommandStatus.SUCCEEDED
@ -159,7 +162,11 @@ class AsyncCommandResult(BaseCommandResult):
self.command_error = e
self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH
self.command_result = None
LOG.info('Clean version mismatch for command %s',
self.command_name)
except Exception as e:
LOG.exception('Command failed: %(name)s, error: %(err)s',
{'name': self.command_name, 'err': e})
if not isinstance(e, errors.RESTError):
e = errors.CommandExecutionError(str(e))
@ -220,11 +227,16 @@ class ExecuteCommandMixin(object):
def execute_command(self, command_name, **kwargs):
"""Execute an agent command."""
with self.command_lock:
LOG.info('Executing command: %(name)s with args: %(args)s',
{'name': command_name, 'args': kwargs})
extension_part, command_part = self.split_command(command_name)
if len(self.command_results) > 0:
last_command = list(self.command_results.values())[-1]
if not last_command.is_done():
LOG.info('Tried to execute %(command)s, agent is still '
'executing %(last)s', {'command': command_name,
'last': last_command})
raise errors.CommandExecutionError('agent is busy')
try:
@ -232,17 +244,21 @@ class ExecuteCommandMixin(object):
result = ext.execute(command_part, **kwargs)
except KeyError:
# Extension Not found
LOG.exception('Extension %s not found', extension_part)
raise errors.RequestedObjectNotFoundError('Extension',
extension_part)
except errors.InvalidContentError as e:
# Any command may raise a InvalidContentError which will be
# returned to the caller directly.
LOG.exception('Invalid content error: %s', e)
raise e
except Exception as e:
# Other errors are considered command execution errors, and are
# recorded as an
# recorded as a failed SyncCommandResult with an error message
LOG.exception('Command execution error: %s', e)
result = SyncCommandResult(command_name, kwargs, False, e)
LOG.info('Command %(name)s completed: %(result)s',
{'name': command_name, 'result': result})
self.command_results[result.id] = result
return result

@ -12,10 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from oslo_log import log
from ironic_python_agent import errors
from ironic_python_agent.extensions import base
from ironic_python_agent import hardware
LOG = log.getLogger()
class CleanExtension(base.BaseAgentExtension):
@base.sync_command('get_clean_steps')
@ -28,10 +32,13 @@ class CleanExtension(base.BaseAgentExtension):
:returns: A list of clean steps with keys step, priority, and
reboot_requested
"""
LOG.debug('Getting clean steps, called with node: %(node)s, '
'ports: %(ports)s', {'node': node, 'ports': ports})
# Results should be a dict, not a list
steps = hardware.dispatch_to_all_managers('get_clean_steps',
node, ports)
LOG.debug('Returning clean steps: %s', steps)
return {
'clean_steps': steps,
'hardware_manager_version': _get_current_clean_version()
@ -52,17 +59,23 @@ class CleanExtension(base.BaseAgentExtension):
the step returns.
"""
# Ensure the agent is still the same version, or raise an exception
LOG.info('Executing clean step %s', step)
_check_clean_version(clean_version)
if 'step' not in step:
raise ValueError('Malformed clean_step, no "step" key: %s'.format(
step))
msg = 'Malformed clean_step, no "step" key: %s' % step
LOG.error(msg)
raise ValueError(msg)
try:
result = hardware.dispatch_to_managers(step['step'], node, ports)
except Exception as e:
raise errors.CleaningError(
'Error performing clean_step %(step)s: %(err)s' %
{'step': step['step'], 'err': e})
msg = ('Error performing clean_step %(step)s: %(err)s' %
{'step': step['step'], 'err': e})
LOG.exception(msg)
raise errors.CleaningError(msg)
LOG.info('Clean step completed: %(step)s, result: %(result)s',
{'step': step, 'result': result})
# Return the step that was executed so we can dispatch
# to the appropriate Ironic interface
return {
@ -78,6 +91,9 @@ def _check_clean_version(clean_version=None):
return
agent_version = _get_current_clean_version()
if clean_version != agent_version:
LOG.warning('Mismatched clean versions. Agent version: %(agent), '
'node version: %(node)', {'agent': agent_version,
'node': clean_version})
raise errors.CleanVersionMismatch(agent_version=agent_version,
node_version=clean_version)

@ -207,26 +207,34 @@ class StandbyExtension(base.BaseAgentExtension):
@base.async_command('cache_image', _validate_image_info)
def cache_image(self, image_info=None, force=False):
LOG.debug('Caching image %s', image_info['id'])
device = hardware.dispatch_to_managers('get_os_install_device')
result_msg = 'image ({0}) already present on device {1}'
if self.cached_image_id != image_info['id'] or force:
LOG.debug('Already had %s cached, overwriting',
self.cached_image_id)
_download_image(image_info)
_write_image(image_info, device)
self.cached_image_id = image_info['id']
result_msg = 'image ({0}) cached to device {1}'
return result_msg.format(image_info['id'], device)
msg = result_msg.format(image_info['id'], device)
LOG.info(msg)
return msg
@base.async_command('prepare_image', _validate_image_info)
def prepare_image(self,
image_info=None,
configdrive=None):
LOG.debug('Preparing image %s', image_info['id'])
device = hardware.dispatch_to_managers('get_os_install_device')
# don't write image again if already cached
if self.cached_image_id != image_info['id']:
LOG.debug('Already had %s cached, overwriting',
self.cached_image_id)
_download_image(image_info)
_write_image(image_info, device)
self.cached_image_id = image_info['id']
@ -234,8 +242,10 @@ class StandbyExtension(base.BaseAgentExtension):
if configdrive is not None:
_write_configdrive_to_partition(configdrive, device)
return 'image ({0}) written to device {1}'.format(image_info['id'],
device)
msg = ('image ({0}) written to device {1}'.format(
image_info['id'], device))
LOG.info(msg)
return msg
def _run_shutdown_script(self, parameter):
script = _path_to_script('shell/shutdown.sh')

@ -614,6 +614,11 @@ def dispatch_to_all_managers(method, *args, **kwargs):
LOG.debug('HardwareManager {0} does not support {1}'
.format(manager, method))
continue
except Exception as e:
LOG.exception('Unexpected error dispatching %(method)s to '
'manager %(manager)s: %(e)s',
{'method': method, 'manager': manager, 'e': e})
raise
responses[manager.__class__.__name__] = response
else:
LOG.debug('HardwareManager {0} does not have method {1}'
@ -650,6 +655,11 @@ def dispatch_to_managers(method, *args, **kwargs):
except(errors.IncompatibleHardwareMethodError):
LOG.debug('HardwareManager {0} does not support {1}'
.format(manager, method))
except Exception as e:
LOG.exception('Unexpected error dispatching %(method)s to '
'manager %(manager)s: %(e)s',
{'method': method, 'manager': manager, 'e': e})
raise
else:
LOG.debug('HardwareManager {0} does not have method {1}'
.format(manager, method))