Improve IPA logging and exception handling

Debugging the agent is a huge pain point. Tracebacks are rarely logged, error messages are often only returned via the API, and lack of info logging makes it hard to determine where some failures occur. Some errors only return a 500 with no error message or logs. Change-Id: I0a127de6e4abf62e20d5c5ad583ba46738604d2d
2015-06-10 10:46:39 -07:00 · 2015-06-10 10:46:39 -07:00 · 59281ecda8
commit 59281ecda8
parent c641371ace
5 changed files with 67 additions and 15 deletions
--- a/ironic_python_agent/api/controllers/v1/base.py
+++ b/ironic_python_agent/api/controllers/v1/base.py
@ -26,12 +26,12 @@ class ExceptionType(wtypes.UserType):
        return value
    def tobasetype(self, value):
-        """Turn a RESTError into a dict."""
+        """Turn an Exception into a dict."""
        return {
            'type': value.__class__.__name__,
-            'code': value.status_code,
+            'code': getattr(value, 'status_code', 500),
-            'message': value.message,
+            'message': str(value),
-            'details': value.details,
+            'details': getattr(value, 'details', ''),
        }
    frombasetype = tobasetype
--- a/ironic_python_agent/extensions/base.py
+++ b/ironic_python_agent/extensions/base.py
@ -24,6 +24,8 @@ from ironic_python_agent import encoding
 from ironic_python_agent import errors
 from ironic_python_agent import utils
 LOG = log.getLogger()
 class AgentCommandStatus(object):
    """Mapping of agent command statuses."""
@ -150,7 +152,8 @@ class AsyncCommandResult(BaseCommandResult):
            if isinstance(result, (bytes, six.text_type)):
                result = {'result': '{}: {}'.format(self.command_name, result)}
-
+            LOG.info('Command: %(name)s, result: %(result)s',
                     {'name': self.command_name, 'result': result})
            with self.command_state_lock:
                self.command_result = result
                self.command_status = AgentCommandStatus.SUCCEEDED
@ -159,7 +162,11 @@ class AsyncCommandResult(BaseCommandResult):
                self.command_error = e
                self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH
                self.command_result = None
            LOG.info('Clean version mismatch for command %s',
                     self.command_name)
        except Exception as e:
            LOG.exception('Command failed: %(name)s, error: %(err)s',
                          {'name': self.command_name, 'err': e})
            if not isinstance(e, errors.RESTError):
                e = errors.CommandExecutionError(str(e))
@ -220,11 +227,16 @@ class ExecuteCommandMixin(object):
    def execute_command(self, command_name, **kwargs):
        """Execute an agent command."""
        with self.command_lock:
            LOG.info('Executing command: %(name)s with args: %(args)s',
                     {'name': command_name, 'args': kwargs})
            extension_part, command_part = self.split_command(command_name)
            if len(self.command_results) > 0:
                last_command = list(self.command_results.values())[-1]
                if not last_command.is_done():
                    LOG.info('Tried to execute %(command)s, agent is still '
                             'executing %(last)s', {'command': command_name,
                                                    'last': last_command})
                    raise errors.CommandExecutionError('agent is busy')
            try:
@ -232,17 +244,21 @@ class ExecuteCommandMixin(object):
                result = ext.execute(command_part, **kwargs)
            except KeyError:
                # Extension Not found
                LOG.exception('Extension %s not found', extension_part)
                raise errors.RequestedObjectNotFoundError('Extension',
                                                          extension_part)
            except errors.InvalidContentError as e:
                # Any command may raise a InvalidContentError which will be
                # returned to the caller directly.
                LOG.exception('Invalid content error: %s', e)
                raise e
            except Exception as e:
                # Other errors are considered command execution errors, and are
-                # recorded as an
+                # recorded as a failed SyncCommandResult with an error message
                LOG.exception('Command execution error: %s', e)
                result = SyncCommandResult(command_name, kwargs, False, e)
-
+            LOG.info('Command %(name)s completed: %(result)s',
                     {'name': command_name, 'result': result})
            self.command_results[result.id] = result
            return result
--- a/ironic_python_agent/extensions/clean.py
+++ b/ironic_python_agent/extensions/clean.py
@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from oslo_log import log
 from ironic_python_agent import errors
 from ironic_python_agent.extensions import base
 from ironic_python_agent import hardware
 LOG = log.getLogger()
 class CleanExtension(base.BaseAgentExtension):
    @base.sync_command('get_clean_steps')
@ -28,10 +32,13 @@ class CleanExtension(base.BaseAgentExtension):
        :returns: A list of clean steps with keys step, priority, and
            reboot_requested
        """
        LOG.debug('Getting clean steps, called with node: %(node)s, '
                  'ports: %(ports)s', {'node': node, 'ports': ports})
        # Results should be a dict, not a list
        steps = hardware.dispatch_to_all_managers('get_clean_steps',
                                                  node, ports)
        LOG.debug('Returning clean steps: %s', steps)
        return {
            'clean_steps': steps,
            'hardware_manager_version': _get_current_clean_version()
@ -52,17 +59,23 @@ class CleanExtension(base.BaseAgentExtension):
            the step returns.
        """
        # Ensure the agent is still the same version, or raise an exception
        LOG.info('Executing clean step %s', step)
        _check_clean_version(clean_version)
        if 'step' not in step:
-            raise ValueError('Malformed clean_step, no "step" key: %s'.format(
+            msg = 'Malformed clean_step, no "step" key: %s' % step
-                step))
+            LOG.error(msg)
            raise ValueError(msg)
        try:
            result = hardware.dispatch_to_managers(step['step'], node, ports)
        except Exception as e:
-            raise errors.CleaningError(
+            msg = ('Error performing clean_step %(step)s: %(err)s' %
-                'Error performing clean_step %(step)s: %(err)s' %
+                   {'step': step['step'], 'err': e})
-                {'step': step['step'], 'err': e})
+            LOG.exception(msg)
            raise errors.CleaningError(msg)
        LOG.info('Clean step completed: %(step)s, result: %(result)s',
                 {'step': step, 'result': result})
        # Return the step that was executed so we can dispatch
        # to the appropriate Ironic interface
        return {
@ -78,6 +91,9 @@ def _check_clean_version(clean_version=None):
        return
    agent_version = _get_current_clean_version()
    if clean_version != agent_version:
        LOG.warning('Mismatched clean versions. Agent version: %(agent), '
                    'node version: %(node)', {'agent': agent_version,
                                              'node': clean_version})
        raise errors.CleanVersionMismatch(agent_version=agent_version,
                                          node_version=clean_version)
--- a/ironic_python_agent/extensions/standby.py
+++ b/ironic_python_agent/extensions/standby.py
@ -207,26 +207,34 @@ class StandbyExtension(base.BaseAgentExtension):
    @base.async_command('cache_image', _validate_image_info)
    def cache_image(self, image_info=None, force=False):
        LOG.debug('Caching image %s', image_info['id'])
        device = hardware.dispatch_to_managers('get_os_install_device')
        result_msg = 'image ({0}) already present on device {1}'
        if self.cached_image_id != image_info['id'] or force:
            LOG.debug('Already had %s cached, overwriting',
                      self.cached_image_id)
            _download_image(image_info)
            _write_image(image_info, device)
            self.cached_image_id = image_info['id']
            result_msg = 'image ({0}) cached to device {1}'
-        return result_msg.format(image_info['id'], device)
+        msg = result_msg.format(image_info['id'], device)
        LOG.info(msg)
        return msg
    @base.async_command('prepare_image', _validate_image_info)
    def prepare_image(self,
                      image_info=None,
                      configdrive=None):
        LOG.debug('Preparing image %s', image_info['id'])
        device = hardware.dispatch_to_managers('get_os_install_device')
        # don't write image again if already cached
        if self.cached_image_id != image_info['id']:
            LOG.debug('Already had %s cached, overwriting',
                      self.cached_image_id)
            _download_image(image_info)
            _write_image(image_info, device)
            self.cached_image_id = image_info['id']
@ -234,8 +242,10 @@ class StandbyExtension(base.BaseAgentExtension):
        if configdrive is not None:
            _write_configdrive_to_partition(configdrive, device)
-        return 'image ({0}) written to device {1}'.format(image_info['id'],
+        msg = ('image ({0}) written to device {1}'.format(
-                                                          device)
+            image_info['id'], device))
        LOG.info(msg)
        return msg
    def _run_shutdown_script(self, parameter):
        script = _path_to_script('shell/shutdown.sh')
--- a/ironic_python_agent/hardware.py
+++ b/ironic_python_agent/hardware.py
@ -614,6 +614,11 @@ def dispatch_to_all_managers(method, *args, **kwargs):
                LOG.debug('HardwareManager {0} does not support {1}'
                          .format(manager, method))
                continue
            except Exception as e:
                LOG.exception('Unexpected error dispatching %(method)s to '
                              'manager %(manager)s: %(e)s',
                              {'method': method, 'manager': manager, 'e': e})
                raise
            responses[manager.__class__.__name__] = response
        else:
            LOG.debug('HardwareManager {0} does not have method {1}'
@ -650,6 +655,11 @@ def dispatch_to_managers(method, *args, **kwargs):
            except(errors.IncompatibleHardwareMethodError):
                LOG.debug('HardwareManager {0} does not support {1}'
                        .format(manager, method))
            except Exception as e:
                LOG.exception('Unexpected error dispatching %(method)s to '
                              'manager %(manager)s: %(e)s',
                              {'method': method, 'manager': manager, 'e': e})
                raise
        else:
            LOG.debug('HardwareManager {0} does not have method {1}'
                      .format(manager, method))