Improve IPA logging and exception handling

Debugging the agent is a huge pain point. Tracebacks are rarely logged, error messages are often only returned via the API, and lack of info logging makes it hard to determine where some failures occur. Some errors only return a 500 with no error message or logs. Change-Id: I0a127de6e4abf62e20d5c5ad583ba46738604d2d
2015-06-10 10:46:39 -07:00 · 2015-06-10 10:46:39 -07:00 · 59281ecda8
commit 59281ecda8
parent c641371ace
5 changed files with 67 additions and 15 deletions
--- a/ironic_python_agent/api/controllers/v1/base.py
+++ b/ironic_python_agent/api/controllers/v1/base.py
@ -26,12 +26,12 @@ class ExceptionType(wtypes.UserType):
        return value

    def tobasetype(self, value):
-        """Turn a RESTError into a dict."""
+        """Turn an Exception into a dict."""
        return {
            'type': value.__class__.__name__,
-            'code': value.status_code,
-            'message': value.message,
-            'details': value.details,
+            'code': getattr(value, 'status_code', 500),
+            'message': str(value),
+            'details': getattr(value, 'details', ''),
        }

    frombasetype = tobasetype
--- a/ironic_python_agent/extensions/base.py
+++ b/ironic_python_agent/extensions/base.py
@ -24,6 +24,8 @@ from ironic_python_agent import encoding
 from ironic_python_agent import errors
 from ironic_python_agent import utils

+LOG = log.getLogger()
+

 class AgentCommandStatus(object):
    """Mapping of agent command statuses."""
@ -150,7 +152,8 @@ class AsyncCommandResult(BaseCommandResult):

            if isinstance(result, (bytes, six.text_type)):
                result = {'result': '{}: {}'.format(self.command_name, result)}
-
+            LOG.info('Command: %(name)s, result: %(result)s',
+                     {'name': self.command_name, 'result': result})
            with self.command_state_lock:
                self.command_result = result
                self.command_status = AgentCommandStatus.SUCCEEDED
@ -159,7 +162,11 @@ class AsyncCommandResult(BaseCommandResult):
                self.command_error = e
                self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH
                self.command_result = None
+            LOG.info('Clean version mismatch for command %s',
+                     self.command_name)
        except Exception as e:
+            LOG.exception('Command failed: %(name)s, error: %(err)s',
+                          {'name': self.command_name, 'err': e})
            if not isinstance(e, errors.RESTError):
                e = errors.CommandExecutionError(str(e))

@ -220,11 +227,16 @@ class ExecuteCommandMixin(object):
    def execute_command(self, command_name, **kwargs):
        """Execute an agent command."""
        with self.command_lock:
+            LOG.info('Executing command: %(name)s with args: %(args)s',
+                     {'name': command_name, 'args': kwargs})
            extension_part, command_part = self.split_command(command_name)

            if len(self.command_results) > 0:
                last_command = list(self.command_results.values())[-1]
                if not last_command.is_done():
+                    LOG.info('Tried to execute %(command)s, agent is still '
+                             'executing %(last)s', {'command': command_name,
+                                                    'last': last_command})
                    raise errors.CommandExecutionError('agent is busy')

            try:
@ -232,17 +244,21 @@ class ExecuteCommandMixin(object):
                result = ext.execute(command_part, **kwargs)
            except KeyError:
                # Extension Not found
+                LOG.exception('Extension %s not found', extension_part)
                raise errors.RequestedObjectNotFoundError('Extension',
                                                          extension_part)
            except errors.InvalidContentError as e:
                # Any command may raise a InvalidContentError which will be
                # returned to the caller directly.
+                LOG.exception('Invalid content error: %s', e)
                raise e
            except Exception as e:
                # Other errors are considered command execution errors, and are
-                # recorded as an
+                # recorded as a failed SyncCommandResult with an error message
+                LOG.exception('Command execution error: %s', e)
                result = SyncCommandResult(command_name, kwargs, False, e)
-
+            LOG.info('Command %(name)s completed: %(result)s',
+                     {'name': command_name, 'result': result})
            self.command_results[result.id] = result
            return result

--- a/ironic_python_agent/extensions/clean.py
+++ b/ironic_python_agent/extensions/clean.py
@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from oslo_log import log
+
 from ironic_python_agent import errors
 from ironic_python_agent.extensions import base
 from ironic_python_agent import hardware

+LOG = log.getLogger()
+

 class CleanExtension(base.BaseAgentExtension):
    @base.sync_command('get_clean_steps')
@ -28,10 +32,13 @@ class CleanExtension(base.BaseAgentExtension):
        :returns: A list of clean steps with keys step, priority, and
            reboot_requested
        """
+        LOG.debug('Getting clean steps, called with node: %(node)s, '
+                  'ports: %(ports)s', {'node': node, 'ports': ports})
        # Results should be a dict, not a list
        steps = hardware.dispatch_to_all_managers('get_clean_steps',
                                                  node, ports)

+        LOG.debug('Returning clean steps: %s', steps)
        return {
            'clean_steps': steps,
            'hardware_manager_version': _get_current_clean_version()
@ -52,17 +59,23 @@ class CleanExtension(base.BaseAgentExtension):
            the step returns.
        """
        # Ensure the agent is still the same version, or raise an exception
+        LOG.info('Executing clean step %s', step)
        _check_clean_version(clean_version)

        if 'step' not in step:
-            raise ValueError('Malformed clean_step, no "step" key: %s'.format(
-                step))
+            msg = 'Malformed clean_step, no "step" key: %s' % step
+            LOG.error(msg)
+            raise ValueError(msg)
        try:
            result = hardware.dispatch_to_managers(step['step'], node, ports)
        except Exception as e:
-            raise errors.CleaningError(
-                'Error performing clean_step %(step)s: %(err)s' %
-                {'step': step['step'], 'err': e})
+            msg = ('Error performing clean_step %(step)s: %(err)s' %
+                   {'step': step['step'], 'err': e})
+            LOG.exception(msg)
+            raise errors.CleaningError(msg)
+
+        LOG.info('Clean step completed: %(step)s, result: %(result)s',
+                 {'step': step, 'result': result})
        # Return the step that was executed so we can dispatch
        # to the appropriate Ironic interface
        return {
@ -78,6 +91,9 @@ def _check_clean_version(clean_version=None):
        return
    agent_version = _get_current_clean_version()
    if clean_version != agent_version:
+        LOG.warning('Mismatched clean versions. Agent version: %(agent), '
+                    'node version: %(node)', {'agent': agent_version,
+                                              'node': clean_version})
        raise errors.CleanVersionMismatch(agent_version=agent_version,
                                          node_version=clean_version)

--- a/ironic_python_agent/extensions/standby.py
+++ b/ironic_python_agent/extensions/standby.py
@ -207,26 +207,34 @@ class StandbyExtension(base.BaseAgentExtension):

    @base.async_command('cache_image', _validate_image_info)
    def cache_image(self, image_info=None, force=False):
+        LOG.debug('Caching image %s', image_info['id'])
        device = hardware.dispatch_to_managers('get_os_install_device')

        result_msg = 'image ({0}) already present on device {1}'

        if self.cached_image_id != image_info['id'] or force:
+            LOG.debug('Already had %s cached, overwriting',
+                      self.cached_image_id)
            _download_image(image_info)
            _write_image(image_info, device)
            self.cached_image_id = image_info['id']
            result_msg = 'image ({0}) cached to device {1}'

-        return result_msg.format(image_info['id'], device)
+        msg = result_msg.format(image_info['id'], device)
+        LOG.info(msg)
+        return msg

    @base.async_command('prepare_image', _validate_image_info)
    def prepare_image(self,
                      image_info=None,
                      configdrive=None):
+        LOG.debug('Preparing image %s', image_info['id'])
        device = hardware.dispatch_to_managers('get_os_install_device')

        # don't write image again if already cached
        if self.cached_image_id != image_info['id']:
+            LOG.debug('Already had %s cached, overwriting',
+                      self.cached_image_id)
            _download_image(image_info)
            _write_image(image_info, device)
            self.cached_image_id = image_info['id']
@ -234,8 +242,10 @@ class StandbyExtension(base.BaseAgentExtension):
        if configdrive is not None:
            _write_configdrive_to_partition(configdrive, device)

-        return 'image ({0}) written to device {1}'.format(image_info['id'],
-                                                          device)
+        msg = ('image ({0}) written to device {1}'.format(
+            image_info['id'], device))
+        LOG.info(msg)
+        return msg

    def _run_shutdown_script(self, parameter):
        script = _path_to_script('shell/shutdown.sh')
--- a/ironic_python_agent/hardware.py
+++ b/ironic_python_agent/hardware.py
@ -614,6 +614,11 @@ def dispatch_to_all_managers(method, *args, **kwargs):
                LOG.debug('HardwareManager {0} does not support {1}'
                          .format(manager, method))
                continue
+            except Exception as e:
+                LOG.exception('Unexpected error dispatching %(method)s to '
+                              'manager %(manager)s: %(e)s',
+                              {'method': method, 'manager': manager, 'e': e})
+                raise
            responses[manager.__class__.__name__] = response
        else:
            LOG.debug('HardwareManager {0} does not have method {1}'
@ -650,6 +655,11 @@ def dispatch_to_managers(method, *args, **kwargs):
            except(errors.IncompatibleHardwareMethodError):
                LOG.debug('HardwareManager {0} does not support {1}'
                        .format(manager, method))
+            except Exception as e:
+                LOG.exception('Unexpected error dispatching %(method)s to '
+                              'manager %(manager)s: %(e)s',
+                              {'method': method, 'manager': manager, 'e': e})
+                raise
        else:
            LOG.debug('HardwareManager {0} does not have method {1}'
                      .format(manager, method))