Improve IPA logging and exception handling

Debugging the agent is a huge pain point. Tracebacks are rarely logged,
error messages are often only returned via the API, and lack of
info logging makes it hard to determine where some failures occur.
Some errors only return a 500 with no error message or logs.

Change-Id: I0a127de6e4abf62e20d5c5ad583ba46738604d2d
This commit is contained in:
Josh Gachnang 2015-06-10 10:46:39 -07:00
parent c641371ace
commit 59281ecda8
5 changed files with 67 additions and 15 deletions
ironic_python_agent

@ -26,12 +26,12 @@ class ExceptionType(wtypes.UserType):
return value return value
def tobasetype(self, value): def tobasetype(self, value):
"""Turn a RESTError into a dict.""" """Turn an Exception into a dict."""
return { return {
'type': value.__class__.__name__, 'type': value.__class__.__name__,
'code': value.status_code, 'code': getattr(value, 'status_code', 500),
'message': value.message, 'message': str(value),
'details': value.details, 'details': getattr(value, 'details', ''),
} }
frombasetype = tobasetype frombasetype = tobasetype

@ -24,6 +24,8 @@ from ironic_python_agent import encoding
from ironic_python_agent import errors from ironic_python_agent import errors
from ironic_python_agent import utils from ironic_python_agent import utils
LOG = log.getLogger()
class AgentCommandStatus(object): class AgentCommandStatus(object):
"""Mapping of agent command statuses.""" """Mapping of agent command statuses."""
@ -150,7 +152,8 @@ class AsyncCommandResult(BaseCommandResult):
if isinstance(result, (bytes, six.text_type)): if isinstance(result, (bytes, six.text_type)):
result = {'result': '{}: {}'.format(self.command_name, result)} result = {'result': '{}: {}'.format(self.command_name, result)}
LOG.info('Command: %(name)s, result: %(result)s',
{'name': self.command_name, 'result': result})
with self.command_state_lock: with self.command_state_lock:
self.command_result = result self.command_result = result
self.command_status = AgentCommandStatus.SUCCEEDED self.command_status = AgentCommandStatus.SUCCEEDED
@ -159,7 +162,11 @@ class AsyncCommandResult(BaseCommandResult):
self.command_error = e self.command_error = e
self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH
self.command_result = None self.command_result = None
LOG.info('Clean version mismatch for command %s',
self.command_name)
except Exception as e: except Exception as e:
LOG.exception('Command failed: %(name)s, error: %(err)s',
{'name': self.command_name, 'err': e})
if not isinstance(e, errors.RESTError): if not isinstance(e, errors.RESTError):
e = errors.CommandExecutionError(str(e)) e = errors.CommandExecutionError(str(e))
@ -220,11 +227,16 @@ class ExecuteCommandMixin(object):
def execute_command(self, command_name, **kwargs): def execute_command(self, command_name, **kwargs):
"""Execute an agent command.""" """Execute an agent command."""
with self.command_lock: with self.command_lock:
LOG.info('Executing command: %(name)s with args: %(args)s',
{'name': command_name, 'args': kwargs})
extension_part, command_part = self.split_command(command_name) extension_part, command_part = self.split_command(command_name)
if len(self.command_results) > 0: if len(self.command_results) > 0:
last_command = list(self.command_results.values())[-1] last_command = list(self.command_results.values())[-1]
if not last_command.is_done(): if not last_command.is_done():
LOG.info('Tried to execute %(command)s, agent is still '
'executing %(last)s', {'command': command_name,
'last': last_command})
raise errors.CommandExecutionError('agent is busy') raise errors.CommandExecutionError('agent is busy')
try: try:
@ -232,17 +244,21 @@ class ExecuteCommandMixin(object):
result = ext.execute(command_part, **kwargs) result = ext.execute(command_part, **kwargs)
except KeyError: except KeyError:
# Extension Not found # Extension Not found
LOG.exception('Extension %s not found', extension_part)
raise errors.RequestedObjectNotFoundError('Extension', raise errors.RequestedObjectNotFoundError('Extension',
extension_part) extension_part)
except errors.InvalidContentError as e: except errors.InvalidContentError as e:
# Any command may raise a InvalidContentError which will be # Any command may raise a InvalidContentError which will be
# returned to the caller directly. # returned to the caller directly.
LOG.exception('Invalid content error: %s', e)
raise e raise e
except Exception as e: except Exception as e:
# Other errors are considered command execution errors, and are # Other errors are considered command execution errors, and are
# recorded as an # recorded as a failed SyncCommandResult with an error message
LOG.exception('Command execution error: %s', e)
result = SyncCommandResult(command_name, kwargs, False, e) result = SyncCommandResult(command_name, kwargs, False, e)
LOG.info('Command %(name)s completed: %(result)s',
{'name': command_name, 'result': result})
self.command_results[result.id] = result self.command_results[result.id] = result
return result return result

@ -12,10 +12,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from oslo_log import log
from ironic_python_agent import errors from ironic_python_agent import errors
from ironic_python_agent.extensions import base from ironic_python_agent.extensions import base
from ironic_python_agent import hardware from ironic_python_agent import hardware
LOG = log.getLogger()
class CleanExtension(base.BaseAgentExtension): class CleanExtension(base.BaseAgentExtension):
@base.sync_command('get_clean_steps') @base.sync_command('get_clean_steps')
@ -28,10 +32,13 @@ class CleanExtension(base.BaseAgentExtension):
:returns: A list of clean steps with keys step, priority, and :returns: A list of clean steps with keys step, priority, and
reboot_requested reboot_requested
""" """
LOG.debug('Getting clean steps, called with node: %(node)s, '
'ports: %(ports)s', {'node': node, 'ports': ports})
# Results should be a dict, not a list # Results should be a dict, not a list
steps = hardware.dispatch_to_all_managers('get_clean_steps', steps = hardware.dispatch_to_all_managers('get_clean_steps',
node, ports) node, ports)
LOG.debug('Returning clean steps: %s', steps)
return { return {
'clean_steps': steps, 'clean_steps': steps,
'hardware_manager_version': _get_current_clean_version() 'hardware_manager_version': _get_current_clean_version()
@ -52,17 +59,23 @@ class CleanExtension(base.BaseAgentExtension):
the step returns. the step returns.
""" """
# Ensure the agent is still the same version, or raise an exception # Ensure the agent is still the same version, or raise an exception
LOG.info('Executing clean step %s', step)
_check_clean_version(clean_version) _check_clean_version(clean_version)
if 'step' not in step: if 'step' not in step:
raise ValueError('Malformed clean_step, no "step" key: %s'.format( msg = 'Malformed clean_step, no "step" key: %s' % step
step)) LOG.error(msg)
raise ValueError(msg)
try: try:
result = hardware.dispatch_to_managers(step['step'], node, ports) result = hardware.dispatch_to_managers(step['step'], node, ports)
except Exception as e: except Exception as e:
raise errors.CleaningError( msg = ('Error performing clean_step %(step)s: %(err)s' %
'Error performing clean_step %(step)s: %(err)s' % {'step': step['step'], 'err': e})
{'step': step['step'], 'err': e}) LOG.exception(msg)
raise errors.CleaningError(msg)
LOG.info('Clean step completed: %(step)s, result: %(result)s',
{'step': step, 'result': result})
# Return the step that was executed so we can dispatch # Return the step that was executed so we can dispatch
# to the appropriate Ironic interface # to the appropriate Ironic interface
return { return {
@ -78,6 +91,9 @@ def _check_clean_version(clean_version=None):
return return
agent_version = _get_current_clean_version() agent_version = _get_current_clean_version()
if clean_version != agent_version: if clean_version != agent_version:
LOG.warning('Mismatched clean versions. Agent version: %(agent), '
'node version: %(node)', {'agent': agent_version,
'node': clean_version})
raise errors.CleanVersionMismatch(agent_version=agent_version, raise errors.CleanVersionMismatch(agent_version=agent_version,
node_version=clean_version) node_version=clean_version)

@ -207,26 +207,34 @@ class StandbyExtension(base.BaseAgentExtension):
@base.async_command('cache_image', _validate_image_info) @base.async_command('cache_image', _validate_image_info)
def cache_image(self, image_info=None, force=False): def cache_image(self, image_info=None, force=False):
LOG.debug('Caching image %s', image_info['id'])
device = hardware.dispatch_to_managers('get_os_install_device') device = hardware.dispatch_to_managers('get_os_install_device')
result_msg = 'image ({0}) already present on device {1}' result_msg = 'image ({0}) already present on device {1}'
if self.cached_image_id != image_info['id'] or force: if self.cached_image_id != image_info['id'] or force:
LOG.debug('Already had %s cached, overwriting',
self.cached_image_id)
_download_image(image_info) _download_image(image_info)
_write_image(image_info, device) _write_image(image_info, device)
self.cached_image_id = image_info['id'] self.cached_image_id = image_info['id']
result_msg = 'image ({0}) cached to device {1}' result_msg = 'image ({0}) cached to device {1}'
return result_msg.format(image_info['id'], device) msg = result_msg.format(image_info['id'], device)
LOG.info(msg)
return msg
@base.async_command('prepare_image', _validate_image_info) @base.async_command('prepare_image', _validate_image_info)
def prepare_image(self, def prepare_image(self,
image_info=None, image_info=None,
configdrive=None): configdrive=None):
LOG.debug('Preparing image %s', image_info['id'])
device = hardware.dispatch_to_managers('get_os_install_device') device = hardware.dispatch_to_managers('get_os_install_device')
# don't write image again if already cached # don't write image again if already cached
if self.cached_image_id != image_info['id']: if self.cached_image_id != image_info['id']:
LOG.debug('Already had %s cached, overwriting',
self.cached_image_id)
_download_image(image_info) _download_image(image_info)
_write_image(image_info, device) _write_image(image_info, device)
self.cached_image_id = image_info['id'] self.cached_image_id = image_info['id']
@ -234,8 +242,10 @@ class StandbyExtension(base.BaseAgentExtension):
if configdrive is not None: if configdrive is not None:
_write_configdrive_to_partition(configdrive, device) _write_configdrive_to_partition(configdrive, device)
return 'image ({0}) written to device {1}'.format(image_info['id'], msg = ('image ({0}) written to device {1}'.format(
device) image_info['id'], device))
LOG.info(msg)
return msg
def _run_shutdown_script(self, parameter): def _run_shutdown_script(self, parameter):
script = _path_to_script('shell/shutdown.sh') script = _path_to_script('shell/shutdown.sh')

@ -614,6 +614,11 @@ def dispatch_to_all_managers(method, *args, **kwargs):
LOG.debug('HardwareManager {0} does not support {1}' LOG.debug('HardwareManager {0} does not support {1}'
.format(manager, method)) .format(manager, method))
continue continue
except Exception as e:
LOG.exception('Unexpected error dispatching %(method)s to '
'manager %(manager)s: %(e)s',
{'method': method, 'manager': manager, 'e': e})
raise
responses[manager.__class__.__name__] = response responses[manager.__class__.__name__] = response
else: else:
LOG.debug('HardwareManager {0} does not have method {1}' LOG.debug('HardwareManager {0} does not have method {1}'
@ -650,6 +655,11 @@ def dispatch_to_managers(method, *args, **kwargs):
except(errors.IncompatibleHardwareMethodError): except(errors.IncompatibleHardwareMethodError):
LOG.debug('HardwareManager {0} does not support {1}' LOG.debug('HardwareManager {0} does not support {1}'
.format(manager, method)) .format(manager, method))
except Exception as e:
LOG.exception('Unexpected error dispatching %(method)s to '
'manager %(manager)s: %(e)s',
{'method': method, 'manager': manager, 'e': e})
raise
else: else:
LOG.debug('HardwareManager {0} does not have method {1}' LOG.debug('HardwareManager {0} does not have method {1}'
.format(manager, method)) .format(manager, method))