Improve IPA logging and exception handling
Debugging the agent is a huge pain point. Tracebacks are rarely logged, error messages are often only returned via the API, and lack of info logging makes it hard to determine where some failures occur. Some errors only return a 500 with no error message or logs. Change-Id: I0a127de6e4abf62e20d5c5ad583ba46738604d2d
This commit is contained in:
parent
c641371ace
commit
59281ecda8
ironic_python_agent
@ -26,12 +26,12 @@ class ExceptionType(wtypes.UserType):
|
||||
return value
|
||||
|
||||
def tobasetype(self, value):
|
||||
"""Turn a RESTError into a dict."""
|
||||
"""Turn an Exception into a dict."""
|
||||
return {
|
||||
'type': value.__class__.__name__,
|
||||
'code': value.status_code,
|
||||
'message': value.message,
|
||||
'details': value.details,
|
||||
'code': getattr(value, 'status_code', 500),
|
||||
'message': str(value),
|
||||
'details': getattr(value, 'details', ''),
|
||||
}
|
||||
|
||||
frombasetype = tobasetype
|
||||
|
@ -24,6 +24,8 @@ from ironic_python_agent import encoding
|
||||
from ironic_python_agent import errors
|
||||
from ironic_python_agent import utils
|
||||
|
||||
LOG = log.getLogger()
|
||||
|
||||
|
||||
class AgentCommandStatus(object):
|
||||
"""Mapping of agent command statuses."""
|
||||
@ -150,7 +152,8 @@ class AsyncCommandResult(BaseCommandResult):
|
||||
|
||||
if isinstance(result, (bytes, six.text_type)):
|
||||
result = {'result': '{}: {}'.format(self.command_name, result)}
|
||||
|
||||
LOG.info('Command: %(name)s, result: %(result)s',
|
||||
{'name': self.command_name, 'result': result})
|
||||
with self.command_state_lock:
|
||||
self.command_result = result
|
||||
self.command_status = AgentCommandStatus.SUCCEEDED
|
||||
@ -159,7 +162,11 @@ class AsyncCommandResult(BaseCommandResult):
|
||||
self.command_error = e
|
||||
self.command_status = AgentCommandStatus.CLEAN_VERSION_MISMATCH
|
||||
self.command_result = None
|
||||
LOG.info('Clean version mismatch for command %s',
|
||||
self.command_name)
|
||||
except Exception as e:
|
||||
LOG.exception('Command failed: %(name)s, error: %(err)s',
|
||||
{'name': self.command_name, 'err': e})
|
||||
if not isinstance(e, errors.RESTError):
|
||||
e = errors.CommandExecutionError(str(e))
|
||||
|
||||
@ -220,11 +227,16 @@ class ExecuteCommandMixin(object):
|
||||
def execute_command(self, command_name, **kwargs):
|
||||
"""Execute an agent command."""
|
||||
with self.command_lock:
|
||||
LOG.info('Executing command: %(name)s with args: %(args)s',
|
||||
{'name': command_name, 'args': kwargs})
|
||||
extension_part, command_part = self.split_command(command_name)
|
||||
|
||||
if len(self.command_results) > 0:
|
||||
last_command = list(self.command_results.values())[-1]
|
||||
if not last_command.is_done():
|
||||
LOG.info('Tried to execute %(command)s, agent is still '
|
||||
'executing %(last)s', {'command': command_name,
|
||||
'last': last_command})
|
||||
raise errors.CommandExecutionError('agent is busy')
|
||||
|
||||
try:
|
||||
@ -232,17 +244,21 @@ class ExecuteCommandMixin(object):
|
||||
result = ext.execute(command_part, **kwargs)
|
||||
except KeyError:
|
||||
# Extension Not found
|
||||
LOG.exception('Extension %s not found', extension_part)
|
||||
raise errors.RequestedObjectNotFoundError('Extension',
|
||||
extension_part)
|
||||
except errors.InvalidContentError as e:
|
||||
# Any command may raise a InvalidContentError which will be
|
||||
# returned to the caller directly.
|
||||
LOG.exception('Invalid content error: %s', e)
|
||||
raise e
|
||||
except Exception as e:
|
||||
# Other errors are considered command execution errors, and are
|
||||
# recorded as an
|
||||
# recorded as a failed SyncCommandResult with an error message
|
||||
LOG.exception('Command execution error: %s', e)
|
||||
result = SyncCommandResult(command_name, kwargs, False, e)
|
||||
|
||||
LOG.info('Command %(name)s completed: %(result)s',
|
||||
{'name': command_name, 'result': result})
|
||||
self.command_results[result.id] = result
|
||||
return result
|
||||
|
||||
|
@ -12,10 +12,14 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from oslo_log import log
|
||||
|
||||
from ironic_python_agent import errors
|
||||
from ironic_python_agent.extensions import base
|
||||
from ironic_python_agent import hardware
|
||||
|
||||
LOG = log.getLogger()
|
||||
|
||||
|
||||
class CleanExtension(base.BaseAgentExtension):
|
||||
@base.sync_command('get_clean_steps')
|
||||
@ -28,10 +32,13 @@ class CleanExtension(base.BaseAgentExtension):
|
||||
:returns: A list of clean steps with keys step, priority, and
|
||||
reboot_requested
|
||||
"""
|
||||
LOG.debug('Getting clean steps, called with node: %(node)s, '
|
||||
'ports: %(ports)s', {'node': node, 'ports': ports})
|
||||
# Results should be a dict, not a list
|
||||
steps = hardware.dispatch_to_all_managers('get_clean_steps',
|
||||
node, ports)
|
||||
|
||||
LOG.debug('Returning clean steps: %s', steps)
|
||||
return {
|
||||
'clean_steps': steps,
|
||||
'hardware_manager_version': _get_current_clean_version()
|
||||
@ -52,17 +59,23 @@ class CleanExtension(base.BaseAgentExtension):
|
||||
the step returns.
|
||||
"""
|
||||
# Ensure the agent is still the same version, or raise an exception
|
||||
LOG.info('Executing clean step %s', step)
|
||||
_check_clean_version(clean_version)
|
||||
|
||||
if 'step' not in step:
|
||||
raise ValueError('Malformed clean_step, no "step" key: %s'.format(
|
||||
step))
|
||||
msg = 'Malformed clean_step, no "step" key: %s' % step
|
||||
LOG.error(msg)
|
||||
raise ValueError(msg)
|
||||
try:
|
||||
result = hardware.dispatch_to_managers(step['step'], node, ports)
|
||||
except Exception as e:
|
||||
raise errors.CleaningError(
|
||||
'Error performing clean_step %(step)s: %(err)s' %
|
||||
{'step': step['step'], 'err': e})
|
||||
msg = ('Error performing clean_step %(step)s: %(err)s' %
|
||||
{'step': step['step'], 'err': e})
|
||||
LOG.exception(msg)
|
||||
raise errors.CleaningError(msg)
|
||||
|
||||
LOG.info('Clean step completed: %(step)s, result: %(result)s',
|
||||
{'step': step, 'result': result})
|
||||
# Return the step that was executed so we can dispatch
|
||||
# to the appropriate Ironic interface
|
||||
return {
|
||||
@ -78,6 +91,9 @@ def _check_clean_version(clean_version=None):
|
||||
return
|
||||
agent_version = _get_current_clean_version()
|
||||
if clean_version != agent_version:
|
||||
LOG.warning('Mismatched clean versions. Agent version: %(agent), '
|
||||
'node version: %(node)', {'agent': agent_version,
|
||||
'node': clean_version})
|
||||
raise errors.CleanVersionMismatch(agent_version=agent_version,
|
||||
node_version=clean_version)
|
||||
|
||||
|
@ -207,26 +207,34 @@ class StandbyExtension(base.BaseAgentExtension):
|
||||
|
||||
@base.async_command('cache_image', _validate_image_info)
|
||||
def cache_image(self, image_info=None, force=False):
|
||||
LOG.debug('Caching image %s', image_info['id'])
|
||||
device = hardware.dispatch_to_managers('get_os_install_device')
|
||||
|
||||
result_msg = 'image ({0}) already present on device {1}'
|
||||
|
||||
if self.cached_image_id != image_info['id'] or force:
|
||||
LOG.debug('Already had %s cached, overwriting',
|
||||
self.cached_image_id)
|
||||
_download_image(image_info)
|
||||
_write_image(image_info, device)
|
||||
self.cached_image_id = image_info['id']
|
||||
result_msg = 'image ({0}) cached to device {1}'
|
||||
|
||||
return result_msg.format(image_info['id'], device)
|
||||
msg = result_msg.format(image_info['id'], device)
|
||||
LOG.info(msg)
|
||||
return msg
|
||||
|
||||
@base.async_command('prepare_image', _validate_image_info)
|
||||
def prepare_image(self,
|
||||
image_info=None,
|
||||
configdrive=None):
|
||||
LOG.debug('Preparing image %s', image_info['id'])
|
||||
device = hardware.dispatch_to_managers('get_os_install_device')
|
||||
|
||||
# don't write image again if already cached
|
||||
if self.cached_image_id != image_info['id']:
|
||||
LOG.debug('Already had %s cached, overwriting',
|
||||
self.cached_image_id)
|
||||
_download_image(image_info)
|
||||
_write_image(image_info, device)
|
||||
self.cached_image_id = image_info['id']
|
||||
@ -234,8 +242,10 @@ class StandbyExtension(base.BaseAgentExtension):
|
||||
if configdrive is not None:
|
||||
_write_configdrive_to_partition(configdrive, device)
|
||||
|
||||
return 'image ({0}) written to device {1}'.format(image_info['id'],
|
||||
device)
|
||||
msg = ('image ({0}) written to device {1}'.format(
|
||||
image_info['id'], device))
|
||||
LOG.info(msg)
|
||||
return msg
|
||||
|
||||
def _run_shutdown_script(self, parameter):
|
||||
script = _path_to_script('shell/shutdown.sh')
|
||||
|
@ -614,6 +614,11 @@ def dispatch_to_all_managers(method, *args, **kwargs):
|
||||
LOG.debug('HardwareManager {0} does not support {1}'
|
||||
.format(manager, method))
|
||||
continue
|
||||
except Exception as e:
|
||||
LOG.exception('Unexpected error dispatching %(method)s to '
|
||||
'manager %(manager)s: %(e)s',
|
||||
{'method': method, 'manager': manager, 'e': e})
|
||||
raise
|
||||
responses[manager.__class__.__name__] = response
|
||||
else:
|
||||
LOG.debug('HardwareManager {0} does not have method {1}'
|
||||
@ -650,6 +655,11 @@ def dispatch_to_managers(method, *args, **kwargs):
|
||||
except(errors.IncompatibleHardwareMethodError):
|
||||
LOG.debug('HardwareManager {0} does not support {1}'
|
||||
.format(manager, method))
|
||||
except Exception as e:
|
||||
LOG.exception('Unexpected error dispatching %(method)s to '
|
||||
'manager %(manager)s: %(e)s',
|
||||
{'method': method, 'manager': manager, 'e': e})
|
||||
raise
|
||||
else:
|
||||
LOG.debug('HardwareManager {0} does not have method {1}'
|
||||
.format(manager, method))
|
||||
|
Loading…
x
Reference in New Issue
Block a user