Handle agent still doing the prior command

The agent command exec model is based upon an incoming
heartbeat, however heartbeats are independent and
commands can take a long time. For example, software RAID
setup in CI can encounter this.

From an IPA log:

[-] Picked root device /dev/md0 for node c6ca0af2-baec-40d6-879d-cbb5c751aafb
    based on root device hints {'name': '/dev/md0'}
[-] Attempting to download image from http://199.204.45.248:3928/agent_images/
    c6ca0af2-baec-40d6-879d-cbb5c751aafb
[-] Executing command: standby.get_partition_uuids with args: {} execute_command
    /usr/local/lib/python3.6/site-packages/ironic_python_agent/extensions/base.py:255
[-] Tried to execute standby.get_partition_uuids, agent is still executing Command name:
    execute_deploy_step, params: {'step': {'interface': 'deploy', 'step': 'write_image',
    'args': {'image_info': {'id': 'cb9e199a-af1b-4a6f-b00e-f284008b8046',
    'urls': ['http://199.204.45.248:3928/agent_images/c6ca0af2-baec-40d6-879d-cbb5c751aafb'],
    'disk_format': 'raw', 'container_format': 'bare', 'stream_raw_images': True, 'os_hash_algo':
    'sha512', 'os_hash_value':<trimed>

This was with code built on master, using master images.
Inside the conductor log, it notes that it is likely an out
of date agent because only AgentAPIError is evaluated,
however any API error is evaluated this way. In reality, we need
to explicitly flag *when* we have an error that is because
we've tried to soon as something is already being worked upon.

The result, is to evaluate and return an exception indicating work
is already in flight.

Update - It looks like, the original fix to prevent busy agent
recognition did not fully detect all cases as getting steps is a
command which can
get skipped by accident with a busy agent, under certain circumstances.
Change I5d86878b5ed6142ed2630adee78c0867c49b663f in ironic-python-agent
also changed the string that was being checked for the previous
handling, where we really should have just made the string we were
checking lower case in ironic. Oh well! This should fix things
right up.

Story: 2008167
Task: 41175
Change-Id: Ia169640b7084d17d26f22e457c7af512db6d21d6
(cherry picked from commit 545dc2106b)
This commit is contained in:
Julia Kreger 2020-10-27 06:53:42 -07:00
parent 90da180a17
commit d1ffc6a557
10 changed files with 188 additions and 8 deletions

View File

@ -813,3 +813,8 @@ class UnknownAttribute(ClientSideError):
else:
self.fieldname = name
super(UnknownAttribute, self).__init__(self.msg)
class AgentInProgress(IronicException):
_msg_fmt = _('Node %(node)s command "%(command)s" failed. Agent is '
'presently executing a command. Error %(error)s')

View File

@ -177,6 +177,17 @@ def do_next_clean_step(task, step_index):
else None)
task.process_event('wait', target_state=target_state)
return
if isinstance(e, exception.AgentInProgress):
LOG.info('Conductor attempted to process clean step for '
'node %(node)s. Agent indicated it is presently '
'executing a command. Error: %(error)s',
{'node': task.node.uuid,
'error': e})
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
target_state = states.MANAGEABLE if manual_clean else None
task.process_event('wait', target_state=target_state)
return
msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') %

View File

@ -257,6 +257,16 @@ def do_next_deploy_step(task, step_index):
node.driver_internal_info = driver_internal_info
task.process_event('wait')
return
if isinstance(e, exception.AgentInProgress):
LOG.info('Conductor attempted to process deploy step for '
'node %(node)s. Agent indicated it is presently '
'executing a command. Error: %(error)s',
{'node': task.node.uuid,
'error': e})
driver_internal_info['skip_current_deploy_step'] = False
node.driver_internal_info = driver_internal_info
task.process_event('wait')
return
log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
'%(err)s' %
{'node': node.uuid, 'step': node.deploy_step, 'err': e})

View File

@ -902,11 +902,11 @@ class AgentDeployMixin(HeartbeatMixin, AgentOobStepsMixin):
call = getattr(self._client, 'get_%s_steps' % step_type)
try:
agent_result = call(node, task.ports).get('command_result', {})
except exception.AgentAPIError as exc:
if 'agent is busy' in str(exc):
LOG.debug('Agent is busy with a command, will refresh steps '
'on the next heartbeat')
return
except exception.AgentInProgress as exc:
LOG.debug('Agent for node %(node)s is busy with a command, '
'will refresh steps on the next heartbeat.',
{'node': task.node.uuid})
return
# TODO(dtantsur): change to just 'raise'
if step_type == 'clean':

View File

@ -153,6 +153,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command result from agent, see
get_commands_status for a sample.
"""
@ -223,6 +225,17 @@ class AgentClient(object):
'Expected 2xx HTTP status code, got %(code)d.',
{'method': method, 'node': node.uuid,
'code': response.status_code})
if (response.status_code == http_client.CONFLICT
or 'agent is busy' in faultstring.lower()):
# HTTP 409 check as an explicit check of if the agent
# is already busy.
# NOTE(TheJulia): The agent sends upper case A as of
# late victoria, but lower case the entire message
# for compatability with pre-late victoria agents
# which returns HTTP 409.
raise exception.AgentInProgress(node=node.uuid,
command=method,
error=faultstring)
raise exception.AgentAPIError(node=node.uuid,
status=response.status_code,
error=faultstring)
@ -337,6 +350,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command status from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -370,6 +385,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -400,6 +417,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -456,6 +475,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
The value of key command_result is in the form of:
@ -487,6 +508,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
The value of key command_result is in the form of:
@ -550,6 +573,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
The value of key command_result is in the form of:
@ -581,6 +606,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
"""
@ -597,6 +624,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -612,6 +641,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -627,6 +658,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -643,6 +676,8 @@ class AgentClient(object):
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:returns: A dict containing command response from agent.
See :func:`get_commands_status` for a command result sample.
"""
@ -660,6 +695,8 @@ class AgentClient(object):
to issue the request, or there was a malformed response from
the agent.
:raises: AgentAPIError when agent failed to execute specified command.
:raises: AgentInProgress when the command fails to execute as the agent
is presently executing the prior command.
:raises: InstanceRescueFailure when the agent ramdisk is too old
to support transmission of the rescue password.
:returns: A dict containing command response from agent.

View File

@ -781,6 +781,37 @@ class DoNodeCleanTestCase(db_base.DbTestCase):
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
def test_do_next_clean_step_agent_busy(self, mock_execute):
# When a clean step fails, go to CLEANWAIT
tgt_prov_state = states.MANAGEABLE
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'clean_steps': self.clean_steps,
'clean_step_index': None,
'cleaning_reboot': True},
clean_step={})
mock_execute.side_effect = exception.AgentInProgress(
reason='still meowing')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
cleaning.do_next_clean_step(task, 0)
node.refresh()
# Make sure we go to CLEANWAIT
self.assertEqual(states.CLEANWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual(self.clean_steps[0], node.clean_step)
self.assertEqual(0, node.driver_internal_info['clean_step_index'])
self.assertFalse(node.driver_internal_info['skip_current_clean_step'])
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute):

View File

@ -767,7 +767,42 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
self._stop_service()
node.refresh()
# Make sure we go to CLEANWAIT
# Make sure we go to DEPLOYWAIT
self.assertEqual(states.DEPLOYWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual(self.deploy_steps[0], node.deploy_step)
self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
self.assertFalse(node.driver_internal_info['skip_current_deploy_step'])
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.deploy_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
autospec=True)
def test_do_next_deploy_step_agent_busy(self, mock_execute):
# When a deploy step fails, go to DEPLOYWAIT
tgt_prov_state = states.ACTIVE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.DEPLOYING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'deploy_steps': self.deploy_steps,
'deploy_step_index': None,
'deployment_reboot': True},
clean_step={})
mock_execute.side_effect = exception.AgentInProgress(
reason='meow')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
deployments.do_next_deploy_step(task, 0)
self._stop_service()
node.refresh()
# Make sure we go to DEPLOYWAIT
self.assertEqual(states.DEPLOYWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual(self.deploy_steps[0], node.deploy_step)

View File

@ -2181,8 +2181,8 @@ class TestRefreshCleanSteps(AgentDeployMixinBaseTest):
@mock.patch.object(agent_client.AgentClient, 'get_deploy_steps',
autospec=True)
def test_refresh_steps_busy(self, client_mock, log_mock):
client_mock.side_effect = exception.AgentAPIError(
node="node", status="500", error='agent is busy')
client_mock.side_effect = exception.AgentInProgress(
node="node", error='Agent is busy : maximum meowing')
with task_manager.acquire(
self.context, self.node.uuid, shared=False) as task:

View File

@ -237,6 +237,51 @@ class TestAgentClient(base.TestCase):
timeout=60,
verify=True)
def test__command_error_code_agent_busy(self):
# Victoria and previous busy status check.
response_text = {"faultstring": "Agent is busy - meowing"}
self.client.session.post.return_value = MockResponse(
response_text, status_code=http_client.BAD_REQUEST)
method = 'standby.run_image'
image_info = {'image_id': 'test_image'}
params = {'image_info': image_info}
url = self.client._get_command_url(self.node)
body = self.client._get_command_body(method, params)
self.assertRaises(exception.AgentInProgress,
self.client._command,
self.node, method, params)
self.client.session.post.assert_called_once_with(
url,
data=body,
params={'wait': 'false'},
timeout=60,
verify=True)
def test__command_error_code_agent_busy_conflict(self):
# Post Wallaby logic as the IPA return code changed to
# better delineate the state.
response_text = {"faultstring": "lolcat says busy meowing"}
self.client.session.post.return_value = MockResponse(
response_text, status_code=http_client.CONFLICT)
method = 'standby.run_image'
image_info = {'image_id': 'test_image'}
params = {'image_info': image_info}
url = self.client._get_command_url(self.node)
body = self.client._get_command_body(method, params)
self.assertRaises(exception.AgentInProgress,
self.client._command,
self.node, method, params)
self.client.session.post.assert_called_once_with(
url,
data=body,
params={'wait': 'false'},
timeout=60,
verify=True)
@mock.patch('os.path.exists', autospec=True, return_value=True)
def test__command_verify(self, mock_exists):
response_data = {'status': 'ok'}

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Fixes recognition of a busy agent to also handle recognition
during deployment steps by more uniformly detecting and
identifying when the ``ironic-python-agent`` service is busy.