Extend install_bootloader command timeout

Adds a new command_timeout_factor argument to the
agent client _command method with a default value of 1,
and sets the install_bootloader command to send a value of 2
to extend the timeout factor to give the agent time to complete
its work before giving up on installing the bootloader and
automatically retrying.

Change-Id: Iab72903c3d30b0ae5b80a5115fc635179869eeee
Story: 2007483
Task: 39201
This commit is contained in:
Julia Kreger 2020-03-30 09:47:23 -07:00
parent 281634dc4e
commit 4829df2966
4 changed files with 38 additions and 6 deletions

View File

@ -104,7 +104,11 @@ opts = [
'service.')), 'service.')),
cfg.IntOpt('command_timeout', cfg.IntOpt('command_timeout',
default=60, default=60,
help=_('Timeout (in seconds) for IPA commands.')), help=_('Timeout (in seconds) for IPA commands. '
'Please note, the bootloader installation command '
'to the agent is permitted a timeout of twice the '
'value set here as these are IO heavy operations '
'depending on the configuration of the instance.')),
cfg.IntOpt('max_command_attempts', cfg.IntOpt('max_command_attempts',
default=3, default=3,
help=_('This is the maximum number of attempts that will be ' help=_('This is the maximum number of attempts that will be '

View File

@ -61,7 +61,8 @@ class AgentClient(object):
retry_on_exception=( retry_on_exception=(
lambda e: isinstance(e, exception.AgentConnectionFailed)), lambda e: isinstance(e, exception.AgentConnectionFailed)),
stop_max_attempt_number=CONF.agent.max_command_attempts) stop_max_attempt_number=CONF.agent.max_command_attempts)
def _command(self, node, method, params, wait=False): def _command(self, node, method, params, wait=False,
command_timeout_factor=1):
"""Sends command to agent. """Sends command to agent.
:param node: A Node object. :param node: A Node object.
@ -71,6 +72,13 @@ class AgentClient(object):
body. body.
:param wait: True to wait for the command to finish executing, False :param wait: True to wait for the command to finish executing, False
otherwise. otherwise.
:param command_timeout_factor: An integer, default 1, by which to
multiply the [agent]command_timeout
value. This is intended for use with
extremely long running commands to
the agent ramdisk where a general
timeout value should not be extended
in all cases.
:raises: IronicException when failed to issue the request or there was :raises: IronicException when failed to issue the request or there was
a malformed response from the agent. a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command. :raises: AgentAPIError when agent failed to execute specified command.
@ -89,8 +97,9 @@ class AgentClient(object):
{'node': node.uuid, 'method': method}) {'node': node.uuid, 'method': method})
try: try:
response = self.session.post(url, params=request_params, data=body, response = self.session.post(
timeout=CONF.agent.command_timeout) url, params=request_params, data=body,
timeout=CONF.agent.command_timeout * command_timeout_factor)
except (requests.ConnectionError, requests.Timeout) as e: except (requests.ConnectionError, requests.Timeout) as e:
msg = (_('Failed to connect to the agent running on node %(node)s ' msg = (_('Failed to connect to the agent running on node %(node)s '
'for invoking command %(method)s. Error: %(error)s') % 'for invoking command %(method)s. Error: %(error)s') %
@ -271,10 +280,20 @@ class AgentClient(object):
params = {'root_uuid': root_uuid, params = {'root_uuid': root_uuid,
'efi_system_part_uuid': efi_system_part_uuid, 'efi_system_part_uuid': efi_system_part_uuid,
'prep_boot_part_uuid': prep_boot_part_uuid} 'prep_boot_part_uuid': prep_boot_part_uuid}
# NOTE(TheJulia): This command explicitly sends a larger timeout
# factor to the _command call such that the agent ramdisk has enough
# time to perform its work.
# TODO(TheJulia): We should likely split install_bootloader into many
# commands at some point, even though that would not be backwards
# compatible. We could at least begin to delineate the commands apart
# over the next cycle or two so we don't need a command timeout
# extension factor.
return self._command(node=node, return self._command(node=node,
method='image.install_bootloader', method='image.install_bootloader',
params=params, params=params,
wait=True) wait=True,
command_timeout_factor=2)
@METRICS.timer('AgentClient.get_clean_steps') @METRICS.timer('AgentClient.get_clean_steps')
def get_clean_steps(self, node, ports): def get_clean_steps(self, node, ports):

View File

@ -282,7 +282,8 @@ class TestAgentClient(base.TestCase):
self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid, self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
prep_boot_part_uuid=prep_boot_part_uuid) prep_boot_part_uuid=prep_boot_part_uuid)
self.client._command.assert_called_once_with( self.client._command.assert_called_once_with(
node=self.node, method='image.install_bootloader', params=params, command_timeout_factor=2, node=self.node,
method='image.install_bootloader', params=params,
wait=True) wait=True)
def test_install_bootloader(self): def test_install_bootloader(self):

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an agent command issue in the bootloader installation process that
can present itself as a connection timeout under heavy IO load conditions.
Now installation commands have an internal timeout which is double the
conductor wide ``[agent]command_timeout``. For more information, see
bug `2007483 <https://storyboard.openstack.org/#!/story/2007483>`_.