Browse Source

Extend install_bootloader command timeout

Adds a new command_timeout_factor argument to the
agent client _command method with a default value of 1,
and sets the install_bootloader command to send a value of 2
to extend the timeout factor to give the agent time to complete
its work before giving up on installing the bootloader and
automatically retrying.

Change-Id: Iab72903c3d30b0ae5b80a5115fc635179869eeee
Story: 2007483
Task: 39201
tags/15.0.0
Julia Kreger 3 months ago
parent
commit
4829df2966
4 changed files with 38 additions and 6 deletions
  1. +5
    -1
      ironic/conf/agent.py
  2. +23
    -4
      ironic/drivers/modules/agent_client.py
  3. +2
    -1
      ironic/tests/unit/drivers/modules/test_agent_client.py
  4. +8
    -0
      releasenotes/notes/extends-install-bootloader-timeout-8fce9590bf405cdf.yaml

+ 5
- 1
ironic/conf/agent.py View File

@@ -104,7 +104,11 @@ opts = [
'service.')),
cfg.IntOpt('command_timeout',
default=60,
help=_('Timeout (in seconds) for IPA commands.')),
help=_('Timeout (in seconds) for IPA commands. '
'Please note, the bootloader installation command '
'to the agent is permitted a timeout of twice the '
'value set here as these are IO heavy operations '
'depending on the configuration of the instance.')),
cfg.IntOpt('max_command_attempts',
default=3,
help=_('This is the maximum number of attempts that will be '


+ 23
- 4
ironic/drivers/modules/agent_client.py View File

@@ -61,7 +61,8 @@ class AgentClient(object):
retry_on_exception=(
lambda e: isinstance(e, exception.AgentConnectionFailed)),
stop_max_attempt_number=CONF.agent.max_command_attempts)
def _command(self, node, method, params, wait=False):
def _command(self, node, method, params, wait=False,
command_timeout_factor=1):
"""Sends command to agent.

:param node: A Node object.
@@ -71,6 +72,13 @@ class AgentClient(object):
body.
:param wait: True to wait for the command to finish executing, False
otherwise.
:param command_timeout_factor: An integer, default 1, by which to
multiply the [agent]command_timeout
value. This is intended for use with
extremely long running commands to
the agent ramdisk where a general
timeout value should not be extended
in all cases.
:raises: IronicException when failed to issue the request or there was
a malformed response from the agent.
:raises: AgentAPIError when agent failed to execute specified command.
@@ -89,8 +97,9 @@ class AgentClient(object):
{'node': node.uuid, 'method': method})

try:
response = self.session.post(url, params=request_params, data=body,
timeout=CONF.agent.command_timeout)
response = self.session.post(
url, params=request_params, data=body,
timeout=CONF.agent.command_timeout * command_timeout_factor)
except (requests.ConnectionError, requests.Timeout) as e:
msg = (_('Failed to connect to the agent running on node %(node)s '
'for invoking command %(method)s. Error: %(error)s') %
@@ -271,10 +280,20 @@ class AgentClient(object):
params = {'root_uuid': root_uuid,
'efi_system_part_uuid': efi_system_part_uuid,
'prep_boot_part_uuid': prep_boot_part_uuid}

# NOTE(TheJulia): This command explicitly sends a larger timeout
# factor to the _command call such that the agent ramdisk has enough
# time to perform its work.
# TODO(TheJulia): We should likely split install_bootloader into many
# commands at some point, even though that would not be backwards
# compatible. We could at least begin to delineate the commands apart
# over the next cycle or two so we don't need a command timeout
# extension factor.
return self._command(node=node,
method='image.install_bootloader',
params=params,
wait=True)
wait=True,
command_timeout_factor=2)

@METRICS.timer('AgentClient.get_clean_steps')
def get_clean_steps(self, node, ports):


+ 2
- 1
ironic/tests/unit/drivers/modules/test_agent_client.py View File

@@ -282,7 +282,8 @@ class TestAgentClient(base.TestCase):
self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid,
prep_boot_part_uuid=prep_boot_part_uuid)
self.client._command.assert_called_once_with(
node=self.node, method='image.install_bootloader', params=params,
command_timeout_factor=2, node=self.node,
method='image.install_bootloader', params=params,
wait=True)

def test_install_bootloader(self):


+ 8
- 0
releasenotes/notes/extends-install-bootloader-timeout-8fce9590bf405cdf.yaml View File

@@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an agent command issue in the bootloader installation process that
can present itself as a connection timeout under heavy IO load conditions.
Now installation commands have an internal timeout which is double the
conductor wide ``[agent]command_timeout``. For more information, see
bug `2007483 <https://storyboard.openstack.org/#!/story/2007483>`_.

Loading…
Cancel
Save