From 4829df2966190595feb69623792678816bdcd6a2 Mon Sep 17 00:00:00 2001 From: Julia Kreger Date: Mon, 30 Mar 2020 09:47:23 -0700 Subject: [PATCH] Extend install_bootloader command timeout Adds a new command_timeout_factor argument to the agent client _command method with a default value of 1, and sets the install_bootloader command to send a value of 2 to extend the timeout factor to give the agent time to complete its work before giving up on installing the bootloader and automatically retrying. Change-Id: Iab72903c3d30b0ae5b80a5115fc635179869eeee Story: 2007483 Task: 39201 --- ironic/conf/agent.py | 6 ++++- ironic/drivers/modules/agent_client.py | 27 ++++++++++++++++--- .../unit/drivers/modules/test_agent_client.py | 3 ++- ...l-bootloader-timeout-8fce9590bf405cdf.yaml | 8 ++++++ 4 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 releasenotes/notes/extends-install-bootloader-timeout-8fce9590bf405cdf.yaml diff --git a/ironic/conf/agent.py b/ironic/conf/agent.py index 0e4be55674..f0cb7c8208 100644 --- a/ironic/conf/agent.py +++ b/ironic/conf/agent.py @@ -104,7 +104,11 @@ opts = [ 'service.')), cfg.IntOpt('command_timeout', default=60, - help=_('Timeout (in seconds) for IPA commands.')), + help=_('Timeout (in seconds) for IPA commands. ' + 'Please note, the bootloader installation command ' + 'to the agent is permitted a timeout of twice the ' + 'value set here as these are IO heavy operations ' + 'depending on the configuration of the instance.')), cfg.IntOpt('max_command_attempts', default=3, help=_('This is the maximum number of attempts that will be ' diff --git a/ironic/drivers/modules/agent_client.py b/ironic/drivers/modules/agent_client.py index 3b26bda23e..2644fa46d2 100644 --- a/ironic/drivers/modules/agent_client.py +++ b/ironic/drivers/modules/agent_client.py @@ -61,7 +61,8 @@ class AgentClient(object): retry_on_exception=( lambda e: isinstance(e, exception.AgentConnectionFailed)), stop_max_attempt_number=CONF.agent.max_command_attempts) - def _command(self, node, method, params, wait=False): + def _command(self, node, method, params, wait=False, + command_timeout_factor=1): """Sends command to agent. :param node: A Node object. @@ -71,6 +72,13 @@ class AgentClient(object): body. :param wait: True to wait for the command to finish executing, False otherwise. + :param command_timeout_factor: An integer, default 1, by which to + multiply the [agent]command_timeout + value. This is intended for use with + extremely long running commands to + the agent ramdisk where a general + timeout value should not be extended + in all cases. :raises: IronicException when failed to issue the request or there was a malformed response from the agent. :raises: AgentAPIError when agent failed to execute specified command. @@ -89,8 +97,9 @@ class AgentClient(object): {'node': node.uuid, 'method': method}) try: - response = self.session.post(url, params=request_params, data=body, - timeout=CONF.agent.command_timeout) + response = self.session.post( + url, params=request_params, data=body, + timeout=CONF.agent.command_timeout * command_timeout_factor) except (requests.ConnectionError, requests.Timeout) as e: msg = (_('Failed to connect to the agent running on node %(node)s ' 'for invoking command %(method)s. Error: %(error)s') % @@ -271,10 +280,20 @@ class AgentClient(object): params = {'root_uuid': root_uuid, 'efi_system_part_uuid': efi_system_part_uuid, 'prep_boot_part_uuid': prep_boot_part_uuid} + + # NOTE(TheJulia): This command explicitly sends a larger timeout + # factor to the _command call such that the agent ramdisk has enough + # time to perform its work. + # TODO(TheJulia): We should likely split install_bootloader into many + # commands at some point, even though that would not be backwards + # compatible. We could at least begin to delineate the commands apart + # over the next cycle or two so we don't need a command timeout + # extension factor. return self._command(node=node, method='image.install_bootloader', params=params, - wait=True) + wait=True, + command_timeout_factor=2) @METRICS.timer('AgentClient.get_clean_steps') def get_clean_steps(self, node, ports): diff --git a/ironic/tests/unit/drivers/modules/test_agent_client.py b/ironic/tests/unit/drivers/modules/test_agent_client.py index 953c7d9ec9..790425e1e4 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_client.py +++ b/ironic/tests/unit/drivers/modules/test_agent_client.py @@ -282,7 +282,8 @@ class TestAgentClient(base.TestCase): self.node, root_uuid, efi_system_part_uuid=efi_system_part_uuid, prep_boot_part_uuid=prep_boot_part_uuid) self.client._command.assert_called_once_with( - node=self.node, method='image.install_bootloader', params=params, + command_timeout_factor=2, node=self.node, + method='image.install_bootloader', params=params, wait=True) def test_install_bootloader(self): diff --git a/releasenotes/notes/extends-install-bootloader-timeout-8fce9590bf405cdf.yaml b/releasenotes/notes/extends-install-bootloader-timeout-8fce9590bf405cdf.yaml new file mode 100644 index 0000000000..9adf46510e --- /dev/null +++ b/releasenotes/notes/extends-install-bootloader-timeout-8fce9590bf405cdf.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes an agent command issue in the bootloader installation process that + can present itself as a connection timeout under heavy IO load conditions. + Now installation commands have an internal timeout which is double the + conductor wide ``[agent]command_timeout``. For more information, see + bug `2007483 `_.