diff --git a/ironic/drivers/modules/pxe_base.py b/ironic/drivers/modules/pxe_base.py index 317b65b855..78d7b59878 100644 --- a/ironic/drivers/modules/pxe_base.py +++ b/ironic/drivers/modules/pxe_base.py @@ -490,6 +490,12 @@ class PXEBaseMixin(object): def _should_retry_boot(node): # NOTE(dtantsur): this assumes IPA, do we need to make it generic? for field in ('agent_last_heartbeat', 'last_power_state_change'): + if node.driver_internal_info.get('agent_secret_token', False): + LOG.debug('Not retrying PXE boot for node %(node)s; an agent ' + 'token has been identified, meaning the agent ' + 'has started.', + {'node': node.uuid}) + return False if manager_utils.value_within_timeout( node.driver_internal_info.get(field), CONF.pxe.boot_retry_timeout): diff --git a/ironic/tests/unit/drivers/modules/test_pxe.py b/ironic/tests/unit/drivers/modules/test_pxe.py index d999a8f7ae..be48f890e7 100644 --- a/ironic/tests/unit/drivers/modules/test_pxe.py +++ b/ironic/tests/unit/drivers/modules/test_pxe.py @@ -1277,6 +1277,18 @@ class PXEBootRetryTestCase(db_base.DbTestCase): mock_boot_dev.assert_called_once_with(task, 'pxe', persistent=False) + def test_check_boot_status_not_retry_with_token(self, mock_power, + mock_boot_dev): + with task_manager.acquire(self.context, self.node.uuid, + shared=True) as task: + task.node.driver_internal_info = { + 'agent_secret_token': 'xyz' + } + task.driver.boot._check_boot_status(task) + self.assertTrue(task.shared) + mock_power.assert_not_called() + mock_boot_dev.assert_not_called() + class iPXEBootRetryTestCase(PXEBootRetryTestCase): diff --git a/releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml b/releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml new file mode 100644 index 0000000000..5db6db6ecb --- /dev/null +++ b/releasenotes/notes/prevent-pxe-retry-when-token-exists-a4f38f7da56c1397.yaml @@ -0,0 +1,7 @@ +--- +fixes: + - | + Fixes a race condition in PXE initialization where logic to retry 
+ what we suspect to be failed PXE boot operations was not + checking whether an ``agent token`` had been established, which is the + very first step in agent initialization.