diff --git a/ironic/api/controllers/v1/ramdisk.py b/ironic/api/controllers/v1/ramdisk.py index d29f2ace04..cab6ef28cb 100644 --- a/ironic/api/controllers/v1/ramdisk.py +++ b/ironic/api/controllers/v1/ramdisk.py @@ -184,6 +184,19 @@ class HeartbeatController(rest.RestController): policy.authorize('baremetal:node:ipa_heartbeat', cdict, cdict) rpc_node = api_utils.get_rpc_node_with_suffix(node_ident) + dii = rpc_node['driver_internal_info'] + agent_url = dii.get('agent_url') + # If we have an agent_url on file, and we get something different + # we should fail because this is unexpected behavior of the agent. + if (agent_url is not None + and agent_url != callback_url): + LOG.error('Received heartbeat for node %(node)s with ' + 'callback URL %(url)s. This is not expected, ' + 'and the heartbeat will not be processed.', + {'node': rpc_node.uuid, 'url': callback_url}) + raise exception.Invalid( + _('Detected change in ramdisk provided ' + '"callback_url"')) try: topic = api.request.rpcapi.get_topic_for(rpc_node) diff --git a/ironic/conductor/manager.py b/ironic/conductor/manager.py index 15e79dd78b..0e4cbb28e7 100644 --- a/ironic/conductor/manager.py +++ b/ironic/conductor/manager.py @@ -598,6 +598,8 @@ class ConductorManager(base_manager.BaseConductorManager): node_id, purpose='node rescue') as task: node = task.node + # Record of any pre-existing agent_url should be removed. + utils.remove_agent_url(node) if node.maintenance: raise exception.NodeInMaintenance(op=_('rescuing'), node=node.uuid) @@ -697,6 +699,9 @@ class ConductorManager(base_manager.BaseConductorManager): with task_manager.acquire(context, node_id, purpose='node unrescue') as task: node = task.node + # Record of any pre-existing agent_url should be removed, + # Not that there should be. + utils.remove_agent_url(node) if node.maintenance: raise exception.NodeInMaintenance(op=_('unrescuing'), node=node.uuid) @@ -776,6 +781,7 @@ class ConductorManager(base_manager.BaseConductorManager): info_message = _('Rescue operation aborted for node %s.') % node.uuid last_error = _('By request, the rescue operation was aborted.') node.refresh() + utils.remove_agent_url(node) node.last_error = last_error node.save() LOG.info(info_message) @@ -819,6 +825,8 @@ class ConductorManager(base_manager.BaseConductorManager): with task_manager.acquire(context, node_id, shared=False, purpose='node deployment') as task: node = task.node + # Record of any pre-existing agent_url should be removed. + utils.remove_agent_url(node) if node.maintenance: raise exception.NodeInMaintenance(op=_('provisioning'), node=node.uuid) @@ -972,6 +980,8 @@ class ConductorManager(base_manager.BaseConductorManager): with task_manager.acquire(context, node_id, shared=False, purpose='node tear down') as task: + # Record of any pre-existing agent_url should be removed. + utils.remove_agent_url(task.node) if task.node.protected: raise exception.NodeProtected(node=task.node.uuid) @@ -1168,7 +1178,8 @@ class ConductorManager(base_manager.BaseConductorManager): with task_manager.acquire(context, node_id, shared=False, purpose='node manual cleaning') as task: node = task.node - + # Record of any pre-existing agent_url should be removed. + utils.remove_agent_url(node) if node.maintenance: raise exception.NodeInMaintenance(op=_('cleaning'), node=node.uuid) @@ -1473,6 +1484,8 @@ class ConductorManager(base_manager.BaseConductorManager): driver_internal_info.pop('clean_step_index', None) driver_internal_info.pop('cleaning_reboot', None) driver_internal_info.pop('cleaning_polling', None) + # Remove agent_url + driver_internal_info.pop('agent_url', None) node.driver_internal_info = driver_internal_info node.save() try: @@ -1562,6 +1575,7 @@ class ConductorManager(base_manager.BaseConductorManager): info.pop('cleaning_reboot', None) info.pop('cleaning_polling', None) info.pop('skip_current_clean_step', None) + info.pop('agent_url', None) node.driver_internal_info = info node.save() LOG.info(info_message) @@ -3979,6 +3993,8 @@ def _do_next_deploy_step(task, step_index, conductor_id): driver_internal_info.pop('deploy_step_index', None) driver_internal_info.pop('deployment_reboot', None) driver_internal_info.pop('deployment_polling', None) + # Remove the agent_url cached from the deployment. + driver_internal_info.pop('agent_url', None) node.driver_internal_info = driver_internal_info node.save() @@ -4202,6 +4218,9 @@ def _do_inspect_hardware(task): log_func("Failed to inspect node %(node)s: %(err)s", {'node': node.uuid, 'err': e}) + # Remove agent_url, while not strictly needed for the inspection path, + # lets just remove it out of good practice. + utils.remove_agent_url(node) try: new_state = task.driver.inspect.inspect_hardware(task) except exception.IronicException as e: diff --git a/ironic/conductor/utils.py b/ironic/conductor/utils.py index ebd75238f7..2ba02f89e6 100644 --- a/ironic/conductor/utils.py +++ b/ironic/conductor/utils.py @@ -417,6 +417,9 @@ def cleaning_error_handler(task, msg, tear_down_cleaning=True, info.pop('cleaning_reboot', None) info.pop('cleaning_polling', None) info.pop('skip_current_clean_step', None) + # We don't need to keep the old agent URL + # as it should change upon the next cleaning attempt. + info.pop('agent_url', None) node.driver_internal_info = info # For manual cleaning, the target provision state is MANAGEABLE, whereas # for automated cleaning, it is AVAILABLE. @@ -477,6 +480,9 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False, info.pop('deployment_reboot', None) info.pop('deployment_polling', None) info.pop('skip_current_deploy_step', None) + # Remove agent_url since it will be re-asserted + # upon the next deployment attempt. + info.pop('agent_url', None) node.driver_internal_info = info if cleanup_err: @@ -521,6 +527,7 @@ def rescuing_error_handler(task, msg, set_fail_state=True): try: node_power_action(task, states.POWER_OFF) task.driver.rescue.clean_up(task) + remove_agent_url(node) node.last_error = msg except exception.IronicException as e: node.last_error = (_('Rescue operation was unsuccessful, clean up ' @@ -535,6 +542,7 @@ def rescuing_error_handler(task, msg, set_fail_state=True): LOG.exception('Rescue failed for node %(node)s, an exception was ' 'encountered while aborting.', {'node': node.uuid}) finally: + remove_agent_url(node) node.save() if set_fail_state: @@ -914,3 +922,10 @@ def is_fast_track(task): task.node.driver_internal_info.get('agent_last_heartbeat'), CONF.deploy.fast_track_timeout) and task.driver.power.get_power_state(task) == states.POWER_ON) + + +def remove_agent_url(node): + """Helper to remove the agent_url record.""" + info = node.driver_internal_info + info.pop('agent_url', None) + node.driver_internal_info = info diff --git a/ironic/tests/unit/api/controllers/v1/test_ramdisk.py b/ironic/tests/unit/api/controllers/v1/test_ramdisk.py index 43162adc1d..0bdd9dfbeb 100644 --- a/ironic/tests/unit/api/controllers/v1/test_ramdisk.py +++ b/ironic/tests/unit/api/controllers/v1/test_ramdisk.py @@ -255,3 +255,15 @@ class TestHeartbeat(test_api_base.BaseApiTest): headers={api_base.Version.string: '1.35'}, expect_errors=True) self.assertEqual(http_client.BAD_REQUEST, response.status_int) + + @mock.patch.object(rpcapi.ConductorAPI, 'heartbeat', autospec=True) + def test_heartbeat_rejects_different_callback_url(self, mock_heartbeat): + node = obj_utils.create_test_node( + self.context, + driver_internal_info={'agent_url': 'url'}) + response = self.post_json( + '/heartbeat/%s' % node.uuid, + {'callback_url': 'url2'}, + headers={api_base.Version.string: str(api_v1.max_version())}, + expect_errors=True) + self.assertEqual(http_client.BAD_REQUEST, response.status_int) diff --git a/ironic/tests/unit/conductor/test_manager.py b/ironic/tests/unit/conductor/test_manager.py index de2bc5f720..5fb5c06d6b 100644 --- a/ironic/tests/unit/conductor/test_manager.py +++ b/ironic/tests/unit/conductor/test_manager.py @@ -1352,9 +1352,11 @@ class ServiceDoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, autospec=True) as mock_spawn: mock_spawn.return_value = thread - node = obj_utils.create_test_node(self.context, - driver='fake-hardware', - provision_state=states.AVAILABLE) + node = obj_utils.create_test_node( + self.context, + driver='fake-hardware', + provision_state=states.AVAILABLE, + driver_internal_info={'agent_url': 'url'}) self.service.do_node_deploy(self.context, node.uuid) self._stop_service() @@ -1369,6 +1371,7 @@ class ServiceDoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, mock.ANY, None) mock_iwdi.assert_called_once_with(self.context, node.instance_info) self.assertFalse(node.driver_internal_info['is_whole_disk_image']) + self.assertNotIn('agent_url', node.driver_internal_info) @mock.patch('ironic.drivers.modules.fake.FakeDeploy.deploy') def test_do_node_deploy_rebuild_active_state_old(self, mock_deploy, @@ -2673,7 +2676,8 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, def test__do_next_deploy_step_all(self, mock_execute): # Run all steps from start to finish (all synchronous) driver_internal_info = {'deploy_step_index': None, - 'deploy_steps': self.deploy_steps} + 'deploy_steps': self.deploy_steps, + 'agent_url': 'url'} self._start_service() node = obj_utils.create_test_node( self.context, driver='fake-hardware', @@ -2695,6 +2699,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin, self.assertIsNone(node.driver_internal_info['deploy_steps']) mock_execute.assert_has_calls = [mock.call(self.deploy_steps[0]), mock.call(self.deploy_steps[1])] + self.assertNotIn('agent_url', node.driver_internal_info) @mock.patch.object(conductor_utils, 'LOG', autospec=True) @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step', @@ -3869,7 +3874,8 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.context, driver='fake-hardware', provision_state=states.CLEANING, target_provision_state=states.AVAILABLE, - last_error=None) + last_error=None, + driver_internal_info={'agent_url': 'url'}) with task_manager.acquire( self.context, node.uuid, shared=False) as task: self.service._do_node_clean(task) @@ -3879,6 +3885,7 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): # Assert that the node was cleaned self.assertTrue(mock_validate.called) self.assertIn('clean_steps', node.driver_internal_info) + self.assertNotIn('agent_url', node.driver_internal_info) @mock.patch('ironic.drivers.modules.fake.FakePower.validate', autospec=True) @@ -4614,7 +4621,8 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, task = self._create_task( node_attrs=dict(driver='fake-hardware', provision_state=states.ACTIVE, - instance_info={})) + instance_info={}, + driver_internal_info={'agent_url': 'url'})) mock_acquire.side_effect = self._get_acquire_side_effect(task) self.service.do_node_rescue(self.context, task.node.uuid, "password") @@ -4624,6 +4632,7 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, call_args=(self.service._do_node_rescue, task), err_handler=conductor_utils.spawn_rescue_error_handler) self.assertIn('rescue_password', task.node.instance_info) + self.assertNotIn('agent_url', task.node.driver_internal_info) def test_do_node_rescue_invalid_state(self): self._start_service() @@ -4755,9 +4764,12 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, self._start_service() task = self._create_task( node_attrs=dict(driver='fake-hardware', - provision_state=states.RESCUE)) + provision_state=states.RESCUE, + driver_internal_info={'agent_url': 'url'})) mock_acquire.side_effect = self._get_acquire_side_effect(task) self.service.do_node_unrescue(self.context, task.node.uuid) + task.node.refresh() + self.assertNotIn('agent_url', task.node.driver_internal_info) task.process_event.assert_called_once_with( 'unrescue', callback=self.service._spawn_worker, @@ -4891,12 +4903,14 @@ class DoNodeRescueTestCase(mgr_utils.CommonMixIn, mgr_utils.ServiceSetUpMixin, node = obj_utils.create_test_node( self.context, driver='fake-hardware', provision_state=states.RESCUEFAIL, - target_provision_state=states.RESCUE) + target_provision_state=states.RESCUE, + driver_internal_info={'agent_url': 'url'}) with task_manager.acquire(self.context, node.uuid) as task: self.service._do_node_rescue_abort(task) clean_up_mock.assert_called_once_with(task.driver.rescue, task) self.assertIsNotNone(task.node.last_error) self.assertFalse(task.node.maintenance) + self.assertNotIn('agent_url', task.node.driver_internal_info) @mock.patch.object(fake.FakeRescue, 'clean_up', autospec=True) def test__do_node_rescue_abort_clean_up_fail(self, clean_up_mock): @@ -7887,8 +7901,10 @@ class NodeInspectHardware(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): @mock.patch('ironic.drivers.modules.fake.FakeInspect.inspect_hardware') def test_inspect_hardware_ok(self, mock_inspect): self._start_service() - node = obj_utils.create_test_node(self.context, driver='fake-hardware', - provision_state=states.INSPECTING) + node = obj_utils.create_test_node( + self.context, driver='fake-hardware', + provision_state=states.INSPECTING, + driver_internal_info={'agent_url': 'url'}) task = task_manager.TaskManager(self.context, node.uuid) mock_inspect.return_value = states.MANAGEABLE manager._do_inspect_hardware(task) @@ -7897,6 +7913,8 @@ class NodeInspectHardware(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase): self.assertEqual(states.NOSTATE, node.target_provision_state) self.assertIsNone(node.last_error) mock_inspect.assert_called_once_with(mock.ANY) + task.node.refresh() + self.assertNotIn('agent_url', task.node.driver_internal_info) @mock.patch('ironic.drivers.modules.fake.FakeInspect.inspect_hardware') def test_inspect_hardware_return_inspecting(self, mock_inspect): diff --git a/ironic/tests/unit/conductor/test_utils.py b/ironic/tests/unit/conductor/test_utils.py index 1526b2ec98..86d51b7055 100644 --- a/ironic/tests/unit/conductor/test_utils.py +++ b/ironic/tests/unit/conductor/test_utils.py @@ -920,6 +920,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): info['deployment_reboot'] = True info['deployment_polling'] = True info['skip_current_deploy_step'] = True + info['agent_url'] = 'url' conductor_utils.deploying_error_handler(self.task, self.logmsg, self.errmsg) @@ -932,6 +933,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase): self.assertNotIn('deployment_polling', self.node.driver_internal_info) self.assertNotIn('skip_current_deploy_step', self.node.driver_internal_info) + self.assertNotIn('agent_url', self.node.driver_internal_info) self.task.process_event.assert_called_once_with('fail') def _test_deploying_error_handler_cleanup(self, exc, expected_str): @@ -1059,7 +1061,8 @@ class ErrorHandlersTestCase(tests_base.TestCase): 'cleaning_reboot': True, 'cleaning_polling': True, 'skip_current_clean_step': True, - 'clean_step_index': 0} + 'clean_step_index': 0, + 'agent_url': 'url'} msg = 'error bar' conductor_utils.cleaning_error_handler(self.task, msg) self.node.save.assert_called_once_with() @@ -1080,6 +1083,7 @@ class ErrorHandlersTestCase(tests_base.TestCase): else: self.task.process_event.assert_called_once_with('fail', target_state=None) + self.assertNotIn('agent_url', self.node.driver_internal_info) def test_cleaning_error_handler(self): self._test_cleaning_error_handler() @@ -1254,12 +1258,14 @@ class ErrorHandlersTestCase(tests_base.TestCase): def _test_rescuing_error_handler(self, node_power_mock, set_state=True): self.node.provision_state = states.RESCUEWAIT + self.node.driver_internal_info.update({'agent_url': 'url'}) conductor_utils.rescuing_error_handler(self.task, 'some exception for node', set_fail_state=set_state) node_power_mock.assert_called_once_with(mock.ANY, states.POWER_OFF) self.task.driver.rescue.clean_up.assert_called_once_with(self.task) self.node.save.assert_called_once_with() + self.assertNotIn('agent_url', self.node.driver_internal_info) if set_state: self.assertTrue(self.task.process_event.called) else: diff --git a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py index afb14c02f6..6710b97293 100644 --- a/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py +++ b/ironic/tests/unit/drivers/modules/test_agent_base_vendor.py @@ -269,9 +269,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest): shared=True) as task: self.deploy.heartbeat(task, agent_url, '3.2.0') self.assertFalse(task.shared) - self.assertEqual( - agent_url, - task.node.driver_internal_info['agent_url']) + self.assertIsNone( + task.node.driver_internal_info.get('agent_url', None)) self.assertEqual( '3.2.0', task.node.driver_internal_info['agent_version']) diff --git a/releasenotes/notes/prevent-callback-url-from-being-updated-41d50b20fb236e82.yaml b/releasenotes/notes/prevent-callback-url-from-being-updated-41d50b20fb236e82.yaml new file mode 100644 index 0000000000..da21ea79e1 --- /dev/null +++ b/releasenotes/notes/prevent-callback-url-from-being-updated-41d50b20fb236e82.yaml @@ -0,0 +1,10 @@ +--- +security: + - | + Prevents additional updates of an agent ``callback_url`` through the agent + heartbeat ``/v1/heartbeat/`` endpoint as the ``callback_url`` + should remain stable through the cleaning, provisioning, or rescue + processes. Should anything such as an unexpected agent reboot cause the + ``callback_url``, heartbeat operations will now be ignored. + More information can be found at + `story 2006773 `_.