Browse Source

Asynchronous out of band deploy steps fails to execute

Asynchronous out of band steps in a deploy template fails to
execute. This commit fixes that issue. Asynchronous steps can
set 'skip_current_deploy_step' flag to False in
'driver_internal_info' to make sure that upon reboot same step
is re-executed. Also it can set 'deployment_reboot' flag to True
in 'driver_internal_info' to signal that it has rebooted the node.

Co-Authored-By: Mark Goddard <mark@stackhpc.com>
Change-Id: If6217afb5453c311d5ca71ba37458a9b97c18395
Story: 2006342
Task: 36095
(cherry picked from commit 8f907886a1)
changes/52/676152/2
Shivanand Tendulker 1 month ago
parent
commit
ba0e73fa15

+ 26
- 4
ironic/conductor/manager.py View File

@@ -875,8 +875,9 @@ class ConductorManager(base_manager.BaseConductorManager):
875 875
                     action=event, node=task.node.uuid,
876 876
                     state=task.node.provision_state)
877 877
 
878
-    def _get_node_next_deploy_steps(self, task):
879
-        return self._get_node_next_steps(task, 'deploy')
878
+    def _get_node_next_deploy_steps(self, task, skip_current_step=True):
879
+        return self._get_node_next_steps(task, 'deploy',
880
+                                         skip_current_step=skip_current_step)
880 881
 
881 882
     @METRICS.timer('ConductorManager.continue_node_deploy')
882 883
     def continue_node_deploy(self, context, node_id):
@@ -914,7 +915,17 @@ class ConductorManager(base_manager.BaseConductorManager):
914 915
                      'state': node.provision_state,
915 916
                      'deploy_state': ', '.join(expected_states)})
916 917
 
917
-            next_step_index = self._get_node_next_deploy_steps(task)
918
+            info = node.driver_internal_info
919
+            try:
920
+                skip_current_step = info.pop('skip_current_deploy_step')
921
+            except KeyError:
922
+                skip_current_step = True
923
+            else:
924
+                node.driver_internal_info = info
925
+                node.save()
926
+
927
+            next_step_index = self._get_node_next_deploy_steps(
928
+                task, skip_current_step=skip_current_step)
918 929
 
919 930
             # TODO(rloo): When deprecation period is over and node is in
920 931
             # states.DEPLOYWAIT only, delete the 'if' and always 'resume'.
@@ -3836,6 +3847,16 @@ def _do_next_deploy_step(task, step_index, conductor_id):
3836 3847
         try:
3837 3848
             result = interface.execute_deploy_step(task, step)
3838 3849
         except exception.IronicException as e:
3850
+            if isinstance(e, exception.AgentConnectionFailed):
3851
+                if task.node.driver_internal_info.get('deployment_reboot'):
3852
+                    LOG.info('Agent is not yet running on node %(node)s after '
3853
+                             'deployment reboot, waiting for agent to come up '
3854
+                             'to run next deploy step %(step)s.',
3855
+                             {'node': node.uuid, 'step': step})
3856
+                    driver_internal_info['skip_current_deploy_step'] = False
3857
+                    node.driver_internal_info = driver_internal_info
3858
+                    task.process_event('wait')
3859
+                    return
3839 3860
             log_msg = ('Node %(node)s failed deploy step %(step)s. Error: '
3840 3861
                        '%(err)s' %
3841 3862
                        {'node': node.uuid, 'step': node.deploy_step, 'err': e})
@@ -3860,7 +3881,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
3860 3881
             node.save()
3861 3882
 
3862 3883
         # Check if the step is done or not. The step should return
3863
-        # states.CLEANWAIT if the step is still being executed, or
3884
+        # states.DEPLOYWAIT if the step is still being executed, or
3864 3885
         # None if the step is done.
3865 3886
         # NOTE(deva): Some drivers may return states.DEPLOYWAIT
3866 3887
         #             eg. if they are waiting for a callback
@@ -3890,6 +3911,7 @@ def _do_next_deploy_step(task, step_index, conductor_id):
3890 3911
     driver_internal_info = node.driver_internal_info
3891 3912
     driver_internal_info['deploy_steps'] = None
3892 3913
     driver_internal_info.pop('deploy_step_index', None)
3914
+    driver_internal_info.pop('deployment_reboot', None)
3893 3915
     node.driver_internal_info = driver_internal_info
3894 3916
     node.save()
3895 3917
 

+ 2
- 0
ironic/conductor/utils.py View File

@@ -461,6 +461,8 @@ def deploying_error_handler(task, logmsg, errmsg, traceback=False,
461 461
         node.deploy_step = {}
462 462
         info = node.driver_internal_info
463 463
         info.pop('deploy_step_index', None)
464
+        # Clear any leftover metadata about deployment reboots
465
+        info.pop('deployment_reboot', None)
464 466
         node.driver_internal_info = info
465 467
 
466 468
     if cleanup_err:

+ 8
- 1
ironic/drivers/modules/agent.py View File

@@ -463,7 +463,14 @@ class AgentDeploy(AgentDeployMixin, base.DeployInterface):
463 463
             task.process_event('wait')
464 464
             self.continue_deploy(task)
465 465
         elif task.driver.storage.should_write_image(task):
466
-            manager_utils.node_power_action(task, states.REBOOT)
466
+            # Check if the driver has already performed a reboot in a previous
467
+            # deploy step.
468
+            if not task.node.driver_internal_info.get('deployment_reboot'):
469
+                manager_utils.node_power_action(task, states.REBOOT)
470
+            info = task.node.driver_internal_info
471
+            info.pop('deployment_reboot', None)
472
+            task.node.driver_internal_info = info
473
+            task.node.save()
467 474
             return states.DEPLOYWAIT
468 475
         else:
469 476
             # TODO(TheJulia): At some point, we should de-dupe this code

+ 3
- 0
ironic/drivers/modules/agent_base_vendor.py View File

@@ -366,6 +366,9 @@ class HeartbeatMixin(object):
366 366
                     else:
367 367
                         node.touch_provisioning()
368 368
                 else:
369
+                    # The exceptions from RPC are not possible as we using cast
370
+                    # here
371
+                    manager_utils.notify_conductor_resume_deploy(task)
369 372
                     node.touch_provisioning()
370 373
             elif node.provision_state == states.CLEANWAIT:
371 374
                 node.touch_provisioning()

+ 10
- 1
ironic/drivers/modules/iscsi_deploy.py View File

@@ -421,7 +421,16 @@ class ISCSIDeploy(AgentDeployMixin, base.DeployInterface):
421 421
             # Standard deploy process
422 422
             deploy_utils.cache_instance_image(task.context, node)
423 423
             check_image_size(task)
424
-            manager_utils.node_power_action(task, states.REBOOT)
424
+            # Check if the driver has already performed a reboot in a previous
425
+            # deploy step.
426
+            if not task.node.driver_internal_info.get('deployment_reboot',
427
+                                                      False):
428
+                manager_utils.node_power_action(task, states.REBOOT)
429
+            info = task.node.driver_internal_info
430
+            info.pop('deployment_reboot', None)
431
+            task.node.driver_internal_info = info
432
+            task.node.save()
433
+
425 434
             return states.DEPLOYWAIT
426 435
         else:
427 436
             # Boot to an Storage Volume

+ 114
- 3
ironic/tests/unit/conductor/test_manager.py View File

@@ -1845,6 +1845,109 @@ class ContinueNodeDeployTestCase(mgr_utils.ServiceSetUpMixin,
1845 1845
                                       mock.ANY, 1, mock.ANY)
1846 1846
         self.assertFalse(mock_event.called)
1847 1847
 
1848
+    @mock.patch('ironic.conductor.manager.ConductorManager._spawn_worker',
1849
+                autospec=True)
1850
+    def _continue_node_deploy_skip_step(self, mock_spawn, skip=True):
1851
+        # test that skipping current step mechanism works
1852
+        driver_info = {'deploy_steps': self.deploy_steps,
1853
+                       'deploy_step_index': 0}
1854
+        if not skip:
1855
+            driver_info['skip_current_deploy_step'] = skip
1856
+        node = obj_utils.create_test_node(
1857
+            self.context, driver='fake-hardware',
1858
+            provision_state=states.DEPLOYWAIT,
1859
+            target_provision_state=states.MANAGEABLE,
1860
+            driver_internal_info=driver_info, deploy_step=self.deploy_steps[0])
1861
+        self._start_service()
1862
+        self.service.continue_node_deploy(self.context, node.uuid)
1863
+        self._stop_service()
1864
+        node.refresh()
1865
+        if skip:
1866
+            expected_step_index = 1
1867
+        else:
1868
+            self.assertNotIn(
1869
+                'skip_current_deploy_step', node.driver_internal_info)
1870
+            expected_step_index = 0
1871
+        mock_spawn.assert_called_with(mock.ANY, manager._do_next_deploy_step,
1872
+                                      mock.ANY, expected_step_index, mock.ANY)
1873
+
1874
+    def test_continue_node_deploy_skip_step(self):
1875
+        self._continue_node_deploy_skip_step()
1876
+
1877
+    def test_continue_node_deploy_no_skip_step(self):
1878
+        self._continue_node_deploy_skip_step(skip=False)
1879
+
1880
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
1881
+                autospec=True)
1882
+    def test_do_next_deploy_step_oob_reboot(self, mock_execute):
1883
+        # When a deploy step fails, go to DEPLOYWAIT
1884
+        tgt_prov_state = states.ACTIVE
1885
+
1886
+        self._start_service()
1887
+        node = obj_utils.create_test_node(
1888
+            self.context, driver='fake-hardware',
1889
+            provision_state=states.DEPLOYING,
1890
+            target_provision_state=tgt_prov_state,
1891
+            last_error=None,
1892
+            driver_internal_info={'deploy_steps': self.deploy_steps,
1893
+                                  'deploy_step_index': None,
1894
+                                  'deployment_reboot': True},
1895
+            clean_step={})
1896
+        mock_execute.side_effect = exception.AgentConnectionFailed(
1897
+            reason='failed')
1898
+
1899
+        with task_manager.acquire(
1900
+                self.context, node.uuid, shared=False) as task:
1901
+            manager._do_next_deploy_step(task, 0, mock.ANY)
1902
+
1903
+        self._stop_service()
1904
+        node.refresh()
1905
+
1906
+        # Make sure we go to CLEANWAIT
1907
+        self.assertEqual(states.DEPLOYWAIT, node.provision_state)
1908
+        self.assertEqual(tgt_prov_state, node.target_provision_state)
1909
+        self.assertEqual(self.deploy_steps[0], node.deploy_step)
1910
+        self.assertEqual(0, node.driver_internal_info['deploy_step_index'])
1911
+        self.assertFalse(node.driver_internal_info['skip_current_deploy_step'])
1912
+        mock_execute.assert_called_once_with(
1913
+            mock.ANY, mock.ANY, self.deploy_steps[0])
1914
+
1915
+    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_deploy_step',
1916
+                autospec=True)
1917
+    def test_do_next_clean_step_oob_reboot_fail(self,
1918
+                                                mock_execute):
1919
+        # When a deploy step fails with no reboot requested go to DEPLOYFAIL
1920
+        tgt_prov_state = states.ACTIVE
1921
+
1922
+        self._start_service()
1923
+        node = obj_utils.create_test_node(
1924
+            self.context, driver='fake-hardware',
1925
+            provision_state=states.DEPLOYING,
1926
+            target_provision_state=tgt_prov_state,
1927
+            last_error=None,
1928
+            driver_internal_info={'deploy_steps': self.deploy_steps,
1929
+                                  'deploy_step_index': None},
1930
+            deploy_step={})
1931
+        mock_execute.side_effect = exception.AgentConnectionFailed(
1932
+            reason='failed')
1933
+
1934
+        with task_manager.acquire(
1935
+                self.context, node.uuid, shared=False) as task:
1936
+            manager._do_next_deploy_step(task, 0, mock.ANY)
1937
+
1938
+        self._stop_service()
1939
+        node.refresh()
1940
+
1941
+        # Make sure we go to DEPLOYFAIL, clear deploy_steps
1942
+        self.assertEqual(states.DEPLOYFAIL, node.provision_state)
1943
+        self.assertEqual(tgt_prov_state, node.target_provision_state)
1944
+        self.assertEqual({}, node.deploy_step)
1945
+        self.assertNotIn('deploy_step_index', node.driver_internal_info)
1946
+        self.assertNotIn('skip_current_deploy_step', node.driver_internal_info)
1947
+        self.assertIsNotNone(node.last_error)
1948
+        mock_execute.assert_called_once_with(
1949
+            mock.ANY, mock.ANY, self.deploy_steps[0])
1950
+
1848 1951
 
1849 1952
 @mgr_utils.mock_record_keepalive
1850 1953
 class DoNodeDeployTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
@@ -2641,7 +2744,7 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
2641 2744
         mock_execute.assert_called_once_with(mock.ANY, mock.ANY,
2642 2745
                                              self.deploy_steps[0])
2643 2746
 
2644
-    def test__get_node_next_deploy_steps(self):
2747
+    def _test__get_node_next_deploy_steps(self, skip=True):
2645 2748
         driver_internal_info = {'deploy_steps': self.deploy_steps,
2646 2749
                                 'deploy_step_index': 0}
2647 2750
         node = obj_utils.create_test_node(
@@ -2653,8 +2756,16 @@ class DoNextDeployStepTestCase(mgr_utils.ServiceSetUpMixin,
2653 2756
             deploy_step=self.deploy_steps[0])
2654 2757
 
2655 2758
         with task_manager.acquire(self.context, node.uuid) as task:
2656
-            step_index = self.service._get_node_next_deploy_steps(task)
2657
-            self.assertEqual(1, step_index)
2759
+            step_index = self.service._get_node_next_deploy_steps(
2760
+                task, skip_current_step=skip)
2761
+            expected_index = 1 if skip else 0
2762
+            self.assertEqual(expected_index, step_index)
2763
+
2764
+    def test__get_node_next_deploy_steps(self):
2765
+        self._test__get_node_next_deploy_steps()
2766
+
2767
+    def test__get_node_next_deploy_steps_no_skip(self):
2768
+        self._test__get_node_next_deploy_steps(skip=False)
2658 2769
 
2659 2770
     def test__get_node_next_deploy_steps_unset_deploy_step(self):
2660 2771
         driver_internal_info = {'deploy_steps': self.deploy_steps,

+ 1
- 0
ironic/tests/unit/conductor/test_utils.py View File

@@ -917,6 +917,7 @@ class DeployingErrorHandlerTestCase(tests_base.TestCase):
917 917
         self.assertEqual(self.errmsg, self.node.last_error)
918 918
         self.assertEqual({}, self.node.deploy_step)
919 919
         self.assertNotIn('deploy_step_index', self.node.driver_internal_info)
920
+        self.assertNotIn('deployment_reboot', self.node.driver_internal_info)
920 921
         self.task.process_event.assert_called_once_with('fail')
921 922
 
922 923
     def _test_deploying_error_handler_cleanup(self, exc, expected_str):

+ 17
- 0
ironic/tests/unit/drivers/modules/test_agent.py View File

@@ -321,6 +321,23 @@ class TestAgentDeploy(db_base.DbTestCase):
321 321
             power_mock.assert_called_once_with(task, states.REBOOT)
322 322
             self.assertFalse(mock_pxe_instance.called)
323 323
 
324
+    @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
325
+    @mock.patch('ironic.conductor.utils.node_power_action', autospec=True)
326
+    def test_deploy_with_deployment_reboot(self, power_mock,
327
+                                           mock_pxe_instance):
328
+        driver_internal_info = self.node.driver_internal_info
329
+        driver_internal_info['deployment_reboot'] = True
330
+        self.node.driver_internal_info = driver_internal_info
331
+        self.node.save()
332
+        with task_manager.acquire(
333
+                self.context, self.node['uuid'], shared=False) as task:
334
+            driver_return = self.driver.deploy(task)
335
+            self.assertEqual(driver_return, states.DEPLOYWAIT)
336
+            self.assertFalse(power_mock.called)
337
+            self.assertFalse(mock_pxe_instance.called)
338
+            self.assertNotIn(
339
+                'deployment_reboot', task.node.driver_internal_info)
340
+
324 341
     @mock.patch.object(pxe.PXEBoot, 'prepare_instance', autospec=True)
325 342
     @mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
326 343
                        autospec=True)

+ 5
- 1
ironic/tests/unit/drivers/modules/test_agent_base_vendor.py View File

@@ -138,6 +138,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
138 138
             self.assertFalse(cd_mock.called)
139 139
             rti_mock.assert_called_once_with(self.deploy, task)
140 140
 
141
+    @mock.patch.object(manager_utils,
142
+                       'notify_conductor_resume_deploy', autospec=True)
141 143
     @mock.patch.object(agent_base_vendor.HeartbeatMixin,
142 144
                        'in_core_deploy_step', autospec=True)
143 145
     @mock.patch.object(agent_base_vendor.HeartbeatMixin,
@@ -151,7 +153,8 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
151 153
     def test_heartbeat_not_in_core_deploy_step(self, rti_mock, cd_mock,
152 154
                                                deploy_is_done_mock,
153 155
                                                deploy_started_mock,
154
-                                               in_deploy_mock):
156
+                                               in_deploy_mock,
157
+                                               in_resume_deploy_mock):
155 158
         # Check that heartbeats do not trigger deployment actions when not in
156 159
         # the deploy.deploy step.
157 160
         in_deploy_mock.return_value = False
@@ -170,6 +173,7 @@ class HeartbeatMixinTest(AgentDeployMixinBaseTest):
170 173
             self.assertFalse(deploy_is_done_mock.called)
171 174
             self.assertFalse(cd_mock.called)
172 175
             self.assertFalse(rti_mock.called)
176
+            self.assertTrue(in_resume_deploy_mock.called)
173 177
 
174 178
     @mock.patch.object(agent_base_vendor.HeartbeatMixin, 'continue_deploy',
175 179
                        autospec=True)

+ 21
- 0
ironic/tests/unit/drivers/modules/test_iscsi_deploy.py View File

@@ -795,6 +795,27 @@ class ISCSIDeployTestCase(db_base.DbTestCase):
795 795
             mock_check_image_size.assert_called_once_with(task)
796 796
             mock_node_power_action.assert_called_once_with(task, states.REBOOT)
797 797
 
798
+    @mock.patch.object(manager_utils, 'node_power_action', autospec=True)
799
+    @mock.patch.object(iscsi_deploy, 'check_image_size', autospec=True)
800
+    @mock.patch.object(deploy_utils, 'cache_instance_image', autospec=True)
801
+    def test_deploy_with_deployment_reboot(self, mock_cache_instance_image,
802
+                                           mock_check_image_size,
803
+                                           mock_node_power_action):
804
+        driver_internal_info = self.node.driver_internal_info
805
+        driver_internal_info['deployment_reboot'] = True
806
+        self.node.driver_internal_info = driver_internal_info
807
+        self.node.save()
808
+        with task_manager.acquire(self.context,
809
+                                  self.node.uuid, shared=False) as task:
810
+            state = task.driver.deploy.deploy(task)
811
+            self.assertEqual(state, states.DEPLOYWAIT)
812
+            mock_cache_instance_image.assert_called_once_with(
813
+                self.context, task.node)
814
+            mock_check_image_size.assert_called_once_with(task)
815
+            self.assertFalse(mock_node_power_action.called)
816
+            self.assertNotIn(
817
+                'deployment_reboot', task.node.driver_internal_info)
818
+
798 819
     @mock.patch.object(noop_storage.NoopStorage, 'should_write_image',
799 820
                        autospec=True)
800 821
     @mock.patch.object(flat_network.FlatNetwork,

+ 6
- 0
releasenotes/notes/fixes-execution-of-out-of-band-deploy-steps-1f5967e7bfcabbf9.yaml View File

@@ -0,0 +1,6 @@
1
+---
2
+fixes:
3
+  - |
4
+    Fixes an issue wherein asynchronous out-of-band deploy steps in
5
+    deployment template fails to execute. See `story 2006342
6
+    <https://storyboard.openstack.org/#!/story/2006342>`__ for details.

Loading…
Cancel
Save