Retry heat stack polling on gateway problems

If the Heat API is overloaded or temporarily unavailable, we might get a 503 or 504 from HAProxy during the deployment. In that case we should retry polling for events so as not to exit the deployment prematurely.

Change-Id: I947cd0f9bf4a97e46c3d2bf3e9b986f7d38e9357
Closes-Bug: #1833452
2019-06-19 13:45:23 -06:00
parent 4bfd472b38
commit cb42cfe30f
2 changed files with 81 additions and 6 deletions
--- a/tripleoclient/tests/test_utils.py
+++ b/tripleoclient/tests/test_utils.py
@@ -376,6 +376,54 @@ class TestWaitForStackUtil(TestCase):
        complete = utils.wait_for_stack_ready(self.mock_orchestration, 'stack')
        self.assertTrue(complete)
    @mock.patch("time.sleep")
    @mock.patch("heatclient.common.event_utils.poll_for_events")
    @mock.patch("tripleoclient.utils.get_stack")
    def test_wait_for_stack_ready_retry(self, mock_get_stack, mock_poll,
                                        mock_time):
        # The first poll raises a 504 and the second one succeeds, so the
        # call is expected to report the stack as complete.
        mock_get_stack.return_value = mock.Mock(
            stack_name='stack',
            stack_id='id',
            stack_status="CREATE_COMPLETE",
        )
        gateway_timeout = hc_exc.HTTPException(code=504)
        poll_success = ("CREATE_COMPLETE", "ready retry message")
        mock_poll.side_effect = [gateway_timeout, poll_success]

        is_ready = utils.wait_for_stack_ready(self.mock_orchestration,
                                              'stack')
        self.assertTrue(is_ready)
    @mock.patch("time.sleep")
    @mock.patch("heatclient.common.event_utils.poll_for_events")
    @mock.patch("tripleoclient.utils.get_stack")
    def test_wait_for_stack_ready_retry_fail(self, mock_get_stack, mock_poll,
                                             mock_time):
        # Every poll raises a 504, so the call is expected to end with a
        # RuntimeError instead of returning.
        mock_get_stack.return_value = mock.Mock(
            stack_name='stack',
            stack_id='id',
            stack_status="CREATE_COMPLETE",
        )
        mock_poll.side_effect = hc_exc.HTTPException(code=504)

        wait_args = (self.mock_orchestration, 'stack')
        self.assertRaises(RuntimeError,
                          utils.wait_for_stack_ready,
                          *wait_args)
    @mock.patch("time.sleep")
    @mock.patch("heatclient.common.event_utils.poll_for_events")
    @mock.patch("tripleoclient.utils.get_stack")
    def test_wait_for_stack_ready_server_fail(self, mock_get_stack, mock_poll,
                                              mock_time):
        # A 500 from polling is expected to surface to the caller as the
        # original HTTPException.
        mock_get_stack.return_value = mock.Mock(
            stack_name='stack',
            stack_id='id',
            stack_status="CREATE_COMPLETE",
        )
        mock_poll.side_effect = hc_exc.HTTPException(code=500)

        wait_args = (self.mock_orchestration, 'stack')
        self.assertRaises(hc_exc.HTTPException,
                          utils.wait_for_stack_ready,
                          *wait_args)
    def test_wait_for_stack_ready_no_stack(self):
        self.mock_orchestration.stacks.get.return_value = None
--- a/tripleoclient/utils.py
+++ b/tripleoclient/utils.py
@@ -418,7 +418,8 @@ def create_tempest_deployer_input(config_name='tempest-deployer-input.conf'):
 def wait_for_stack_ready(orchestration_client, stack_name, marker=None,
-                         action='CREATE', verbose=False):
+                         action='CREATE', verbose=False, poll_period=5,
                         nested_depth=2, max_retries=10):
    """Check the status of an orchestration stack
    Get the status of an orchestration stack and check whether it is complete
@@ -438,7 +439,17 @@ def wait_for_stack_ready(orchestration_client, stack_name, marker=None,
    :param verbose: Whether to print events
    :type verbose: boolean
    :param nested_depth: Max depth to look for events
    :type nested_depth: int
    :param poll_period: How often to poll for events
    :type poll_period: int
    :param max_retries: Number of retries in the case of server problems
    :type max_retries: int
    """
    log = logging.getLogger(__name__ + ".wait_for_stack_ready")
    stack = get_stack(orchestration_client, stack_name)
    if not stack:
        return False
@@ -448,11 +459,27 @@ def wait_for_stack_ready(orchestration_client, stack_name, marker=None,
        out = sys.stdout
    else:
        # NOTE(review): no close() for this handle is visible in the lines
        # shown here - confirm it is released somewhere.
        out = open(os.devnull, "w")
-    stack_status, msg = event_utils.poll_for_events(
+    retries = 0
-        orchestration_client, stack_name, action=action,
+    while retries <= max_retries:
-        poll_period=5, marker=marker, out=out, nested_depth=2)
+        try:
-    print(msg)
+            stack_status, msg = event_utils.poll_for_events(
-    return stack_status == '%s_COMPLETE' % action
+                orchestration_client, stack_name, action=action,
                # NOTE(review): the literal 5 is passed here rather than the
                # poll_period argument, so that argument has no effect on
                # this call - presumably poll_period=poll_period was meant.
                poll_period=5, marker=marker, out=out,
                nested_depth=nested_depth)
            print(msg)
            return stack_status == '%s_COMPLETE' % action
        except hc_exc.HTTPException as e:
            # Only 503/504 responses are retried; any other HTTP error is
            # logged and re-raised below.
            if e.code in [503, 504]:
                # NOTE(review): when the last permitted attempt fails, this
                # branch still logs a retry numbered max_retries + 1 and
                # sleeps once more before the loop condition ends the loop.
                retries += 1
                log.warning("Server issue while waiting for stack to be ready."
                            " Attempting retry {} of {}".format(retries,
                                                                max_retries))
                # Linear back-off: 5 seconds after the first failure, 10
                # after the second, and so on.
                time.sleep(retries * 5)
                continue
            log.error("Error occured while waiting for stack to be ready.")
            raise e
    # Reached only when the first attempt and all max_retries retries failed
    # with a 503/504.
    raise RuntimeError(
        "wait_for_stack_ready: Max retries {} reached".format(max_retries))
 def wait_for_provision_state(baremetal_client, node_uuid, provision_state,