Fix error handling in set_provision_state/set_power_state workflows

Currently these workflows always succeed, because they have no failure condition. This change makes them fail if the resulting state does not match the expected one. It also handles the case where a node enters one of the failure states, so that we don't wait until the timeout expires. A proper error message is returned to avoid confusing operators. Finally, it reduces the traffic between Mistral and Ironic by requesting only the required fields. Partial-Bug: #1755754 Closes-Bug: #1667776 Change-Id: Ice19306d4c4a2080b0337bc02a6ccee4a81411b5
2018-03-14 14:20:15 +01:00 · 2018-03-14 14:20:15 +01:00 · d97cd4a005
commit d97cd4a005
parent 6090d32b51
1 changed file with 62 additions and 5 deletions
--- a/workbooks/baremetal.yaml
+++ b/workbooks/baremetal.yaml
@ -10,6 +10,14 @@ workflows:
      - node_uuid
      - state_action
      - target_state
+      - error_states:
+          # The default includes all failure states, even unused by TripleO.
+          - 'error'
+          - 'adopt failed'
+          - 'clean failed'
+          - 'deploy failed'
+          - 'inspect failed'
+          - 'rescue failed'

    tags:
      - tripleo-common-managed
@ -18,38 +26,83 @@ workflows:

      set_provision_state:
        on-success: wait_for_provision_state
+        on-error: set_provision_state_failed
        action: ironic.node_set_provision_state node_uuid=<% $.node_uuid %> state=<% $.state_action %>

+      set_provision_state_failed:
+        publish:
+          message: <% task(set_provision_state).result %>
+        on-complete: fail
+
      wait_for_provision_state:
-        action: ironic.node_get node_id=<% $.node_uuid %>
+        action: ironic.node_get
+        input:
+          node_id: <% $.node_uuid %>
+          fields: ['provision_state', 'last_error']
        timeout: 1200 #20 minutes
        retry:
          delay: 3
          count: 400
-          continue-on: <% task().result.provision_state != $.target_state %>
+          continue-on: <% not task().result.provision_state in [$.target_state] + $.error_states %>
+        on-complete:
+          - state_not_reached: <% task().result.provision_state != $.target_state %>
+
+      state_not_reached:
+        publish:
+          message: >-
+            Node <% $.node_uuid %> did not reach state "<% $.target_state %>",
+            the state is "<% task(wait_for_provision_state).result.provision_state %>",
+            error: <% task(wait_for_provision_state).result.last_error %>
+        on-complete: fail
+
+    output-on-error:
+      result: <% $.message %>

  set_power_state:
    input:
      - node_uuid
      - state_action
      - target_state
+      - error_state: 'error'

    tags:
      - tripleo-common-managed

    tasks:

-      set_provision_state:
+      set_power_state:
        on-success: wait_for_power_state
+        on-error: set_power_state_failed
        action: ironic.node_set_power_state node_id=<% $.node_uuid %> state=<% $.state_action %>

+      set_power_state_failed:
+        publish:
+          message: <% task(set_power_state).result %>
+        on-complete: fail
+
      wait_for_power_state:
-        action: ironic.node_get node_id=<% $.node_uuid %>
+        action: ironic.node_get
+        input:
+          node_id: <% $.node_uuid %>
+          fields: ['power_state', 'last_error']
        timeout: 120 #2 minutes
        retry:
          delay: 6
          count: 20
-          continue-on: <% task().result.power_state != $.target_state %>
+          continue-on: <% not task().result.power_state in [$.target_state, $.error_state] %>
+        on-complete:
+          - state_not_reached: <% task().result.power_state != $.target_state %>
+
+      state_not_reached:
+        publish:
+          message: >-
+            Node <% $.node_uuid %> did not reach power state "<% $.target_state %>",
+            the state is "<% task(wait_for_power_state).result.power_state %>",
+            error: <% task(wait_for_power_state).result.last_error %>
+        on-complete: fail
+
+    output-on-error:
+      result: <% $.message %>

  manual_cleaning:
    input:
@ -395,6 +448,10 @@ workflows:
          node_uuid: <% $.uuid %>
          state_action: 'manage'
          target_state: 'manageable'
+          error_states:
+            # node going back to enroll designates power credentials failure
+            - 'enroll'
+            - 'error'

      set_status_failed_nodes_manageable:
        on-success: send_message