Fix error handling in set_provision_state/set_power_state workflows

Currently these workflows always succeed, because they have no
failure condition. This change makes them fail if the resulting
state does not match the expected one.

It also handles the case when a node goes into one of the failure
states, so that we don't wait until the timeout expires. A proper error
message is returned to avoid confusing operators.

Finally, it reduces the traffic between mistral and ironic by only
requesting the required fields.

Partial-Bug: #1755754
Closes-Bug: #1667776
Change-Id: Ice19306d4c4a2080b0337bc02a6ccee4a81411b5
This commit is contained in:
Dmitry Tantsur 2018-03-14 14:20:15 +01:00
parent 6090d32b51
commit d97cd4a005

View File

@ -10,6 +10,14 @@ workflows:
- node_uuid
- state_action
- target_state
- error_states:
# The default includes all failure states, even unused by TripleO.
- 'error'
- 'adopt failed'
- 'clean failed'
- 'deploy failed'
- 'inspect failed'
- 'rescue failed'
tags:
- tripleo-common-managed
@ -18,38 +26,83 @@ workflows:
# Request the provision action `state_action` for node `node_uuid` through
# the ironic.node_set_provision_state action. On success the workflow moves
# on to polling; if the request itself errors, the error is recorded by
# set_provision_state_failed.
set_provision_state:
on-success: wait_for_provision_state
on-error: set_provision_state_failed
action: ironic.node_set_provision_state node_uuid=<% $.node_uuid %> state=<% $.state_action %>
# Reached through on-error of set_provision_state: publish that task's
# result as `message`, then run the `fail` command.
set_provision_state_failed:
publish:
message: <% task(set_provision_state).result %>
on-complete: fail
# Poll the node with ironic.node_get until its provision_state is either the
# target state or one of the error states, then branch to state_not_reached
# unless the target state was reached.
# NOTE(review): this listing looks like a diff rendered without +/- markers,
# so the two `action` and the two `continue-on` entries below are presumably
# the old and the new variant of the same line -- confirm that the real file
# keeps only one of each.
wait_for_provision_state:
action: ironic.node_get node_id=<% $.node_uuid %>
action: ironic.node_get
input:
node_id: <% $.node_uuid %>
# Request only the fields that the workflow expressions read.
fields: ['provision_state', 'last_error']
timeout: 1200 # 20 minutes
retry:
# delay 3 x count 400 adds up to the 1200 timeout above.
delay: 3
count: 400
continue-on: <% task().result.provision_state != $.target_state %>
# Keep retrying only while the state is neither the target nor an error state.
continue-on: <% not task().result.provision_state in [$.target_state] + $.error_states %>
on-complete:
# Any final state other than the target is routed to state_not_reached.
- state_not_reached: <% task().result.provision_state != $.target_state %>
# Publish an error `message` built from the node UUID, the expected target
# state and the provision_state / last_error fields of the last
# wait_for_provision_state result, then run the `fail` command.
state_not_reached:
publish:
message: >-
Node <% $.node_uuid %> did not reach state "<% $.target_state %>",
the state is "<% task(wait_for_provision_state).result.provision_state %>",
error: <% task(wait_for_provision_state).result.last_error %>
on-complete: fail
# On error, expose the published `message` as `result`.
output-on-error:
result: <% $.message %>
# Workflow: request the power action `state_action` for node `node_uuid`,
# then poll the node until its power_state equals `target_state` or
# `error_state`. Any final state other than `target_state` ends in `fail`
# with a descriptive `message`.
# NOTE(review): this listing looks like a diff rendered without +/- markers,
# so repeated keys below (two task names, two `action`, two `continue-on`)
# are presumably the old and the new variant of the same line -- confirm
# that the real file keeps only one of each.
set_power_state:
input:
- node_uuid
- state_action
- target_state
# Power state that ends polling early as a failure; defaults to 'error'.
- error_state: 'error'
tags:
- tripleo-common-managed
tasks:
# NOTE(review): presumably the old (set_provision_state) and the new
# (set_power_state) name of the same task; set_power_state_failed below
# refers to the task as set_power_state.
set_provision_state:
set_power_state:
on-success: wait_for_power_state
on-error: set_power_state_failed
action: ironic.node_set_power_state node_id=<% $.node_uuid %> state=<% $.state_action %>
# Reached through on-error of set_power_state: publish that task's result
# as `message`, then run the `fail` command.
set_power_state_failed:
publish:
message: <% task(set_power_state).result %>
on-complete: fail
# Poll the node with ironic.node_get until a final power state is seen.
wait_for_power_state:
action: ironic.node_get node_id=<% $.node_uuid %>
action: ironic.node_get
input:
node_id: <% $.node_uuid %>
# Request only the fields that the workflow expressions read.
fields: ['power_state', 'last_error']
timeout: 120 # 2 minutes
retry:
# delay 6 x count 20 adds up to the 120 timeout above.
delay: 6
count: 20
continue-on: <% task().result.power_state != $.target_state %>
# Keep retrying only while the state is neither the target nor the error state.
continue-on: <% not task().result.power_state in [$.target_state, $.error_state] %>
on-complete:
# Any final state other than the target is routed to state_not_reached.
- state_not_reached: <% task().result.power_state != $.target_state %>
# Publish an error `message` built from the node UUID, the expected target
# state and the power_state / last_error fields of the last
# wait_for_power_state result, then run the `fail` command.
state_not_reached:
publish:
message: >-
Node <% $.node_uuid %> did not reach power state "<% $.target_state %>",
the state is "<% task(wait_for_power_state).result.power_state %>",
error: <% task(wait_for_power_state).result.last_error %>
on-complete: fail
# On error, expose the published `message` as `result`.
output-on-error:
result: <% $.message %>
manual_cleaning:
input:
@ -395,6 +448,10 @@ workflows:
node_uuid: <% $.uuid %>
state_action: 'manage'
target_state: 'manageable'
error_states:
# node going back to enroll designates power credentials failure
- 'enroll'
- 'error'
set_status_failed_nodes_manageable:
on-success: send_message