tripleo-common/workbooks/baremetal.yaml
Dmitry Tantsur d97cd4a005 Fix error handling in set_provision_state/set_power_state workflows
Currently these workflows succeed in any case, since we don't have
any condition to fail. This change makes them fail if the resulting
state does not match the expected one.

It also handles the case when a node goes into one of the failure
states, so that we don't wait until timeout. Proper error message
is returned to avoid confusing operators.

Finally, it reduces the traffic between mistral and ironic by only
requesting the required fields.

Partial-Bug: #1755754
Closes-Bug: #1667776
Change-Id: Ice19306d4c4a2080b0337bc02a6ccee4a81411b5
2018-04-04 17:33:38 +02:00

1318 lines
41 KiB
YAML

---
# Workbook header: DSL version, workbook name and a short description.
version: '2.0'
name: tripleo.baremetal.v1
description: TripleO Baremetal Workflows
# Each key below this one defines a single workflow of the workbook.
workflows:
set_node_state:
  # Requests a provision state action for one node through Ironic and then
  # polls the node until it is in the target state or in an error state.
  input:
    - node_uuid
    - state_action
    - target_state
    - error_states:
        # The default includes all failure states, even unused by TripleO.
        - 'error'
        - 'adopt failed'
        - 'clean failed'
        - 'deploy failed'
        - 'inspect failed'
        - 'rescue failed'
  tags:
    - tripleo-common-managed
  # On failure the published message becomes the workflow result.
  output-on-error:
    result: <% $.message %>
  tasks:
    set_provision_state:
      action: ironic.node_set_provision_state
      input:
        node_uuid: <% $.node_uuid %>
        state: <% $.state_action %>
      on-success: wait_for_provision_state
      on-error: set_provision_state_failed

    # The request itself was rejected: keep its result as the message.
    set_provision_state_failed:
      publish:
        message: <% task(set_provision_state).result %>
      on-complete: fail

    wait_for_provision_state:
      action: ironic.node_get
      input:
        node_id: <% $.node_uuid %>
        # Only the fields read below are requested.
        fields:
          - 'provision_state'
          - 'last_error'
      timeout: 1200  # 20 minutes
      retry:
        count: 400
        delay: 3
        # Keep polling while the node is in neither the target state nor
        # one of the error states.
        continue-on: <% not task().result.provision_state in [$.target_state] + $.error_states %>
      on-complete:
        # Any final state other than the target one is a failure.
        - state_not_reached: <% task().result.provision_state != $.target_state %>

    state_not_reached:
      publish:
        message: >-
          Node <% $.node_uuid %> did not reach state "<% $.target_state %>",
          the state is "<% task(wait_for_provision_state).result.provision_state %>",
          error: <% task(wait_for_provision_state).result.last_error %>
      on-complete: fail
set_power_state:
  # Requests a power action for one node through Ironic and then polls the
  # node until its power state is the target state or the error state.
  input:
    - node_uuid
    - state_action
    - target_state
    - error_state: 'error'
  tags:
    - tripleo-common-managed
  # On failure the published message becomes the workflow result.
  output-on-error:
    result: <% $.message %>
  tasks:
    set_power_state:
      action: ironic.node_set_power_state
      input:
        node_id: <% $.node_uuid %>
        state: <% $.state_action %>
      on-success: wait_for_power_state
      on-error: set_power_state_failed

    # The request itself was rejected: keep its result as the message.
    set_power_state_failed:
      publish:
        message: <% task(set_power_state).result %>
      on-complete: fail

    wait_for_power_state:
      action: ironic.node_get
      input:
        node_id: <% $.node_uuid %>
        # Only the fields read below are requested.
        fields:
          - 'power_state'
          - 'last_error'
      timeout: 120  # 2 minutes
      retry:
        count: 20
        delay: 6
        # Keep polling while the node is in neither the target state nor
        # the error state.
        continue-on: <% not task().result.power_state in [$.target_state, $.error_state] %>
      on-complete:
        # Any final state other than the target one is a failure.
        - state_not_reached: <% task().result.power_state != $.target_state %>

    state_not_reached:
      publish:
        message: >-
          Node <% $.node_uuid %> did not reach power state "<% $.target_state %>",
          the state is "<% task(wait_for_power_state).result.power_state %>",
          error: <% task(wait_for_power_state).result.last_error %>
      on-complete: fail
manual_cleaning:
  # Starts manual cleaning of one node with the given clean steps, waits
  # for the node to come back to "manageable" and reports the outcome to
  # the Zaqar queue.
  input:
    - node_uuid
    - clean_steps
    - timeout: 7200  # 2 hours (cleaning can take really long)
    - retry_delay: 10
    - retry_count: 720
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    set_provision_state:
      on-success: wait_for_provision_state
      on-error: set_provision_state_failed
      action: ironic.node_set_provision_state node_uuid=<% $.node_uuid %> state='clean' cleansteps=<% $.clean_steps %>

    set_provision_state_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(set_provision_state).result %>

    wait_for_provision_state:
      action: ironic.node_get node_id=<% $.node_uuid %>
      timeout: <% $.timeout %>
      retry:
        delay: <% $.retry_delay %>
        count: <% $.retry_count %>
        # Stop polling as soon as cleaning either finished or failed, so a
        # failed node is not polled until the timeout expires.
        continue-on: <% not task().result.provision_state in ['manageable', 'clean failed'] %>
      # Route on the final state instead of on task success: the polling
      # action itself succeeds even when the node never became manageable.
      on-complete:
        - send_message: <% task().result.provision_state = 'manageable' %>
        - state_not_reached: <% task().result.provision_state != 'manageable' %>

    state_not_reached:
      on-success: send_message
      publish:
        status: FAILED
        message: >-
          Node <% $.node_uuid %> did not reach state "manageable" after cleaning,
          the state is "<% task(wait_for_provision_state).result.provision_state %>",
          error: <% task(wait_for_provision_state).result.last_error %>

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      retry: count=5 delay=1
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.manual_cleaning
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
validate_nodes:
  description: Validate nodes JSON
  input:
    - nodes_json
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    validate_nodes:
      action: tripleo.baremetal.validate_nodes
      input:
        nodes_json: <% $.nodes_json %>
      on-success: send_message
      on-error: validation_failed

    # Record the validation error so that it ends up in the message.
    validation_failed:
      publish:
        status: FAILED
        message: <% task(validate_nodes).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.validate_nodes
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
register_or_update:
  description: Take nodes JSON and create nodes in a "manageable" state
  input:
    - nodes_json
    - remove: False
    - queue_name: tripleo
    - kernel_name: null
    - ramdisk_name: null
    - instance_boot_option: local
    - initial_state: manageable
  tags:
    - tripleo-common-managed
  tasks:
    # Step 1: validate the nodes JSON with the validate_nodes workflow.
    validate_input:
      workflow: tripleo.baremetal.v1.validate_nodes
      input:
        nodes_json: <% $.nodes_json %>
        queue_name: <% $.queue_name %>
      on-success: register_or_update_nodes
      on-error: validation_failed

    validation_failed:
      publish:
        status: FAILED
        message: <% task(validate_input).result %>
        registered_nodes: []
      on-success: send_message

    # Step 2: register or update the nodes. Nodes reported in the "enroll"
    # provision state are remembered as new_nodes for the later steps.
    register_or_update_nodes:
      action: tripleo.baremetal.register_or_update_nodes
      input:
        nodes_json: <% $.nodes_json %>
        remove: <% $.remove %>
        kernel_name: <% $.kernel_name %>
        ramdisk_name: <% $.ramdisk_name %>
        instance_boot_option: <% $.instance_boot_option %>
      publish:
        registered_nodes: <% task().result %>
        new_nodes: <% task().result.where($.provision_state = 'enroll') %>
      on-success:
        # Nothing more to do when the nodes should stay in "enroll".
        - set_nodes_managed: <% $.initial_state != "enroll" %>
        - send_message: <% $.initial_state = "enroll" %>
      on-error: set_status_failed_register_or_update_nodes

    set_status_failed_register_or_update_nodes:
      publish:
        status: FAILED
        message: <% task(register_or_update_nodes).result %>
        registered_nodes: []
      on-success: send_message

    # Step 3: move the new nodes to "manageable".
    set_nodes_managed:
      workflow: tripleo.baremetal.v1.manage
      input:
        node_uuids: <% $.new_nodes.uuid %>
        queue_name: <% $.queue_name %>
      publish:
        status: SUCCESS
        message: <% $.new_nodes.len() %> node(s) successfully moved to the "manageable" state.
      on-success:
        # Continue to "available" only when that was requested.
        - set_nodes_available: <% $.initial_state = "available" %>
        - send_message: <% $.initial_state != "available" %>
      on-error: set_status_failed_nodes_managed

    set_status_failed_nodes_managed:
      publish:
        status: FAILED
        message: <% task(set_nodes_managed).result %>
      on-success: send_message

    # Step 4 (optional): move the new nodes to "available".
    set_nodes_available:
      workflow: tripleo.baremetal.v1.provide
      input:
        node_uuids: <% $.new_nodes.uuid %>
        queue_name: <% $.queue_name %>
      publish:
        status: SUCCESS
        message: <% $.new_nodes.len() %> node(s) successfully moved to the "available" state.
      on-success: send_message
      on-error: set_status_failed_nodes_available

    set_status_failed_nodes_available:
      publish:
        status: FAILED
        message: <% task(set_nodes_available).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.register_or_update
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
              registered_nodes: <% $.registered_nodes or [] %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
provide:
  description: Take a list of nodes and move them to "available"
  input:
    - node_uuids
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # Run the "provide" action on every node and wait for "available".
    set_nodes_available:
      on-success: cell_v2_discover_hosts
      on-error: set_status_failed_nodes_available
      with-items: uuid in <% $.node_uuids %>
      workflow: tripleo.baremetal.v1.set_node_state
      input:
        # set_node_state declares no queue_name input, so it is not passed.
        node_uuid: <% $.uuid %>
        state_action: 'provide'
        target_state: 'available'

    set_status_failed_nodes_available:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(set_nodes_available).result %>

    # Run the cell_v2 discovery workflow, retrying it as a whole.
    cell_v2_discover_hosts:
      on-success: try_power_off
      on-error: cell_v2_discover_hosts_failed
      workflow: tripleo.baremetal.v1.cellv2_discovery
      input:
        node_uuids: <% $.node_uuids %>
        queue_name: <% $.queue_name %>
      timeout: 900  # 15 minutes
      retry:
        delay: 30
        count: 30

    cell_v2_discover_hosts_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(cell_v2_discover_hosts).result %>

    # Power every node off and wait for the "power off" state.
    try_power_off:
      on-success: send_message
      on-error: power_off_failed
      with-items: uuid in <% $.node_uuids %>
      workflow: tripleo.baremetal.v1.set_power_state
      input:
        # set_power_state declares no queue_name input, so it is not passed.
        node_uuid: <% $.uuid %>
        state_action: 'off'
        target_state: 'power off'
      publish:
        status: SUCCESS
        message: <% $.node_uuids.len() %> node(s) successfully moved to the "available" state.

    power_off_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(try_power_off).result %>

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      retry: count=5 delay=1
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.provide
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
provide_manageable_nodes:
  description: Provide all nodes in a 'manageable' state.
  input:
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # List the nodes and keep the UUIDs of those that are "manageable".
    get_manageable_nodes:
      action: ironic.node_list maintenance=False associated=False
      on-success: provide_manageable
      on-error: set_status_failed_get_manageable_nodes
      publish:
        managed_nodes: <% task().result.where($.provision_state = 'manageable').uuid %>

    set_status_failed_get_manageable_nodes:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(get_manageable_nodes).result %>

    provide_manageable:
      on-success: send_message
      # Without this handler a failed provide run would end the workflow
      # before any message is posted to the queue.
      on-error: set_status_failed_provide_manageable
      workflow: tripleo.baremetal.v1.provide
      input:
        node_uuids: <% $.managed_nodes %>
        queue_name: <% $.queue_name %>
      publish:
        status: SUCCESS

    set_status_failed_provide_manageable:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(provide_manageable).result %>

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      retry: count=5 delay=1
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.provide_manageable_nodes
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
manage:
  description: Set a list of nodes to 'manageable' state
  input:
    - node_uuids
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # Run the "manage" action on every node and wait for "manageable".
    set_nodes_manageable:
      with-items: uuid in <% $.node_uuids %>
      workflow: tripleo.baremetal.v1.set_node_state
      input:
        node_uuid: <% $.uuid %>
        state_action: 'manage'
        target_state: 'manageable'
        error_states:
          # node going back to enroll designates power credentials failure
          - 'enroll'
          - 'error'
      on-success: send_message
      on-error: set_status_failed_nodes_manageable

    set_status_failed_nodes_manageable:
      publish:
        status: FAILED
        message: <% task(set_nodes_manageable).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.manage
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
_introspect:
  description: >
    An internal workflow. The tripleo.baremetal.v1.introspect workflow
    should be used for introspection.
  input:
    - node_uuid
    - timeout
    - queue_name
  tags:
    - tripleo-common-managed
  # The workflow result is whatever the start_introspection task returned.
  output:
    result: <% task(start_introspection).result %>
  tasks:
    start_introspection:
      action: baremetal_introspection.introspect
      input:
        uuid: <% $.node_uuid %>
      on-success: wait_for_introspection_to_finish
      on-error: set_status_failed_start_introspection

    set_status_failed_start_introspection:
      publish:
        status: FAILED
        message: <% task(start_introspection).result %>
        introspected_nodes: []
      on-success: send_message

    wait_for_introspection_to_finish:
      action: baremetal_introspection.wait_for_finish
      input:
        uuids: <% [$.node_uuid] %>
        retry_interval: 10
        # The interval is 10 seconds, so divide to make the overall timeout
        # in seconds correct.
        max_retries: <% $.timeout / 10 %>
      publish:
        introspected_node: <% task().result.values().first() %>
        # FAILED when the node status carries an error, SUCCESS otherwise.
        status: <% bool(task().result.values().first().error) and "FAILED" or "SUCCESS" %>
      publish-on-error:
        status: FAILED
        message: <% task().result %>
      on-success: wait_for_introspection_to_finish_success
      on-error: wait_for_introspection_to_finish_error

    wait_for_introspection_to_finish_success:
      publish:
        message: <% "Introspection of node {0} completed. Status:{1}. Errors:{2}".format($.introspected_node.uuid, $.status, $.introspected_node.error) %>
      on-success: send_message

    wait_for_introspection_to_finish_error:
      publish:
        message: <% "Introspection of node {0} timed out.".format($.node_uuid) %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1._introspect
            payload:
              status: <% $.status %>
              message: <% $.message %>
              introspected_node: <% $.get('introspected_node') %>
              node_uuid: <% $.node_uuid %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
introspect:
  description: >
    Take a list of nodes and move them through introspection.
    By default each node will attempt introspection up to 3 times (two
    retries plus the initial attempt) if it fails. This behaviour can be
    modified by changing the max_retry_attempts input.
    The workflow will assume the node has timed out after 20 minutes (1200
    seconds). This can be changed by passing the node_timeout input in
    seconds.
  input:
    - node_uuids
    - run_validations: False
    - queue_name: tripleo
    - concurrency: 20
    - max_retry_attempts: 2
    - node_timeout: 1200
  tags:
    - tripleo-common-managed
  # Any task error without its own on-error handler ends up here.
  task-defaults:
    on-error: unhandled_error
  tasks:
    # Start the attempt counter and optionally run the validations first.
    initialize:
      publish:
        introspection_attempt: 1
      on-complete:
        - run_validations: <% $.run_validations %>
        - introspect_nodes: <% not $.run_validations %>

    run_validations:
      workflow: tripleo.validations.v1.run_groups
      input:
        group_names:
          - 'pre-introspection'
        queue_name: <% $.queue_name %>
      on-success: introspect_nodes
      on-error: set_validations_failed

    set_validations_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(run_validations).result %>

    introspect_nodes:
      with-items: uuid in <% $.node_uuids %>
      concurrency: <% $.concurrency %>
      workflow: _introspect
      input:
        node_uuid: <% $.uuid %>
        queue_name: <% $.queue_name %>
        timeout: <% $.node_timeout %>
      # on-error is triggered if one or more nodes failed introspection. We
      # still go to get_introspection_status as it will collect the result
      # for each node. Unless we hit the retry limit.
      on-error:
        - get_introspection_status: <% $.introspection_attempt <= $.max_retry_attempts %>
        - max_retry_attempts_reached: <% $.introspection_attempt > $.max_retry_attempts %>
      on-success: get_introspection_status

    get_introspection_status:
      with-items: uuid in <% $.node_uuids %>
      action: baremetal_introspection.get_status
      input:
        uuid: <% $.uuid %>
      publish:
        introspected_nodes: <% task().result.toDict($.uuid, $) %>
        # Currently there is no way for us to ignore user introspection
        # aborts. This means we will retry aborted nodes until the Ironic API
        # gives us more details (error code or a boolean to show aborts etc.)
        # If a node hasn't finished, we consider it to be failed.
        # TODO(d0ugal): When possible, don't retry introspection of nodes
        #               that a user manually aborted.
        failed_introspection: <% task().result.where($.finished = true and $.error != null).select($.uuid) + task().result.where($.finished = false).select($.uuid) %>
      publish-on-error:
        # If a node fails to start introspection, getting the status can fail.
        # When that happens, the result is a string and the nodes need to be
        # filtered out.
        introspected_nodes: <% task().result.where(isDict($)).toDict($.uuid, $) %>
        # If there was an error, the exception string we get doesn't give us
        # the UUID. So we use a set difference to find the UUIDs missing in
        # the results. These are then added to the failed nodes.
        # Every term selects the UUID, as in the success case above, because
        # this value is reused as node_uuids when the nodes are retried.
        failed_introspection: <% ($.node_uuids.toSet() - task().result.where(isDict($)).select($.uuid).toSet()) + task().result.where(isDict($)).where($.finished = true and $.error != null).select($.uuid).toSet() + task().result.where(isDict($)).where($.finished = false).select($.uuid).toSet() %>
      on-error: increase_attempt_counter
      on-success:
        - successful_introspection: <% $.failed_introspection.len() = 0 %>
        - increase_attempt_counter: <% $.failed_introspection.len() > 0 %>

    increase_attempt_counter:
      publish:
        introspection_attempt: <% $.introspection_attempt + 1 %>
      on-complete: retry_failed_nodes

    # Report the retry and run introspection again for the failed nodes only.
    retry_failed_nodes:
      publish:
        status: RUNNING
        message: <% 'Retrying {0} nodes that failed introspection. Attempt {1} of {2} '.format($.failed_introspection.len(), $.introspection_attempt, $.max_retry_attempts + 1) %>
        # We are about to retry, update the tracking stats.
        node_uuids: <% $.failed_introspection %>
      on-success:
        - send_message
        - introspect_nodes

    max_retry_attempts_reached:
      publish:
        status: FAILED
        # failed_introspection is not published yet when the very first
        # attempt already exceeds the limit, hence the default.
        message: <% 'Retry limit reached with {0} nodes still failing introspection'.format($.get('failed_introspection', []).len()) %>
      on-complete: send_message

    successful_introspection:
      publish:
        status: SUCCESS
        message: Successfully introspected <% $.introspected_nodes.len() %> node(s).
      on-complete: send_message

    unhandled_error:
      publish:
        status: FAILED
        message: "Unhandled workflow error"
      on-complete: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      retry: count=5 delay=1
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.introspect
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
              introspected_nodes: <% $.get('introspected_nodes', []) %>
              failed_introspection: <% $.get('failed_introspection', []) %>
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
introspect_manageable_nodes:
  description: Introspect all nodes in a 'manageable' state.
  input:
    - run_validations: False
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # List the nodes and keep the UUIDs of those that are "manageable".
    get_manageable_nodes:
      action: ironic.node_list maintenance=False associated=False
      publish:
        managed_nodes: <% task().result.where($.provision_state = 'manageable').uuid %>
      on-success: validate_nodes
      on-error: set_status_failed_get_manageable_nodes

    set_status_failed_get_manageable_nodes:
      publish:
        status: FAILED
        message: <% task(get_manageable_nodes).result %>
      on-success: send_message

    # Pure branching task: fail early when there is nothing to introspect.
    validate_nodes:
      on-success:
        - introspect_manageable: <% $.managed_nodes.len() > 0 %>
        - set_status_failed_no_nodes: <% $.managed_nodes.len() = 0 %>

    set_status_failed_no_nodes:
      publish:
        status: FAILED
        message: No manageable nodes to introspect. Check node states and maintenance.
      on-success: send_message

    introspect_manageable:
      workflow: tripleo.baremetal.v1.introspect
      input:
        node_uuids: <% $.managed_nodes %>
        run_validations: <% $.run_validations %>
        queue_name: <% $.queue_name %>
      publish:
        introspected_nodes: <% task().result.introspected_nodes %>
      on-success: send_message
      on-error: set_status_introspect_manageable

    set_status_introspect_manageable:
      publish:
        status: FAILED
        message: <% task(introspect_manageable).result %>
        introspected_nodes: []
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.introspect_manageable_nodes
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
              introspected_nodes: <% $.get('introspected_nodes', []) %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
configure:
  description: Take a list of manageable nodes and update their boot configuration.
  input:
    - node_uuids
    - queue_name: tripleo
    - kernel_name: bm-deploy-kernel
    - ramdisk_name: bm-deploy-ramdisk
    - instance_boot_option: null
    - root_device: null
    - root_device_minimum_size: 4
    - overwrite_root_device_hints: False
  tags:
    - tripleo-common-managed
  tasks:
    # First pass over the nodes: boot configuration.
    configure_boot:
      with-items: node_uuid in <% $.node_uuids %>
      action: tripleo.baremetal.configure_boot
      input:
        node_uuid: <% $.node_uuid %>
        kernel_name: <% $.kernel_name %>
        ramdisk_name: <% $.ramdisk_name %>
        instance_boot_option: <% $.instance_boot_option %>
      on-success: configure_root_device
      on-error: set_status_failed_configure_boot

    # Second pass over the nodes: root device configuration.
    configure_root_device:
      with-items: node_uuid in <% $.node_uuids %>
      action: tripleo.baremetal.configure_root_device
      input:
        node_uuid: <% $.node_uuid %>
        root_device: <% $.root_device %>
        minimum_size: <% $.root_device_minimum_size %>
        overwrite: <% $.overwrite_root_device_hints %>
      publish:
        status: SUCCESS
        message: 'Successfully configured the nodes.'
      on-success: send_message
      on-error: set_status_failed_configure_root_device

    set_status_failed_configure_boot:
      publish:
        status: FAILED
        message: <% task(configure_boot).result %>
      on-success: send_message

    set_status_failed_configure_root_device:
      publish:
        status: FAILED
        message: <% task(configure_root_device).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.configure
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
configure_manageable_nodes:
  description: Update the boot configuration of all nodes in 'manageable' state.
  input:
    - queue_name: tripleo
    - kernel_name: 'bm-deploy-kernel'
    - ramdisk_name: 'bm-deploy-ramdisk'
    - instance_boot_option: null
    - root_device: null
    - root_device_minimum_size: 4
    - overwrite_root_device_hints: False
  tags:
    - tripleo-common-managed
  tasks:
    # List the nodes and keep the UUIDs of those that are "manageable".
    get_manageable_nodes:
      action: ironic.node_list maintenance=False associated=False
      publish:
        managed_nodes: <% task().result.where($.provision_state = 'manageable').uuid %>
      on-success: configure_manageable
      on-error: set_status_failed_get_manageable_nodes

    # Hand the nodes and all configuration inputs to the configure workflow.
    configure_manageable:
      workflow: tripleo.baremetal.v1.configure
      input:
        node_uuids: <% $.managed_nodes %>
        queue_name: <% $.queue_name %>
        kernel_name: <% $.kernel_name %>
        ramdisk_name: <% $.ramdisk_name %>
        instance_boot_option: <% $.instance_boot_option %>
        root_device: <% $.root_device %>
        root_device_minimum_size: <% $.root_device_minimum_size %>
        overwrite_root_device_hints: <% $.overwrite_root_device_hints %>
      publish:
        message: 'Manageable nodes configured successfully.'
      on-success: send_message
      on-error: set_status_failed_configure_manageable

    set_status_failed_configure_manageable:
      publish:
        status: FAILED
        message: <% task(configure_manageable).result %>
      on-success: send_message

    set_status_failed_get_manageable_nodes:
      publish:
        status: FAILED
        message: <% task(get_manageable_nodes).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.configure_manageable_nodes
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
tag_node:
  description: Tag a node with a role
  input:
    - node_uuid
    - role: null
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  # A failing task goes straight to send_message, where the status then
  # falls back to FAILED because nothing was published.
  task-defaults:
    on-error: send_message
  tasks:
    # Store the role as the node's 'profile' capability.
    update_node:
      action: tripleo.baremetal.update_node_capability
      input:
        node_uuid: <% $.node_uuid %>
        capability: 'profile'
        value: <% $.role %>
      publish:
        status: SUCCESS
        message: <% task().result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.tag_node
            payload:
              status: <% $.get('status', 'FAILED') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
tag_nodes:
  description: Runs the tag_node workflow in a loop
  input:
    - tag_node_uuids
    - untag_node_uuids
    - role
    - plan: overcloud
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  # A failing task goes straight to send_message, where the status then
  # falls back to FAILED because nothing was published.
  task-defaults:
    on-error: send_message
  tasks:
    # Tag the requested nodes with the role, one node at a time.
    tag_nodes:
      with-items: node_uuid in <% $.tag_node_uuids %>
      concurrency: 1
      workflow: tripleo.baremetal.v1.tag_node
      input:
        node_uuid: <% $.node_uuid %>
        queue_name: <% $.queue_name %>
        role: <% $.role %>
      on-success: untag_nodes

    # No role is passed here, so tag_node uses its default role (null).
    untag_nodes:
      with-items: node_uuid in <% $.untag_node_uuids %>
      concurrency: 1
      workflow: tripleo.baremetal.v1.tag_node
      input:
        node_uuid: <% $.node_uuid %>
        queue_name: <% $.queue_name %>
      on-success: update_role_parameters

    update_role_parameters:
      action: tripleo.parameters.update_role
      input:
        role: <% $.role %>
        container: <% $.plan %>
      publish:
        status: SUCCESS
        message: <% task().result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.tag_nodes
            payload:
              status: <% $.get('status', 'FAILED') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
nodes_with_profile:
  description: Find nodes with a specific profile
  input:
    - profile
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # Collect the candidate nodes: first the active, then the available ones.
    get_active_nodes:
      action: ironic.node_list maintenance=false provision_state='active' detail=true
      on-success: get_available_nodes
      on-error: set_status_failed_get_active_nodes

    get_available_nodes:
      action: ironic.node_list maintenance=false provision_state='available' detail=true
      on-success: get_matching_nodes
      on-error: set_status_failed_get_available_nodes

    # Look up the profile of every candidate and keep the UUIDs of the
    # nodes whose profile equals the requested one.
    get_matching_nodes:
      with-items: node in <% task(get_available_nodes).result + task(get_active_nodes).result %>
      action: tripleo.baremetal.get_profile
      input:
        node: <% $.node %>
      publish:
        matching_nodes: <% let(input_profile_name => $.profile) -> task().result.where($.profile = $input_profile_name).uuid %>
      on-success: send_message
      on-error: set_status_failed_get_matching_nodes

    set_status_failed_get_active_nodes:
      publish:
        status: FAILED
        message: <% task(get_active_nodes).result %>
      on-success: send_message

    set_status_failed_get_available_nodes:
      publish:
        status: FAILED
        message: <% task(get_available_nodes).result %>
      on-success: send_message

    set_status_failed_get_matching_nodes:
      publish:
        status: FAILED
        message: <% task(get_matching_nodes).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.nodes_with_profile
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
              matching_nodes: <% $.matching_nodes or [] %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
create_raid_configuration:
  description: Create and apply RAID configuration for given nodes
  input:
    - node_uuids
    - configuration
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # Store the target RAID configuration on every node.
    set_configuration:
      with-items: node_uuid in <% $.node_uuids %>
      action: ironic.node_set_target_raid_config node_ident=<% $.node_uuid %> target_raid_config=<% $.configuration %>
      on-success: apply_configuration
      on-error: set_configuration_failed

    set_configuration_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(set_configuration).result %>

    # Apply the configuration through manual cleaning: delete the existing
    # RAID configuration, then create the new one.
    apply_configuration:
      with-items: node_uuid in <% $.node_uuids %>
      workflow: tripleo.baremetal.v1.manual_cleaning
      input:
        node_uuid: <% $.node_uuid %>
        # Pass the queue on so that the per-node cleaning messages go to
        # the queue requested by the caller instead of the default one.
        queue_name: <% $.queue_name %>
        clean_steps:
          - interface: raid
            step: delete_configuration
          - interface: raid
            step: create_configuration
        timeout: 1800  # building RAID should be faster than general cleaning
        retry_count: 180
        retry_delay: 10
      on-success: send_message
      on-error: apply_configuration_failed
      publish:
        message: <% task().result %>
        status: SUCCESS

    apply_configuration_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(apply_configuration).result %>

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      retry: count=5 delay=1
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.create_raid_configuration
            payload:
              status: <% $.get('status', 'FAILED') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
cellv2_discovery:
  description: Run cell_v2 host discovery
  input:
    - node_uuids
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    cell_v2_discover_hosts:
      action: tripleo.baremetal.cell_v2_discover_hosts
      on-success: wait_for_nova_resources
      on-error: cell_v2_discover_hosts_failed

    cell_v2_discover_hosts_failed:
      publish:
        status: FAILED
        message: <% task(cell_v2_discover_hosts).result %>
      on-success: send_message

    # Look up a hypervisor named after each node UUID; a failed lookup
    # fails the workflow.
    wait_for_nova_resources:
      with-items: node_uuid in <% $.node_uuids %>
      action: nova.hypervisors_find
      input:
        hypervisor_hostname: <% $.node_uuid %>
      on-success: send_message
      on-error: wait_for_nova_resources_failed

    wait_for_nova_resources_failed:
      publish:
        status: FAILED
        message: <% task(wait_for_nova_resources).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.cellv2_discovery
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
discover_nodes:
  description: Run nodes discovery over the given IP range
  input:
    - ip_addresses
    - credentials
    - ports: [623]
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # Fetch the existing nodes; they are handed to the candidate search.
    get_all_nodes:
      action: ironic.node_list
      input:
        limit: 0
        fields:
          - 'uuid'
          - 'driver'
          - 'driver_info'
      publish:
        existing_nodes: <% task().result %>
      on-success: get_candidate_nodes
      on-error: get_all_nodes_failed

    get_all_nodes_failed:
      publish:
        status: FAILED
        message: <% task(get_all_nodes).result %>
      on-success: send_message

    get_candidate_nodes:
      action: tripleo.baremetal.get_candidate_nodes
      input:
        ip_addresses: <% $.ip_addresses %>
        ports: <% $.ports %>
        credentials: <% $.credentials %>
        existing_nodes: <% $.existing_nodes %>
      publish:
        candidates: <% task().result %>
      on-success: probe_nodes
      on-error: get_candidate_nodes_failed

    get_candidate_nodes_failed:
      publish:
        status: FAILED
        message: <% task(get_candidate_nodes).result %>
      on-success: send_message

    # Probe every candidate; null probe results are dropped.
    probe_nodes:
      with-items:
        - node in <% $.candidates %>
      action: tripleo.baremetal.probe_node
      input:
        ip: <% $.node.ip %>
        port: <% $.node.port %>
        username: <% $.node.username %>
        password: <% $.node.password %>
      publish:
        nodes_json: <% task().result.where($ != null) %>
      on-success: send_message
      on-error: probe_nodes_failed

    probe_nodes_failed:
      publish:
        status: FAILED
        message: <% task(probe_nodes).result %>
      on-success: send_message

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.discover_nodes
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
              nodes_json: <% $.get('nodes_json', []) %>
      retry:
        count: 5
        delay: 1
      on-success:
        - fail: <% $.get('status') = "FAILED" %>
discover_and_enroll_nodes:
  description: Run nodes discovery over the given IP range and enroll nodes
  input:
    - ip_addresses
    - credentials
    - ports: [623]
    - kernel_name: null
    - ramdisk_name: null
    - instance_boot_option: local
    - initial_state: manageable
    - queue_name: tripleo
  tags:
    - tripleo-common-managed
  tasks:
    # Step 1: discover the nodes with the discover_nodes workflow.
    discover_nodes:
      workflow: tripleo.baremetal.v1.discover_nodes
      input:
        ip_addresses: <% $.ip_addresses %>
        ports: <% $.ports %>
        credentials: <% $.credentials %>
        queue_name: <% $.queue_name %>
      on-success: enroll_nodes
      on-error: discover_nodes_failed
      publish:
        nodes_json: <% task().result.nodes_json %>

    discover_nodes_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(discover_nodes).result %>

    # Step 2: register the discovered nodes.
    enroll_nodes:
      workflow: tripleo.baremetal.v1.register_or_update
      input:
        nodes_json: <% $.nodes_json %>
        kernel_name: <% $.kernel_name %>
        ramdisk_name: <% $.ramdisk_name %>
        instance_boot_option: <% $.instance_boot_option %>
        initial_state: <% $.initial_state %>
        # Pass the queue on, as done for discover_nodes, so registration
        # messages go to the queue requested by the caller.
        queue_name: <% $.queue_name %>
      on-success: send_message
      on-error: enroll_nodes_failed
      publish:
        registered_nodes: <% task().result.registered_nodes %>

    enroll_nodes_failed:
      on-success: send_message
      publish:
        status: FAILED
        message: <% task(enroll_nodes).result %>

    # Post the outcome; the workflow is failed when the status is FAILED.
    send_message:
      action: zaqar.queue_post
      retry: count=5 delay=1
      input:
        queue_name: <% $.queue_name %>
        messages:
          body:
            type: tripleo.baremetal.v1.discover_and_enroll_nodes
            payload:
              status: <% $.get('status', 'SUCCESS') %>
              message: <% $.get('message', '') %>
              execution: <% execution() %>
              registered_nodes: <% $.get('registered_nodes', []) %>
      on-success:
        - fail: <% $.get('status') = "FAILED" %>