OpenStack Orchestration (Heat) Templates
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
heat-templates/hot/autohealing/autohealing_server.yaml

171 lines
5.3 KiB

# HOT format version; the `conditions` section and the `if`/`equals`/`not`
# functions used in this template depend on it.
heat_template_version: 2017-02-24
# Human-readable summary of what the stack does and what it requires from the
# cloud (Aodh event alarms, Zaqar, Mistral).
description: >
  A stack containing a server that is automatically replaced if it is stopped,
  deleted, or goes into an error state, using an Aodh alarm delivered to a
  Zaqar queue that triggers a Mistral workflow. This may either be used
  standalone, or as the scaled unit of a scaling group. When using this from
  inside another template, the 'root_stack_id' parameter should be passed to
  indicate at which stack the stack update should commence after marking the
  server as failed. This should be the root-level stack, to ensure that any
  other resources depending on outputs from this stack are also updated. Note
  that this requires event alarms to be enabled in Aodh, following the
  instructions at
  https://docs.openstack.org/aodh/latest/contributor/event-alarm.html#configuration
  (specifically, by adding the publisher "notifier://?topic=alarm.all" in
  /etc/ceilometer/event_pipeline.yaml).
# Inputs accepted by the template. Every parameter has a default, so the stack
# can be created without supplying any values.
parameters:
  flavor:
    type: string
    description: Flavor for the instances to be created
    default: cirros256
    constraints:
      - custom_constraint: nova.flavor
        description: Must be a flavor known to Nova
  image:
    type: string
    description: >
      Name or ID of the image to use for the instances.
    default: cirros-0.3.4-x86_64-uec
    constraints:
      - custom_constraint: glance.image
        description: Must identify an image known to Glance
  network:
    type: string
    description: The network for the VM
    default: private
  # Substituted for %PORT% in the server's user_data script, where it is the
  # port that nc listens on.
  port:
    type: number
    description: The port to reply to requests on
    default: 8080
    constraints:
      - range: {min: 1, max: 65535}
        description: Must be a valid TCP port number
  # An empty value makes the `is_standalone` condition true, in which case
  # this stack's own ID is used as the stack to update.
  root_stack_id:
    type: string
    description: >
      ID of the root-level stack at which the stack update should commence
      after the server is marked as failed. Leave empty when this template is
      used standalone.
    default: ""
conditions:
  # True when no root_stack_id was passed in, i.e. the parameter still holds
  # its empty-string default.
  is_standalone:
    equals:
      - get_param: root_stack_id
      - ""
resources:
  # The monitored server. Its user_data script loops forever, answering each
  # connection on the configured port with an HTTP 200 response whose body is
  # the machine's hostname.
  server:
    type: OS::Nova::Server
    properties:
      image:
        get_param: image
      flavor:
        get_param: flavor
      networks:
        - network:
            get_param: network
      user_data_format: RAW
      user_data:
        str_replace:
          template: |
            #! /bin/sh -v
            Body=$(hostname)
            Response="HTTP/1.1 200 OK\r\nContent-Length: ${#Body}\r\n\r\n$Body"
            while true ; do echo -e $Response | nc -llp %PORT%; done
          params:
            "%PORT%":
              get_param: port

  # Queue that all three event alarms below deliver their notifications to.
  alarm_queue:
    type: OS::Zaqar::Queue

  # Fires on an instance update event for this server whose state trait is
  # "stopped".
  stop_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          op: eq
          value:
            get_resource: server
        - field: traits.state
          op: eq
          value: stopped
      alarm_queues:
        - get_resource: alarm_queue

  # Fires on an instance update event for this server whose state trait is
  # "error".
  error_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          op: eq
          value:
            get_resource: server
        - field: traits.state
          op: eq
          value: error
      alarm_queues:
        - get_resource: alarm_queue

  # Fires when deletion of this server starts; no state filter is applied.
  deleted_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      event_type: compute.instance.delete.start
      query:
        - field: traits.instance_id
          op: eq
          value:
            get_resource: server
      alarm_queues:
        - get_resource: alarm_queue

  # The Aodh event alarm does not take effect immediately; it may take up to
  # 60s (by default) for the event_alarm_cache_ttl to expire and the tenant's
  # alarm data to be loaded. This resource ensures the stack is not completed
  # until the alarm is active. See https://bugs.launchpad.net/aodh/+bug/1651273
  alarm_cache_wait:
    type: OS::Heat::TestResource
    properties:
      action_wait_secs:
        create: 60
        update: 60
      # Referencing all three alarms makes this resource depend on them.
      value:
        list_join:
          - ''
          - - get_attr: [stop_event_alarm, show]
            - get_attr: [error_event_alarm, show]
            - get_attr: [deleted_event_alarm, show]

  # Subscribes the autoheal workflow to the alarm queue and supplies its two
  # inputs: this stack's ID, and the stack at which the update should start.
  alarm_subscription:
    type: OS::Zaqar::MistralTrigger
    properties:
      queue_name:
        get_resource: alarm_queue
      workflow_id:
        get_resource: autoheal
      input:
        stack_id:
          get_param: "OS::stack_id"
        # Standalone: update this stack. Nested: update the supplied root.
        root_stack_id:
          if:
            - is_standalone
            - get_param: "OS::stack_id"
            - get_param: root_stack_id

  # Two-step workflow: mark the resource unhealthy in this stack, then, on
  # success, start an update of the root stack with existing=true.
  autoheal:
    type: OS::Mistral::Workflow
    properties:
      description: >
        Mark a server as unhealthy and commence a stack update to replace it.
      # Both inputs are declared without defaults.
      input:
        stack_id: null
        root_stack_id: null
      type: direct
      tasks:
        - name: resources_mark_unhealthy
          # The resource name is taken from the instance_id trait carried in
          # the alarm notification body.
          action:
            list_join:
              - ' '
              - - heat.resources_mark_unhealthy
                - stack_id=<% $.stack_id %>
                - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
                - mark_unhealthy=true
                - resource_status_reason='Marked by alarm'
          on_success:
            - stacks_update
        - name: stacks_update
          action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true
outputs:
  # Emitted only when a root_stack_id was supplied (the template is not
  # standalone); its value is the server resource's ID.
  OS::stack_id:
    description: The server UUID
    value: {get_resource: server}
    condition: {not: is_standalone}
  # First address of the server on the network it was attached to. This reads
  # the `networks` attribute rather than the deprecated `first_address`
  # attribute of OS::Nova::Server; the output name is kept for consumers.
  first_address:
    description: The server IP address
    value: {get_attr: [server, networks, {get_param: network}, 0]}