OpenStack Orchestration (Heat) Templates
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

autohealing_server.yaml 5.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  # Heat (HOT) template: one Nova server that is marked unhealthy and replaced
  # via a stack update when it is stopped, enters an error state, or is deleted.
  # Flow: Aodh event alarm -> Zaqar queue -> Mistral workflow -> Heat.
  heat_template_version: 2017-02-24

  description: >
    A stack containing a server that is automatically replaced if it is stopped,
    deleted, or goes into an error state, using an Aodh alarm delivered to a
    Zaqar queue that triggers a Mistral workflow. This may either be used
    standalone, or as the scaled unit of a scaling group. When using this from
    inside another template, the 'root_stack_id' parameter should be passed to
    indicate at which stack the stack update should commence after marking the
    server as failed. This should be the root-level stack, to ensure that any
    other resources depending on outputs from this stack are also updated. Note
    that this requires event alarms to be enabled in Aodh, following the
    instructions at http://docs.openstack.org/developer/aodh/event-alarm.html -
    specifically, by adding the publisher "notifier://?topic=alarm.all" in
    /etc/ceilometer/event_pipeline.yaml.

  parameters:
    flavor:
      type: string
      description: Flavor for the instances to be created
      default: cirros256
      constraints:
        - custom_constraint: nova.flavor
          description: Must be a flavor known to Nova
    image:
      type: string
      description: >
        Name or ID of the image to use for the instances.
      default: cirros-0.3.4-x86_64-uec
      constraints:
        - custom_constraint: glance.image
          description: Must identify an image known to Glance
    network:
      type: string
      description: The network for the VM
      default: private
    port:
      type: number
      description: The port to reply to requests on
      default: 8080
    # Left empty when the template is used standalone; set by a parent template
    # to the stack at which the healing stack update should start.
    root_stack_id:
      type: string
      default: ""

  conditions:
    # True when no root_stack_id was supplied.
    is_standalone: {equals: [{get_param: root_stack_id}, ""]}

  resources:
    server:
      type: OS::Nova::Server
      properties:
        image: {get_param: image}
        flavor: {get_param: flavor}
        networks:
          - network: {get_param: network}
        user_data_format: RAW
        # Boot script: answer every connection on %PORT% with an HTTP 200 whose
        # body is the server's hostname. %PORT% is replaced with the 'port'
        # parameter by str_replace.
        user_data:
          str_replace:
            template: |
              #! /bin/sh -v
              Body=$(hostname)
              Response="HTTP/1.1 200 OK\r\nContent-Length: ${#Body}\r\n\r\n$Body"
              while true ; do echo -e $Response | nc -llp %PORT%; done
            params:
              "%PORT%": {get_param: port}

    # Queue that receives notifications from all three alarms below.
    alarm_queue:
      type: OS::Zaqar::Queue

    # Fires on a compute.instance.update event for this server with
    # state 'stopped'.
    stop_event_alarm:
      type: OS::Aodh::EventAlarm
      properties:
        event_type: compute.instance.update
        query:
          - field: traits.instance_id
            value: {get_resource: server}
            op: eq
          - field: traits.state
            value: stopped
            op: eq
        alarm_queues:
          - {get_resource: alarm_queue}

    # Fires on a compute.instance.update event for this server with
    # state 'error'.
    error_event_alarm:
      type: OS::Aodh::EventAlarm
      properties:
        event_type: compute.instance.update
        query:
          - field: traits.instance_id
            value: {get_resource: server}
            op: eq
          - field: traits.state
            value: error
            op: eq
        alarm_queues:
          - {get_resource: alarm_queue}

    # Fires on a compute.instance.delete.start event for this server.
    deleted_event_alarm:
      type: OS::Aodh::EventAlarm
      properties:
        event_type: compute.instance.delete.start
        query:
          - field: traits.instance_id
            value: {get_resource: server}
            op: eq
        alarm_queues:
          - {get_resource: alarm_queue}

    # The Aodh event alarm does not take effect immediately; it may take up to
    # 60s (by default) for the event_alarm_cache_ttl to expire and the tenant's
    # alarm data to be loaded. This resource ensures the stack is not completed
    # until the alarm is active. See https://bugs.launchpad.net/aodh/+bug/1651273
    alarm_cache_wait:
      type: OS::Heat::TestResource
      properties:
        action_wait_secs:
          create: 60
          update: 60
        # Reading 'show' from each alarm ties this wait to all three alarms.
        value:
          list_join:
            - ''
            - - {get_attr: [stop_event_alarm, show]}
              - {get_attr: [error_event_alarm, show]}
              - {get_attr: [deleted_event_alarm, show]}

    # Runs the autoheal workflow for messages arriving on the alarm queue.
    alarm_subscription:
      type: OS::Zaqar::MistralTrigger
      properties:
        queue_name: {get_resource: alarm_queue}
        workflow_id: {get_resource: autoheal}
        input:
          stack_id: {get_param: "OS::stack_id"}
          # Standalone: update this stack. Nested: update the stack named by
          # the root_stack_id parameter.
          root_stack_id:
            if:
              - is_standalone
              - {get_param: "OS::stack_id"}
              - {get_param: "root_stack_id"}

    autoheal:
      type: OS::Mistral::Workflow
      properties:
        description: >
          Mark a server as unhealthy and commence a stack update to replace it.
        # Both inputs are declared without a default value.
        input:
          stack_id:
          root_stack_id:
        type: direct
        tasks:
          # Step 1: mark the resource unhealthy in this stack. The resource
          # name is taken from the 'instance_id' trait of the alarm
          # notification.
          - name: resources_mark_unhealthy
            action:
              list_join:
                - ' '
                - - heat.resources_mark_unhealthy
                  - stack_id=<% $.stack_id %>
                  - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
                  - mark_unhealthy=true
                  - resource_status_reason='Marked by alarm'
            on_success:
              - stacks_update
          # Step 2: update the root stack, reusing its existing template and
          # parameters (existing=true).
          - name: stacks_update
            action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true

  outputs:
    # Emitted only when root_stack_id was supplied (i.e. not standalone);
    # publishes the server's ID under the OS::stack_id output name.
    OS::stack_id:
      description: The server UUID
      value: {get_resource: server}
      condition: {not: is_standalone}
    first_address:
      description: The server IP address
      value: {get_attr: [server, first_address]}