OpenStack Orchestration (Heat) Templates
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

autohealing_server.yaml 5.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. heat_template_version: 2017-02-24
  2. description: >
  3. A stack containing a server that is automatically replaced if it is stopped,
  4. deleted, or goes into an error state, using an Aodh alarm delivered to a
  5. Zaqar queue that triggers a Mistral workflow. This may be either be used
  6. standalone, or as the scaled unit of a scaling group. When using this from
  7. inside another template, the 'root_stack_id' parameter should be passed to
  8. indicate at which stack the stack update should commence after marking the
  9. server as failed. This should be the root-level stack, to ensure that any
  10. other resources depending on outputs from this stack are also updated. Note
  11. that this requires event alarms to be enabled in Aodh, following the
  12. instructions at
  13. https://docs.openstack.org/aodh/latest/contributor/event-alarm.html#configuration
  14. (specifically, by adding the publisher "notifier://?topic=alarm.all" in
  15. /etc/ceilometer/event_pipeline.yaml).
  16. parameters:
  17. flavor:
  18. type: string
  19. description: Flavor for the instances to be created
  20. default: cirros256
  21. constraints:
  22. - custom_constraint: nova.flavor
  23. description: Must be a flavor known to Nova
  24. image:
  25. type: string
  26. description: >
  27. Name or ID of the image to use for the instances.
  28. default: cirros-0.3.4-x86_64-uec
  29. constraints:
  30. - custom_constraint: glance.image
  31. description: Must identify an image known to Glance
  32. network:
  33. type: string
  34. description: The network for the VM
  35. default: private
  36. port:
  37. type: number
  38. description: The port to reply to requests on
  39. default: 8080
  40. root_stack_id:
  41. type: string
  42. default: ""
  43. conditions:
  44. is_standalone: {equals: [{get_param: root_stack_id}, ""]}
  45. resources:
  46. server:
  47. type: OS::Nova::Server
  48. properties:
  49. image: {get_param: image}
  50. flavor: {get_param: flavor}
  51. networks:
  52. - network: {get_param: network}
  53. user_data_format: RAW
  54. user_data:
  55. str_replace:
  56. template: |
  57. #! /bin/sh -v
  58. Body=$(hostname)
  59. Response="HTTP/1.1 200 OK\r\nContent-Length: ${#Body}\r\n\r\n$Body"
  60. while true ; do echo -e $Response | nc -llp %PORT%; done
  61. params:
  62. "%PORT%": {get_param: port}
  63. alarm_queue:
  64. type: OS::Zaqar::Queue
  65. stop_event_alarm:
  66. type: OS::Aodh::EventAlarm
  67. properties:
  68. event_type: compute.instance.update
  69. query:
  70. - field: traits.instance_id
  71. value: {get_resource: server}
  72. op: eq
  73. - field: traits.state
  74. value: stopped
  75. op: eq
  76. alarm_queues:
  77. - {get_resource: alarm_queue}
  78. error_event_alarm:
  79. type: OS::Aodh::EventAlarm
  80. properties:
  81. event_type: compute.instance.update
  82. query:
  83. - field: traits.instance_id
  84. value: {get_resource: server}
  85. op: eq
  86. - field: traits.state
  87. value: error
  88. op: eq
  89. alarm_queues:
  90. - {get_resource: alarm_queue}
  91. deleted_event_alarm:
  92. type: OS::Aodh::EventAlarm
  93. properties:
  94. event_type: compute.instance.delete.start
  95. query:
  96. - field: traits.instance_id
  97. value: {get_resource: server}
  98. op: eq
  99. alarm_queues:
  100. - {get_resource: alarm_queue}
  101. # The Aodh event alarm does not take effect immediately; it may take up to
  102. # 60s (by default) for the event_alarm_cache_ttl to expire and the tenant's
  103. # alarm data to be loaded. This resource ensures the stack is not completed
  104. # until the alarm is active. See https://bugs.launchpad.net/aodh/+bug/1651273
  105. alarm_cache_wait:
  106. type: OS::Heat::TestResource
  107. properties:
  108. action_wait_secs:
  109. create: 60
  110. update: 60
  111. value:
  112. list_join:
  113. - ''
  114. - - {get_attr: [stop_event_alarm, show]}
  115. - {get_attr: [error_event_alarm, show]}
  116. - {get_attr: [deleted_event_alarm, show]}
  117. alarm_subscription:
  118. type: OS::Zaqar::MistralTrigger
  119. properties:
  120. queue_name: {get_resource: alarm_queue}
  121. workflow_id: {get_resource: autoheal}
  122. input:
  123. stack_id: {get_param: "OS::stack_id"}
  124. root_stack_id:
  125. if:
  126. - is_standalone
  127. - {get_param: "OS::stack_id"}
  128. - {get_param: "root_stack_id"}
  129. autoheal:
  130. type: OS::Mistral::Workflow
  131. properties:
  132. description: >
  133. Mark a server as unhealthy and commence a stack update to replace it.
  134. input:
  135. stack_id:
  136. root_stack_id:
  137. type: direct
  138. tasks:
  139. - name: resources_mark_unhealthy
  140. action:
  141. list_join:
  142. - ' '
  143. - - heat.resources_mark_unhealthy
  144. - stack_id=<% $.stack_id %>
  145. - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
  146. - mark_unhealthy=true
  147. - resource_status_reason='Marked by alarm'
  148. on_success:
  149. - stacks_update
  150. - name: stacks_update
  151. action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true
  152. outputs:
  153. OS::stack_id:
  154. description: The server UUID
  155. value: {get_resource: server}
  156. condition: {not: is_standalone}
  157. first_address:
  158. description: The server IP address
  159. value: {get_attr: [server, first_address]}