# heat-templates/hot/autohealing/autohealing_server.yaml

# HOT format version. The template depends on features of this format such as
# the top-level 'conditions' section and the 'if' function used further down.
heat_template_version: 2017-02-24

description: >
  A stack containing a server that is automatically replaced if it is stopped,
  deleted, or goes into an error state, using an Aodh alarm delivered to a
  Zaqar queue that triggers a Mistral workflow. This may either be used
  standalone, or as the scaled unit of a scaling group. When using this from
  inside another template, the 'root_stack_id' parameter should be passed to
  indicate at which stack the stack update should commence after marking the
  server as failed. This should be the root-level stack, to ensure that any
  other resources depending on outputs from this stack are also updated. Note
  that this requires event alarms to be enabled in Aodh, following the
  instructions at
  https://docs.openstack.org/aodh/latest/contributor/event-alarm.html#configuration
  (specifically, by adding the publisher "notifier://?topic=alarm.all" in
  /etc/ceilometer/event_pipeline.yaml).

# Inputs accepted by the stack. Every parameter has a default, so the template
# can be launched standalone without supplying any values.
parameters:
  flavor:
    type: string
    description: Flavor for the instances to be created
    default: cirros256
    constraints:
      - custom_constraint: nova.flavor
        description: Must be a flavor known to Nova
  image:
    type: string
    description: >
      Name or ID of the image to use for the instances.
    default: cirros-0.3.4-x86_64-uec
    constraints:
      - custom_constraint: glance.image
        description: Must identify an image known to Glance
  network:
    type: string
    description: The network for the VM
    default: private
  port:
    type: number
    description: The port to reply to requests on
    default: 8080
    constraints:
      # Reject values that cannot be a TCP port before any server is built.
      - range: {min: 1, max: 65535}
        description: Must be a valid TCP port number (1-65535)
  root_stack_id:
    type: string
    description: >
      ID of the root-level stack to update after the server has been marked
      unhealthy. Leave empty when this template is used standalone, in which
      case this stack itself is updated.
    default: ""

conditions:
  # True when root_stack_id was left at its empty default, i.e. no enclosing
  # stack ID was passed in.
  is_standalone:
    equals:
      - get_param: root_stack_id
      - ""

resources:
  # The server being healed. Its boot script loops forever, answering on the
  # configured port with an HTTP 200 whose body is the machine's hostname.
  server:
    type: OS::Nova::Server
    properties:
      flavor:
        get_param: flavor
      image:
        get_param: image
      networks:
        - network:
            get_param: network
      user_data_format: RAW
      user_data:
        str_replace:
          # %PORT% inside the script is replaced by the 'port' parameter.
          params:
            "%PORT%":
              get_param: port
          template: |
            #! /bin/sh -v
            Body=$(hostname)
            Response="HTTP/1.1 200 OK\r\nContent-Length: ${#Body}\r\n\r\n$Body"
            while true ; do echo -e $Response | nc -llp %PORT%; done

  # Queue shared by the alarm resources (listed in their alarm_queues) and by
  # alarm_subscription (as its queue_name).
  alarm_queue:
    type: OS::Zaqar::Queue

  # Watches compute.instance.update events whose instance_id trait is this
  # stack's server and whose state trait is 'stopped'; notifies alarm_queue.
  stop_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      alarm_queues:
        - get_resource: alarm_queue
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          op: eq
          value:
            get_resource: server
        - field: traits.state
          op: eq
          value: stopped

  # Watches compute.instance.update events whose instance_id trait is this
  # stack's server and whose state trait is 'error'; notifies alarm_queue.
  error_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      alarm_queues:
        - get_resource: alarm_queue
      event_type: compute.instance.update
      query:
        - field: traits.instance_id
          op: eq
          value:
            get_resource: server
        - field: traits.state
          op: eq
          value: error

  # Watches compute.instance.delete.start events whose instance_id trait is
  # this stack's server; notifies alarm_queue.
  deleted_event_alarm:
    type: OS::Aodh::EventAlarm
    properties:
      alarm_queues:
        - get_resource: alarm_queue
      event_type: compute.instance.delete.start
      query:
        - field: traits.instance_id
          op: eq
          value:
            get_resource: server

  # Event alarms in Aodh are not live the instant they are created: by default
  # up to 60s can pass before event_alarm_cache_ttl expires and the tenant's
  # alarm data is loaded. This resource holds stack completion back for that
  # long so the alarms are active once the stack reports done.
  # See https://bugs.launchpad.net/aodh/+bug/1651273
  alarm_cache_wait:
    type: OS::Heat::TestResource
    properties:
      # Referencing every alarm here ties this resource to all three of them.
      value:
        list_join:
          - ''
          - - get_attr: [stop_event_alarm, show]
            - get_attr: [error_event_alarm, show]
            - get_attr: [deleted_event_alarm, show]
      # Seconds to pause on each stack action.
      action_wait_secs:
        create: 60
        update: 60

  # Connects alarm_queue to the autoheal workflow and supplies the workflow's
  # two inputs.
  alarm_subscription:
    type: OS::Zaqar::MistralTrigger
    properties:
      workflow_id:
        get_resource: autoheal
      queue_name:
        get_resource: alarm_queue
      input:
        # This stack, which owns the server resource.
        stack_id:
          get_param: 'OS::stack_id'
        # Stack to update afterwards: this stack when standalone, otherwise
        # the stack ID handed in through the root_stack_id parameter.
        root_stack_id:
          if:
            - is_standalone
            - get_param: 'OS::stack_id'
            - get_param: 'root_stack_id'

  # Two-step workflow: mark the resource named in the alarm notification as
  # unhealthy, then, if that succeeded, update the root stack.
  autoheal:
    type: OS::Mistral::Workflow
    properties:
      type: direct
      description: >
        Mark a server as unhealthy and commence a stack update to replace it.
      # Both inputs are declared without a value here; alarm_subscription
      # provides them.
      input:
        stack_id: null
        root_stack_id: null
      tasks:
        - name: resources_mark_unhealthy
          # The action is assembled from space-separated pieces purely to keep
          # line lengths manageable. The resource is identified by the
          # 'instance_id' trait taken from the event in the notification.
          action:
            list_join:
              - ' '
              - - heat.resources_mark_unhealthy
                - stack_id=<% $.stack_id %>
                - resource_name=<% env().notification.body.reason_data.event.traits.where($[0] = 'instance_id').select($[2]).first() %>
                - mark_unhealthy=true
                - resource_status_reason='Marked by alarm'
          on_success:
            - stacks_update
        - name: stacks_update
          action: heat.stacks_update stack_id=<% $.root_stack_id %> existing=true

outputs:
  # Emitted only when a root_stack_id was supplied (the template is nested).
  # NOTE(review): presumably this lets the parent's reference to this nested
  # stack resolve to the server UUID rather than the stack ID; confirm.
  OS::stack_id:
    description: The server UUID
    value: {get_resource: server}
    condition: {not: is_standalone}
  # Uses the 'networks' attribute rather than the deprecated 'first_address',
  # so the address always comes from the network named by the 'network'
  # parameter, whichever networks the server ends up attached to.
  first_address:
    description: The server IP address
    value: {get_attr: [server, networks, {get_param: network}, 0]}