From f91c7b237196ba25042d5b5e0d0284ba282dfe0d Mon Sep 17 00:00:00 2001 From: Ifat Afek Date: Thu, 28 Jun 2018 12:37:46 +0000 Subject: [PATCH] Support Vitrage resources in Heat The purpose is to automate the auto-healing process that involves external monitoring, Vitrage alarm deduction and Mistral workflow execution. Story: 2002684 Task: 22527 Change-Id: If66248e07a662a225799a2bd3fc88a31d1539021 --- doc/source/index.rst | 10 +- specs/stein/vitrage-resources.rst | 344 ++++++++++++++++++++++++++++++ 2 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 specs/stein/vitrage-resources.rst diff --git a/doc/source/index.rst b/doc/source/index.rst index 8be9bc76..0986abb5 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -66,13 +66,21 @@ Queens specs/queens/* Rocky ------- +----- .. toctree:: :glob: :maxdepth: 1 specs/rocky/* +Stein +----- +.. toctree:: + :glob: + :maxdepth: 1 + + specs/stein/* + Backlog ------- .. toctree:: diff --git a/specs/stein/vitrage-resources.rst b/specs/stein/vitrage-resources.rst new file mode 100644 index 00000000..ef681327 --- /dev/null +++ b/specs/stein/vitrage-resources.rst @@ -0,0 +1,344 @@ +.. + This work is licensed under a Creative Commons Attribution 3.0 Unported + License. + + http://creativecommons.org/licenses/by/3.0/legalcode + +========================= +Support Vitrage Resources +========================= + +https://storyboard.openstack.org/#!/story/2002684 + +This Blueprint proposes to add support for Vitrage resources in Heat. +The purpose is to automate the auto-healing process that involves external +monitoring, Vitrage alarm deduction and Mistral workflow execution. + +Problem description +=================== + +Auto-healing a Heat stack when an instance is down is extremely important. +This use case is already handled when Nova sends a notification about the +instance state, Aodh raises an event alarm and as a result a Mistral healing +workflow is executed. 
+ +However, there are cases where Nova is not aware of the real state of the +instance. One example is a network failure: a NIC that is down can result in no +network connectivity to certain instances, while their state in Nova remains +'Active'. We would like to support auto-healing in such cases as well. + +Proposed change +=============== + +An ``OS::Vitrage::Template`` resource will be added in Heat, under +heat/engine/resources/openstack/vitrage. + +Its role will be to create, based on the properties given in the HOT template, +a Vitrage template with a condition->action scenario that will handle the +healing. For this purpose it will use a Vitrage ``template prototype`` yaml +file that will reside in the same directory. The template prototypes can be +reused, and very few of them will be needed in order to support most +self-healing use cases. + + +The VitrageTemplate resource will support use cases like: + +#. An external monitor detects a network failure +#. Vitrage is notified, and based on its topology-graph it identifies all + affected resources +#. If an instance that belongs to a Heat stack is affected, Vitrage will + execute a Mistral healing workflow + + +VitrageTemplate definition +-------------------------- + +.. code-block:: yaml + + resources: + name: + type: OS::Vitrage::Template + properties: + template_prototype: String + template_params: + description: String + ... + + +Properties: + + - template_prototype - filename of the Vitrage template prototype + - description - Description of the Vitrage template + - template_params - list of key/value parameters that are required for the + given template prototype + + +Example 1 - instance down alarm +------------------------------- + +If there is an 'Instance down' alarm on an instance, execute a Mistral healing +workflow on that instance. + +HOT Template +^^^^^^^^^^^^ + +..
code-block:: yaml + + resources: + execute_healing: + type: OS::Vitrage::Template + properties: + template_prototype: execute_healing_on_instance_down.yaml + template_params: + description: Execute Mistral healing workflow if instance is down + instance_alarm_name: Instance down + instance_id: {get_resource: server} + workflow_name: {get_resource: autoheal} + heat_stack_id: {get_param: "OS::stack_id"} + + +Vitrage Template Prototype +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**execute_healing_on_instance_down.yaml** + +.. code-block:: yaml + + metadata: + version: 3 + name: get_param(name) + description: get_param(description) + type: prototype + parameters: + instance_alarm_name: + description: Name of the alarm on the instance + instance_id: + description: Uuid of the instance to auto-heal + heat_stack_id: + description: Uuid of the Heat stack to auto-heal + workflow_name: + description: Name of the Mistral workflow to execute + definitions: + entities: + - entity: + category: ALARM + name: get_param(instance_alarm_name) + template_id: alarm + - entity: + category: RESOURCE + type: nova.instance + id: get_param(instance_id) + template_id: instance + relationships: + - relationship: + source: alarm + relationship_type: on + target: instance + template_id : alarm_on_instance + scenarios: + - scenario: + condition: alarm_on_instance + actions: + - action: + action_type: execute_mistral + properties: + workflow: get_param(workflow_name) + input: + instance_id: get_param(instance_id) + heat_stack_id: get_param(heat_stack_id) + + +Example 2 - host down alarm +--------------------------- + +If there is a 'Host down' alarm on a host, and the host contains the instance +that is defined in this template, execute a Mistral healing workflow on that +instance. + +This example is similar to the first one, just that it uses a more complex +Vitrage template that considers the host->instance relationship. 
It also +performs other actions, in addition to executing a Mistral healing workflow: + +- modify the states of the host and the instances in Vitrage +- raise an alarm on the instance and mark the host alarm as its root cause +- notify Nova that the host and instance are down + +All the complexity resides in the reusable Vitrage template prototype, while +the Heat usage is simple and quite straightforward. + + +HOT Template +^^^^^^^^^^^^ + +.. code-block:: yaml + + resources: + execute_healing: + type: OS::Vitrage::Template + properties: + template_prototype: execute_healing_on_host_down.yaml + template_params: + description: Execute Mistral healing workflow if a host is down + host_alarm_name: Host down + instance_alarm_name: Instance down + instance_id: {get_resource: server} + workflow_name: {get_resource: autoheal} + heat_stack_id: {get_param: "OS::stack_id"} + + + +Vitrage Template Prototype +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**execute_healing_on_host_down.yaml** + +.. code-block:: yaml + + metadata: + version: 3 + name: get_param(name) + description: get_param(description) + type: prototype + parameters: + host_alarm_name: + description: Name of the alarm on the host + instance_id: + description: Uuid of the instance to auto-heal + heat_stack_id: + description: Uuid of the Heat stack to auto-heal + instance_alarm_name: + description: Name of the alarm to be created on the instance + instance_alarm_severity: + description: Severity of the alarm to be created on the instance + default: critical + host_state: + description: New state to be set for the host + default: ERROR + instance_state: + description: New state to be set for the instance + default: ERROR + workflow_name: + description: Name of the Mistral workflow to execute + definitions: + entities: + - entity: + category: ALARM + name: get_param(host_alarm_name) + template_id: host_alarm + - entity: + category: ALARM + name: get_param(instance_alarm_name) + template_id: instance_alarm + - entity: + category:
RESOURCE + type: nova.host + template_id: host + - entity: + category: RESOURCE + type: nova.instance + id: get_param(instance_id) + template_id: instance + relationships: + - relationship: + source: host_alarm + relationship_type: on + target: host + template_id : alarm_on_host + - relationship: + source: host + relationship_type: contains + target: instance + template_id : host_contains_instance + - relationship: + source: instance_alarm + relationship_type: on + target: instance + template_id : alarm_on_instance + scenarios: + - scenario: + condition: alarm_on_host + actions: + - action: + action_type: set_state + action_target: + target: host + properties: + state: get_param(host_state) + - action: + action_type: mark_down + action_target: + target: host + - scenario: + condition: alarm_on_host and host_contains_instance + actions: + - action: + action_type: raise_alarm + action_target: + target: instance + properties: + alarm_name: get_param(instance_alarm_name) + severity: get_param(instance_alarm_severity) + - scenario: + condition: alarm_on_instance + actions: + - action: + action_type: execute_mistral + properties: + workflow: get_param(workflow_name) + input: + instance_id: get_param(instance_id) + heat_stack_id: get_param(heat_stack_id) + - action: + action_type: set_state + action_target: + target: instance + properties: + state: get_param(instance_state) + - action: + action_type: mark_down + action_target: + target: instance + - scenario: + condition: alarm_on_host and host_contains_instance and alarm_on_instance + actions: + - action: + action_type: add_causal_relationship + action_target: + source: host_alarm + target: instance_alarm + + +Alternatives +------------ + +None + +Implementation +============== + +Assignee(s) +----------- + +Primary assignee: + ifat_afek + +Milestones +---------- + +Target Milestone for completion: + stein-3 + +Work Items +---------- + +- Implement a Vitrage client plugin +- Implement the VitrageTemplate resource +- Add 
unit tests and Tempest tests +- Add a HOT template example to heat-templates + + +Dependencies +============ + +Depends on the implementation of Vitrage template prototypes: +https://review.openstack.org/#/c/627861/