Merge "Add test for RabbitMQ cluster alarms"
@@ -83,6 +83,38 @@ def get_pids_of_process(remote, name):
     return result['stdout'][0].strip().split()
 
 
+def ban_resource(remote, resource, wait=None):
+    """Ban a resource from the current node.
+
+        :param remote: SSH connection to the node.
+        :type remote: SSHClient
+        :param resource: resource name.
+        :type resource: str
+        :param wait: number of seconds to wait for the operation to complete.
+        :type wait: int
+    """
+    cmd = "pcs resource ban {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
+def clear_resource(remote, resource, wait=None):
+    """Clear a resource.
+
+        :param remote: SSH connection to the node.
+        :type remote: SSHClient
+        :param resource: resource name.
+        :type resource: str
+        :param wait: number of seconds to wait for the operation to complete.
+        :type wait: int
+    """
+    cmd = "pcs resource clear {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
 def manage_pacemaker_service(remote, name, operation="restart"):
     """Operate HA service on remote node.
 
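An illustrative note, not part of the change: both helpers only compose a pcs command string and hand it to the remote's check_call. A minimal sketch of how ban_resource behaves, assuming nothing more than an object that exposes check_call(cmd) the way the tests' SSHClient does (FakeRemote is invented for the example):

# Minimal stand-in for the SSH client; check_call() is the only method the
# helper relies on (a real run executes the command on the controller node).
class FakeRemote(object):
    def check_call(self, cmd):
        print("would run: {}".format(cmd))


def ban_resource(remote, resource, wait=None):
    # Same logic as the helper added above.
    cmd = "pcs resource ban {}".format(resource)
    if wait is not None:
        cmd = "{} --wait={}".format(cmd, wait)
    remote.check_call(cmd)


# Ban the RabbitMQ multi-state resource and let pcs wait up to two minutes.
ban_resource(FakeRemote(), "master_p_rabbitmq-server", wait=120)
# would run: pcs resource ban master_p_rabbitmq-server --wait=120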
@@ -329,21 +329,28 @@ class ToolchainApi(object):
         filter_by = "node_role"
         if alarm_type == "service":
             filter_by = "service"
-        query = (
-            "select last(value) from {select_from} where time >= {time}"
-            " and source = '{source}' and {filter} and hostname = '{hostname}'"
-            " and value = {value}".format(
-                select_from="{}_status".format(alarm_type), time=time_interval,
-                source=source, hostname=hostname, value=value,
-                filter="{} = '{}'".format(filter_by, filter_value)))
+        filters = [
+            "time >= {}".format(time_interval),
+            "source = '{}'".format(source),
+            "{} = '{}'".format(filter_by, filter_value),
+            "value = {}".format(value)
+        ]
+        if hostname is not None:
+            filters.append("hostname = '{}'".format(hostname))
+
+        query = "select last(value) from {select_from} where {filters}".format(
+                select_from="{}_status".format(alarm_type),
+                filters=" and ".join(filters))
+        logger.info("InfluxDB query: {}".format(query))
 
         def check_result():
             result = self.INFLUXDB_GRAFANA.do_influxdb_query(
                 query=query).json()["results"][0]
             return len(result)
 
-        msg = ("Alarm with source {} and {} {} and value {} was"
-               " not triggered".format(source, filter_by, filter_value, value))
+        msg = ("Alarm of type: {}, entity: {}, source: {}, hostname: {}, "
+               "value: {} wasn't triggered".format(alarm_type, filter_value,
+                                                   source, hostname, value))
         devops_helpers.wait(check_result, timeout=60 * 5,
                             interval=10, timeout_msg=msg)
 
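A side note, not part of the diff: with hostname=None the new filter list simply omits the hostname clause, which is what lets the cluster-wide RabbitMQ check work. A standalone sketch of the query this builds for the rabbitmq-cluster check (the time_interval value is invented for the example; the real one is passed in by the caller):

# Illustrative inputs; time_interval is a guess at the shape of the real value.
alarm_type, filter_by = "service", "service"
filter_value, source = "rabbitmq-cluster", "pacemaker"
value, hostname = 1, None          # 1 == WARNING_STATUS
time_interval = "now() - 5m"

filters = [
    "time >= {}".format(time_interval),
    "source = '{}'".format(source),
    "{} = '{}'".format(filter_by, filter_value),
    "value = {}".format(value)
]
if hostname is not None:
    filters.append("hostname = '{}'".format(hostname))

query = "select last(value) from {select_from} where {filters}".format(
    select_from="{}_status".format(alarm_type),
    filters=" and ".join(filters))
print(query)
# select last(value) from service_status where time >= now() - 5m and
# source = 'pacemaker' and service = 'rabbitmq-cluster' and value = 1
# (output wrapped here for readability)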
@@ -20,9 +20,13 @@ from stacklight_tests.toolchain import api
 
 OKAY_STATUS = 0
 WARNING_STATUS = 1
+UNKNOWN_STATUS = 2
 CRITICAL_STATUS = 3
+DOWN_STATUS = 4
 
 WARNING_PERCENT = 91
 CRITICAL_PERCENT = 96
 
 RABBITMQ_DISK_WARNING_PERCENT = 99.99
 RABBITMQ_DISK_CRITICAL_PERCENT = 100
 RABBITMQ_MEMORY_WARNING_VALUE = 1.01
@@ -172,3 +176,59 @@ class TestToolchainAlarms(api.ToolchainApi):
                                             RABBITMQ_MEMORY_WARNING_VALUE)
         self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
                                             RABBITMQ_MEMORY_CRITICAL_VALUE)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_pacemaker_alarms(self):
+        """Check that rabbitmq-pacemaker-* alarms work as expected.
+
+        Scenario:
+            1. Stop one slave RabbitMQ instance.
+            2. Check that the status of the RabbitMQ cluster is warning.
+            3. Stop the second slave RabbitMQ instance.
+            4. Check that the status of the RabbitMQ cluster is critical.
+            5. Stop the master RabbitMQ instance.
+            6. Check that the status of the RabbitMQ cluster is down.
+            7. Clear the RabbitMQ resource.
+            8. Check that the status of the RabbitMQ cluster is okay.
+
+        Duration 10m
+        """
+        def ban_and_check_status(node, status, wait=None):
+            with self.fuel_web.get_ssh_for_node(node.name) as remote:
+                logger.info("Ban rabbitmq resource on {}".format(node.name))
+                self.remote_ops.ban_resource(remote,
+                                             'master_p_rabbitmq-server',
+                                             wait=wait)
+            self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                              None, status)
+
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
+
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+
+        controller = controllers[0]
+        controller_node = self.fuel_web.get_devops_node_by_nailgun_node(
+            controller)
+        rabbitmq_master = self.fuel_web.get_rabbit_master_node(
+            controller_node.name)
+        rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(
+            controller_node.name)
+        ban_and_check_status(rabbitmq_slaves[0], WARNING_STATUS, 120)
+        ban_and_check_status(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
+        # Don't wait for the pcs operation to complete: it would fail because
+        # the resource isn't running anywhere anymore
+        ban_and_check_status(rabbitmq_master, DOWN_STATUS)
+
+        logger.info("Clear rabbitmq resource")
+        with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as remote:
+            self.remote_ops.clear_resource(remote,
+                                           'master_p_rabbitmq-server',
+                                           wait=240)
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
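For readers tracing the effect on the cluster, the scenario above boils down to roughly the following sequence of pcs calls and expected rabbitmq-cluster alarm states. This is an illustrative summary only; the node labels are invented, since the real test resolves them at runtime via get_rabbit_master_node and get_rabbit_slaves_node:

# Each entry: (node the command runs on, command sent over SSH, expected status).
steps = [
    ("rabbitmq slave 1", "pcs resource ban master_p_rabbitmq-server --wait=120", "WARNING"),
    ("rabbitmq slave 2", "pcs resource ban master_p_rabbitmq-server --wait=120", "CRITICAL"),
    # no --wait here: the resource no longer runs anywhere, so waiting would fail
    ("rabbitmq master", "pcs resource ban master_p_rabbitmq-server", "DOWN"),
    ("rabbitmq master", "pcs resource clear master_p_rabbitmq-server --wait=240", "OKAY"),
]
for node, cmd, expected in steps:
    print("{:16} {:55} -> {}".format(node, cmd, expected))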