Merge "Add test for RabbitMQ cluster alarms"
@@ -83,6 +83,38 @@ def get_pids_of_process(remote, name):
     return result['stdout'][0].strip().split()
 
 
+def ban_resource(remote, resource, wait=None):
+    """Ban a resource from the current node.
+
+        :param remote: SSH connection to the node.
+        :type remote: SSHClient
+        :param resource: resource name.
+        :type resource: str
+        :param wait: number of seconds to wait for the operation to complete.
+        :type wait: int
+    """
+    cmd = "pcs resource ban {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
+def clear_resource(remote, resource, wait=None):
+    """Clear a resource.
+
+        :param remote: SSH connection to the node.
+        :type remote: SSHClient
+        :param resource: resource name.
+        :type resource: str
+        :param wait: number of seconds to wait for the operation to complete.
+        :type wait: int
+    """
+    cmd = "pcs resource clear {}".format(resource)
+    if wait is not None:
+        cmd = "{} --wait={}".format(cmd, wait)
+    remote.check_call(cmd)
+
+
 def manage_pacemaker_service(remote, name, operation="restart"):
     """Operate HA service on remote node.
 
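An illustrative note, not part of the change: both helpers only compose a pcs command string and hand it to the remote's check_call. A minimal sketch of how ban_resource behaves, assuming nothing more than an object that exposes check_call(cmd) the way the tests' SSHClient does (FakeRemote is invented for the example):

# Minimal stand-in for the SSH client; check_call() is the only method the
# helper relies on (a real run executes the command on the controller node).
class FakeRemote(object):
    def check_call(self, cmd):
        print("would run: {}".format(cmd))


def ban_resource(remote, resource, wait=None):
    # Same logic as the helper added above.
    cmd = "pcs resource ban {}".format(resource)
    if wait is not None:
        cmd = "{} --wait={}".format(cmd, wait)
    remote.check_call(cmd)


# Ban the RabbitMQ multi-state resource and let pcs wait up to two minutes.
ban_resource(FakeRemote(), "master_p_rabbitmq-server", wait=120)
# would run: pcs resource ban master_p_rabbitmq-server --wait=120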
@@ -329,21 +329,28 @@ class ToolchainApi(object):
         filter_by = "node_role"
         if alarm_type == "service":
             filter_by = "service"
-        query = (
-            "select last(value) from {select_from} where time >= {time}"
-            " and source = '{source}' and {filter} and hostname = '{hostname}'"
-            " and value = {value}".format(
-                select_from="{}_status".format(alarm_type), time=time_interval,
-                source=source, hostname=hostname, value=value,
-                filter="{} = '{}'".format(filter_by, filter_value)))
+        filters = [
+            "time >= {}".format(time_interval),
+            "source = '{}'".format(source),
+            "{} = '{}'".format(filter_by, filter_value),
+            "value = {}".format(value)
+        ]
+        if hostname is not None:
+            filters.append("hostname = '{}'".format(hostname))
+
+        query = "select last(value) from {select_from} where {filters}".format(
+                select_from="{}_status".format(alarm_type),
+                filters=" and ".join(filters))
+        logger.info("InfluxDB query: {}".format(query))
 
         def check_result():
             result = self.INFLUXDB_GRAFANA.do_influxdb_query(
                 query=query).json()["results"][0]
             return len(result)
 
-        msg = ("Alarm with source {} and {} {} and value {} was"
-               " not triggered".format(source, filter_by, filter_value, value))
+        msg = ("Alarm of type: {}, entity: {}, source: {}, hostname: {}, "
+               "value: {} wasn't triggered".format(alarm_type, filter_value,
+                                                   source, hostname, value))
         devops_helpers.wait(check_result, timeout=60 * 5,
                             interval=10, timeout_msg=msg)
 
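A side note, not part of the diff: with hostname=None the new filter list simply omits the hostname clause, which is what lets the cluster-wide RabbitMQ check work. A standalone sketch of the query this builds for the rabbitmq-cluster check (the time_interval value is invented for the example; the real one is passed in by the caller):

# Illustrative inputs; time_interval is a guess at the shape of the real value.
alarm_type, filter_by = "service", "service"
filter_value, source = "rabbitmq-cluster", "pacemaker"
value, hostname = 1, None          # 1 == WARNING_STATUS
time_interval = "now() - 5m"

filters = [
    "time >= {}".format(time_interval),
    "source = '{}'".format(source),
    "{} = '{}'".format(filter_by, filter_value),
    "value = {}".format(value)
]
if hostname is not None:
    filters.append("hostname = '{}'".format(hostname))

query = "select last(value) from {select_from} where {filters}".format(
    select_from="{}_status".format(alarm_type),
    filters=" and ".join(filters))
print(query)
# select last(value) from service_status where time >= now() - 5m and
# source = 'pacemaker' and service = 'rabbitmq-cluster' and value = 1
# (output wrapped here for readability)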
@@ -20,9 +20,13 @@ from stacklight_tests.toolchain import api
 
 OKAY_STATUS = 0
 WARNING_STATUS = 1
+UNKNOWN_STATUS = 2
 CRITICAL_STATUS = 3
+DOWN_STATUS = 4
 
 WARNING_PERCENT = 91
 CRITICAL_PERCENT = 96
 
 RABBITMQ_DISK_WARNING_PERCENT = 99.99
 RABBITMQ_DISK_CRITICAL_PERCENT = 100
 RABBITMQ_MEMORY_WARNING_VALUE = 1.01
@@ -172,3 +176,59 @@ class TestToolchainAlarms(api.ToolchainApi):
                                             RABBITMQ_MEMORY_WARNING_VALUE)
         self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
                                             RABBITMQ_MEMORY_CRITICAL_VALUE)
+
+    @test(depends_on_groups=["deploy_ha_toolchain"],
+          groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
+    @log_snapshot_after_test
+    def check_rabbitmq_pacemaker_alarms(self):
+        """Check that rabbitmq-pacemaker-* alarms work as expected.
+
+        Scenario:
+            1. Stop one slave RabbitMQ instance.
+            2. Check that the status of the RabbitMQ cluster is warning.
+            3. Stop the second slave RabbitMQ instance.
+            4. Check that the status of the RabbitMQ cluster is critical.
+            5. Stop the master RabbitMQ instance.
+            6. Check that the status of the RabbitMQ cluster is down.
+            7. Clear the RabbitMQ resource.
+            8. Check that the status of the RabbitMQ cluster is okay.
+
+        Duration 10m
+        """
+        def ban_and_check_status(node, status, wait=None):
+            with self.fuel_web.get_ssh_for_node(node.name) as remote:
+                logger.info("Ban rabbitmq resource on {}".format(node.name))
+                self.remote_ops.ban_resource(remote,
+                                             'master_p_rabbitmq-server',
+                                             wait=wait)
+            self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                              None, status)
+
+        self.env.revert_snapshot("deploy_ha_toolchain")
+
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
+
+        controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
+            self.helpers.cluster_id, ["controller"])
+
+        controller = controllers[0]
+        controller_node = self.fuel_web.get_devops_node_by_nailgun_node(
+            controller)
+        rabbitmq_master = self.fuel_web.get_rabbit_master_node(
+            controller_node.name)
+        rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(
+            controller_node.name)
+        ban_and_check_status(rabbitmq_slaves[0], WARNING_STATUS, 120)
+        ban_and_check_status(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
+        # Don't wait for the pcs operation to complete: it would fail because
+        # the resource isn't running anywhere anymore
+        ban_and_check_status(rabbitmq_master, DOWN_STATUS)
+
+        logger.info("Clear rabbitmq resource")
+        with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as remote:
+            self.remote_ops.clear_resource(remote,
+                                           'master_p_rabbitmq-server',
+                                           wait=240)
+        self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
+                          None, OKAY_STATUS)
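For readers tracing the effect on the cluster, the scenario above boils down to roughly the following sequence of pcs calls and expected rabbitmq-cluster alarm states. This is an illustrative summary only; the node labels are invented, since the real test resolves them at runtime via get_rabbit_master_node and get_rabbit_slaves_node:

# Each entry: (node the command runs on, command sent over SSH, expected status).
steps = [
    ("rabbitmq slave 1", "pcs resource ban master_p_rabbitmq-server --wait=120", "WARNING"),
    ("rabbitmq slave 2", "pcs resource ban master_p_rabbitmq-server --wait=120", "CRITICAL"),
    # no --wait here: the resource no longer runs anywhere, so waiting would fail
    ("rabbitmq master", "pcs resource ban master_p_rabbitmq-server", "DOWN"),
    ("rabbitmq master", "pcs resource clear master_p_rabbitmq-server --wait=240", "OKAY"),
]
for node, cmd, expected in steps:
    print("{:16} {:55} -> {}".format(node, cmd, expected))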