235 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			235 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #    Copyright 2016 Mirantis, Inc.
 | |
| #
 | |
| #    Licensed under the Apache License, Version 2.0 (the "License"); you may
 | |
| #    not use this file except in compliance with the License. You may obtain
 | |
| #    a copy of the License at
 | |
| #
 | |
| #         http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| #    Unless required by applicable law or agreed to in writing, software
 | |
| #    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 | |
| #    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 | |
| #    License for the specific language governing permissions and limitations
 | |
| #    under the License.
 | |
| 
 | |
| from fuelweb_test.helpers.decorators import log_snapshot_after_test
 | |
| from fuelweb_test import logger
 | |
| from proboscis import test
 | |
| 
 | |
| from stacklight_tests.toolchain import api
 | |
| 
 | |
| OKAY_STATUS = 0
 | |
| WARNING_STATUS = 1
 | |
| UNKNOWN_STATUS = 2
 | |
| CRITICAL_STATUS = 3
 | |
| DOWN_STATUS = 4
 | |
| 
 | |
| WARNING_PERCENT = 91
 | |
| CRITICAL_PERCENT = 96
 | |
| 
 | |
| RABBITMQ_DISK_WARNING_PERCENT = 99.99
 | |
| RABBITMQ_DISK_CRITICAL_PERCENT = 100
 | |
| RABBITMQ_MEMORY_WARNING_VALUE = 1.01
 | |
| RABBITMQ_MEMORY_CRITICAL_VALUE = 1.0001
 | |
| 
 | |
| 
 | |
| @test(groups=["plugins"])
 | |
| class TestToolchainAlarms(api.ToolchainApi):
 | |
|     """Class for testing built-in StackLight Collector alarms.
 | |
|     """
 | |
| 
 | |
|     def _check_filesystem_alarms(self, nailgun_node, filesystem, source,
 | |
|                                  filename, node_role, alarm_type="node"):
 | |
|         self.check_alarms(alarm_type, node_role, source,
 | |
|                           nailgun_node["hostname"], OKAY_STATUS)
 | |
|         with self.fuel_web.get_ssh_for_nailgun_node(nailgun_node) as remote:
 | |
|             self.remote_ops.fill_up_filesystem(
 | |
|                 remote, filesystem, WARNING_PERCENT, filename)
 | |
|             logger.info("Checking {}-warning alarm".format(source))
 | |
|             self.check_alarms(alarm_type, node_role, source,
 | |
|                               nailgun_node["hostname"], WARNING_STATUS)
 | |
|             self.remote_ops.clean_filesystem(remote, filename)
 | |
|             self.check_alarms(alarm_type, node_role,
 | |
|                               source, nailgun_node["hostname"], OKAY_STATUS)
 | |
|             self.remote_ops.fill_up_filesystem(
 | |
|                 remote, filesystem, CRITICAL_PERCENT, filename)
 | |
|             logger.info("Checking {}-critical alarm".format(source))
 | |
|             self.check_alarms(alarm_type, node_role, source,
 | |
|                               nailgun_node["hostname"], CRITICAL_STATUS)
 | |
|             self.remote_ops.clean_filesystem(remote, filename)
 | |
|             self.check_alarms(alarm_type, node_role, source,
 | |
|                               nailgun_node["hostname"], OKAY_STATUS)
 | |
| 
 | |
|     def _check_rabbit_mq_disk_alarms(self, controller, status, percent):
 | |
|         cmd = ("rabbitmqctl set_disk_free_limit $(df | grep /dev/dm-4 | "
 | |
|                "awk '{{ printf(\"%.0f\\n\", 1024 * ((($3 + $4) * "
 | |
|                "{percent} / 100) - $3))}}')")
 | |
|         self.check_alarms("service", "rabbitmq", "disk",
 | |
|                           controller["hostname"], OKAY_STATUS)
 | |
|         with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
 | |
|             default_value = remote.check_call(
 | |
|                 "rabbitmqctl environment | grep disk_free_limit | "
 | |
|                 "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
 | |
|             remote.check_call(cmd.format(percent=percent))
 | |
|             self.check_alarms("service", "rabbitmq", "disk",
 | |
|                               controller["hostname"], status)
 | |
|             remote.check_call("rabbitmqctl set_disk_free_limit {}".format(
 | |
|                 default_value))
 | |
|             self.check_alarms("service", "rabbitmq", "disk",
 | |
|                               controller["hostname"], OKAY_STATUS)
 | |
| 
 | |
|     def _check_rabbit_mq_memory_alarms(self, controller, status, value):
 | |
|         cmd = "rabbitmqctl set_vm_memory_high_watermark absolute \"{memory}\""
 | |
|         self.check_alarms("service", "rabbitmq", "memory",
 | |
|                           controller["hostname"], OKAY_STATUS)
 | |
|         with self.fuel_web.get_ssh_for_nailgun_node(controller) as remote:
 | |
|             default_value = remote.check_call(
 | |
|                 "rabbitmqctl environment | grep disk_free_limit | "
 | |
|                 "sed -r 's/}.+//' | sed 's|.*,||'")['stdout'][0].rstrip()
 | |
|             mem_usage = self.get_rabbitmq_memory_usage()
 | |
|             remote.check_call(cmd.format(memory=int(mem_usage * value)))
 | |
|             self.check_alarms("service", "rabbitmq", "memory",
 | |
|                               controller["hostname"], status)
 | |
|             self.set_rabbitmq_memory_watermark(controller, default_value)
 | |
|             self.check_alarms("service", "rabbitmq", "memory",
 | |
|                               controller["hostname"], OKAY_STATUS)
 | |
| 
 | |
|     @test(depends_on_groups=["deploy_toolchain"],
 | |
|           groups=["check_mysql_fs_alarms", "toolchain", "alarms"])
 | |
|     @log_snapshot_after_test
 | |
|     def check_mysql_fs_alarms(self):
 | |
|         """Check that mysql-fs-warning and mysql-fs-critical alarms work as
 | |
|         expected.
 | |
| 
 | |
|         Scenario:
 | |
|             1. Fill up /var/lib/mysql filesystem to 91 percent.
 | |
|             2. Check the last value of the warning alarm in InfluxDB.
 | |
|             3. Clean the filesystem.
 | |
|             4. Fill up /var/lib/mysql filesystem to 96 percent.
 | |
|             5. Check the last value of the critical alarm in InfluxDB.
 | |
|             6. Clean the filesystem.
 | |
| 
 | |
|         Duration 10m
 | |
|         """
 | |
|         self.env.revert_snapshot("deploy_toolchain")
 | |
|         controller = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
 | |
|             self.helpers.cluster_id, ["controller"])[0]
 | |
|         self._check_filesystem_alarms(
 | |
|             controller, "/dev/mapper/mysql-root", "mysql-fs",
 | |
|             "/var/lib/mysql/test/bigfile", "mysql-nodes")
 | |
| 
 | |
|     @test(depends_on_groups=["deploy_toolchain"],
 | |
|           groups=["check_rabbitmq_disk_alarm", "toolchain", "alarms"])
 | |
|     @log_snapshot_after_test
 | |
|     def check_rabbitmq_disk_alarm(self):
 | |
|         """Check that rabbitmq-disk-limit-warning and
 | |
|         rabbitmq-disk-limit-critical alarms work as expected.
 | |
| 
 | |
|         Scenario:
 | |
|             1. Check the last value of the okay alarm in InfluxDB.
 | |
|             2. Set RabbitMQ disk limit to 99.99 percent of available space.
 | |
|             3. Check the last value of the warning alarm in InfluxDB.
 | |
|             4. Set RabbitMQ disk limit to the default value.
 | |
|             5. Check the last value of the okay alarm in InfluxDB.
 | |
|             6. Set RabbitMQ disk limit to 100 percent of available space.
 | |
|             7. Check the last value of the critical alarm in InfluxDB.
 | |
|             8. Set RabbitMQ disk limit to the default value.
 | |
|             9. Check the last value of the okay alarm in InfluxDB.
 | |
| 
 | |
|         Duration 10m
 | |
|         """
 | |
|         self.env.revert_snapshot("deploy_toolchain")
 | |
|         controller = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
 | |
|             self.helpers.cluster_id, ["controller"])[0]
 | |
|         self._check_rabbit_mq_disk_alarms(controller, WARNING_STATUS,
 | |
|                                           RABBITMQ_DISK_WARNING_PERCENT)
 | |
|         self._check_rabbit_mq_disk_alarms(controller, CRITICAL_STATUS,
 | |
|                                           RABBITMQ_DISK_CRITICAL_PERCENT)
 | |
| 
 | |
|     @test(depends_on_groups=["deploy_toolchain"],
 | |
|           groups=["check_rabbitmq_memory_alarm", "toolchain",
 | |
|                   "alarms"])
 | |
|     @log_snapshot_after_test
 | |
|     def check_rabbitmq_memory_alarm(self):
 | |
|         """Check that rabbitmq-memory-limit-warning and
 | |
|         rabbitmq-memory-limit-critical alarms work as expected.
 | |
| 
 | |
|         Scenario:
 | |
|             1. Check the last value of the okay alarm in InfluxDB.
 | |
|             2. Set RabbitMQ memory limit to 101 percent of currently
 | |
|             used memory.
 | |
|             3. Check the last value of the warning alarm in InfluxDB.
 | |
|             4. Set RabbitMQ memory limit to the default value.
 | |
|             5. Check the last value of the okay alarm in InfluxDB.
 | |
|             6. Set RabbitMQ memory limit to 100.01 percent of currently
 | |
|             used memory.
 | |
|             7. Check the last value of the critical alarm in InfluxDB.
 | |
|             8. Set RabbitMQ memory limit to the default value.
 | |
|             9. Check the last value of the okay alarm in InfluxDB.
 | |
| 
 | |
|         Duration 10m
 | |
|         """
 | |
|         self.env.revert_snapshot("deploy_toolchain")
 | |
|         controller = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
 | |
|             self.helpers.cluster_id, ["controller"])[0]
 | |
|         self._check_rabbit_mq_memory_alarms(controller, WARNING_STATUS,
 | |
|                                             RABBITMQ_MEMORY_WARNING_VALUE)
 | |
|         self._check_rabbit_mq_memory_alarms(controller, CRITICAL_STATUS,
 | |
|                                             RABBITMQ_MEMORY_CRITICAL_VALUE)
 | |
| 
 | |
|     @test(depends_on_groups=["deploy_ha_toolchain"],
 | |
|           groups=["check_rabbitmq_pacemaker_alarms", "toolchain", "alarms"])
 | |
|     @log_snapshot_after_test
 | |
|     def check_rabbitmq_pacemaker_alarms(self):
 | |
|         """Check that rabbitmq-pacemaker-* alarms work as expected.
 | |
| 
 | |
|         Scenario:
 | |
|             1. Stop one slave RabbitMQ instance.
 | |
|             2. Check that the status of the RabbitMQ cluster is warning.
 | |
|             3. Stop the second slave RabbitMQ instance.
 | |
|             4. Check that the status of the RabbitMQ cluster is critical.
 | |
|             5. Stop the master RabbitMQ instance.
 | |
|             6. Check that the status of the RabbitMQ cluster is down.
 | |
|             7. Clear the RabbitMQ resource.
 | |
|             8. Check that the status of the RabbitMQ cluster is okay.
 | |
| 
 | |
|         Duration 10m
 | |
|         """
 | |
|         def ban_and_check_status(node, status, wait=None):
 | |
|             with self.fuel_web.get_ssh_for_node(node.name) as remote:
 | |
|                 logger.info("Ban rabbitmq resource on {}".format(node.name))
 | |
|                 self.remote_ops.ban_resource(remote,
 | |
|                                              'master_p_rabbitmq-server',
 | |
|                                              wait=wait)
 | |
|             self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
 | |
|                               None, status)
 | |
| 
 | |
|         self.env.revert_snapshot("deploy_ha_toolchain")
 | |
| 
 | |
|         self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
 | |
|                           None, OKAY_STATUS)
 | |
| 
 | |
|         controllers = self.fuel_web.get_nailgun_cluster_nodes_by_roles(
 | |
|             self.helpers.cluster_id, ["controller"])
 | |
| 
 | |
|         controller = controllers[0]
 | |
|         controller_node = self.fuel_web.get_devops_node_by_nailgun_node(
 | |
|             controller)
 | |
|         rabbitmq_master = self.fuel_web.get_rabbit_master_node(
 | |
|             controller_node.name)
 | |
|         rabbitmq_slaves = self.fuel_web.get_rabbit_slaves_node(
 | |
|             controller_node.name)
 | |
|         ban_and_check_status(rabbitmq_slaves[0], WARNING_STATUS, 120)
 | |
|         ban_and_check_status(rabbitmq_slaves[1], CRITICAL_STATUS, 120)
 | |
|         # Don't wait for the pcs operation to complete as it will fail since
 | |
|         # the resource isn't running anywhere
 | |
|         ban_and_check_status(rabbitmq_master, DOWN_STATUS)
 | |
| 
 | |
|         logger.info("Clear rabbitmq resource")
 | |
|         with self.fuel_web.get_ssh_for_node(rabbitmq_master.name) as remote:
 | |
|             self.remote_ops.clear_resource(remote,
 | |
|                                            'master_p_rabbitmq-server',
 | |
|                                            wait=240)
 | |
|         self.check_alarms('service', 'rabbitmq-cluster', 'pacemaker',
 | |
|                           None, OKAY_STATUS)
 | 
