From 223c950f457947869059fb254dbee7acbbf146d4 Mon Sep 17 00:00:00 2001
From: Timur Nurlygayanov
Date: Sun, 22 Mar 2015 16:27:38 +0300
Subject: [PATCH] Added hardware reboot

We need to control services via shaker, and we also need the ability to
manage power via IPMI / VM power-control commands.
---
 dummy_shaker_agent.py                       |  4 +-
 rally-contexts/cloud_nodes_context.py       |  8 ++-
 rally-contexts/recover_cloud_context.py     | 68 +++++++++++++------
 rally-scenarios/base_disaster_scenario.py   | 30 +++++---
 .../rabbitmq_disaster_scenarios.py          | 47 ++++++-------
 samples/tasks/scenarios/SampleScenario.json | 32 ++++++++-
 6 files changed, 128 insertions(+), 61 deletions(-)

diff --git a/dummy_shaker_agent.py b/dummy_shaker_agent.py
index 5d51651..649c3dd 100644
--- a/dummy_shaker_agent.py
+++ b/dummy_shaker_agent.py
@@ -10,9 +10,9 @@ def run_command():
     r = request.get_json(force=True)
     process = subprocess.Popen(r["command"].split(),
                                stdout=subprocess.PIPE)
     output = process.communicate()[0]
-    
+
     return output
 
 if __name__ == '__main__':
-    app.run(host="0.0.0.0", debug=True)
\ No newline at end of file
+    app.run(host="0.0.0.0", debug=True)
diff --git a/rally-contexts/cloud_nodes_context.py b/rally-contexts/cloud_nodes_context.py
index 893564e..fd76288 100644
--- a/rally-contexts/cloud_nodes_context.py
+++ b/rally-contexts/cloud_nodes_context.py
@@ -6,7 +6,7 @@ from rally import osclients
 
 LOG = logging.getLogger(__name__)
 
-@base.context(name="cloud_nodes", order=1000)
+@base.context(name="cloud_nodes", order=800)
 class CloudNodesContext(base.Context):
     """This context allows us to define the list of nodes in the cloud."""
 
@@ -18,6 +18,10 @@ class CloudNodesContext(base.Context):
             "controllers": {
                 "type": "array",
                 "default": []
+            },
+            "power_control_node": {
+                "type": "object",
+                "default": {}
             }
         }
     }
@@ -25,6 +29,8 @@ class CloudNodesContext(base.Context):
     def setup(self):
         """This method is called before the task starts."""
         self.context["controllers"] = self.config.get("controllers", [])
+        power_control_node = self.config.get("power_control_node", {})
+        self.context["power_control_node"] = power_control_node
 
     def cleanup(self):
         """This method is called after the task finishes."""
diff --git a/rally-contexts/recover_cloud_context.py b/rally-contexts/recover_cloud_context.py
index 5ded4c6..f89384b 100644
--- a/rally-contexts/recover_cloud_context.py
+++ b/rally-contexts/recover_cloud_context.py
@@ -6,7 +6,11 @@ from rally import osclients
 
+import json
+import requests
+import time
+
 LOG = logging.getLogger(__name__)
 
-@base.context(name="recover_cloud", order=999)
+@base.context(name="recover_cloud", order=900)
 class CloudNodesContext(base.Context):
     """This context allows us to recover the cloud after disaster tests."""
 
@@ -14,32 +18,58 @@ class CloudNodesContext(base.Context):
     CONFIG = {
         "type": "object",
         "$schema": consts.JSON_SCHEMA,
         "additionalProperties": False,
-        "properties": {}
-    }
-
-    ACTIONS = {
-        "stop rabbitmq service": {
-            "do": "/etc/init.d/rabbitmq-server stop",
-            "undo": "/etc/init.d/rabbitmq-server start"
-        },
-        "ban rabbitmq service with pcs": {
-            "do": "pcs resource ban rabbitmq",
-            "undo": "pcs resource clear rabbitmq"
+        "properties": {
+            "checks": {
+                "type": "array",
+                "default": []
+            }
         }
     }
 
+    def check_rabbitmq_cluster_status(self, controllers):
+        command = "rabbitmqctl cluster_status"
+
+        for controller in controllers:
+            output = self.run_command(controller["shaker_agent_id"], command)
+            nodes, active_nodes = [], []
+            for line in output.splitlines():
+                if "nodes" in line and "running_nodes" not in line:
+                    nodes = [node for node in line.split("'")
+                             if "rabbit" in node]
+                if "running_nodes" in line:
+                    active_nodes = [node for node in line.split("'")
+                                    if "rabbit" in node]
+            for node in nodes:
+                if node not in active_nodes:
+                    return False
+        return True
+
+    def run_command(self, node, command, recover_command=None,
+                    recover_timeout=0):
+        if recover_command is not None:
+            action = {"node": node, "command": recover_command,
+                      "timeout": recover_timeout}
+            self.context["recover_commands"].append(action)
+
+        r = requests.post("http://{0}/run_command".format(node),
+                          headers={"Content-Type": "application/json"},
+                          data=json.dumps({"command": command}))
+
+        return r.text
+
     def setup(self):
         """This method is called before the task starts."""
-        self.context["actions"] = self.ACTIONS
-
-        # done_actions contains information about name of shaker_id
-        # and action name which were executed, example:
-        # self.context["done_actions"] = [{"name": "node-1", "command": "ls"}]
-        self.context["done_actions"] = []
+        self.context["recover_commands"] = []
+        self.context["checks"] = self.config.get("checks", [])
 
     def cleanup(self):
         """This method is called after the task finishes."""
-        for action in self.context["done_actions"]:
-            ## we need to import shaker somehow :)
-            shaker.run_command_on_node(action["node"],
-                                       ACTIONS[action["command"]]["undo"])
+        for action in self.context["recover_commands"]:
+            self.run_command(action["node"], action["command"])
+            time.sleep(action.get("timeout", 0))
+
+        controllers = self.context["controllers"]
+        if "rabbitmq_cluster_status" in self.context["checks"]:
+            if not self.check_rabbitmq_cluster_status(controllers):
+                raise Exception("RabbitMQ cluster was not recovered")
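
For context on the check above: `rabbitmqctl cluster_status` prints Erlang
terms, and the code recovers node names by splitting each line on single
quotes and keeping tokens that mention "rabbit". A minimal standalone sketch
of that parsing, with sample output assumed from RabbitMQ 3.x (the exact
format varies between versions):

    # Hypothetical `rabbitmqctl cluster_status` output; the real format
    # depends on the RabbitMQ version.
    SAMPLE = ("Cluster status of node 'rabbit@node-1' ...\n"
              "[{nodes,[{disc,['rabbit@node-1','rabbit@node-2',"
              "'rabbit@node-3']}]},\n"
              " {running_nodes,['rabbit@node-1','rabbit@node-3']}]\n")

    def parse_cluster_status(output):
        nodes, active_nodes = [], []
        for line in output.splitlines():
            # the "nodes" line lists cluster members, "running_nodes"
            # lists the members that are actually up
            if "nodes" in line and "running_nodes" not in line:
                nodes = [t for t in line.split("'") if "rabbit" in t]
            elif "running_nodes" in line:
                active_nodes = [t for t in line.split("'") if "rabbit" in t]
        return nodes, active_nodes

    nodes, active_nodes = parse_cluster_status(SAMPLE)
    # node-2 is in the cluster but not running, so the check would fail here
    assert set(nodes) - set(active_nodes) == set(["rabbit@node-2"])
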
diff --git a/rally-scenarios/base_disaster_scenario.py b/rally-scenarios/base_disaster_scenario.py
index f569b49..a1e9620 100644
--- a/rally-scenarios/base_disaster_scenario.py
+++ b/rally-scenarios/base_disaster_scenario.py
@@ -1,5 +1,7 @@
 import json
 import requests
+import time
+
 from rally.benchmark.scenarios import base
 
 
@@ -11,22 +13,28 @@ class BaseDisasterScenario(base.Scenario):
     def boot_vm(self, name):
         vm = self.admin_clients("nova").servers.create(
             name=name,
             image=self.context["shaker_image"],
             flavor=self.context["default_flavor"],
             **{"auto_assign_nic": True})
+        return vm
+
+    def run_command(self, node, command, recover_command=None,
+                    recover_timeout=0):
+        if recover_command is not None:
+            action = {"node": node, "command": recover_command,
+                      "timeout": recover_timeout}
+            self.context["recover_commands"].append(action)
 
-    def execute_command_on_shaker_node(self, node, command):
-        cmd = {"command": command}
         r = requests.post("http://{0}/run_command".format(node),
                           headers={"Content-Type": "application/json"},
-                          data=json.dumps(cmd))
+                          data=json.dumps({"command": command}))
         return r.text
 
-    def run_command(self, node, command):
-        return self.execute_command_on_shaker_node(node, command)
+    def power_off_controller(self, controller_id):
+        control_node = self.context["power_control_node"]
+        controller = self.context["controllers"][controller_id]
 
-    def run_disaster_command(self, node, command):
-        do = self.context["actions"][command]["do"]
-
-        done = {"node": node, "command": command}
-        self.context["done_actions"].append(done)
-
-        self.execute_command_on_shaker_node(node, command)
\ No newline at end of file
+        self.run_command(control_node["shaker_agent_id"],
+                         command=controller["hardware_power_off_cmd"],
+                         recover_command=controller["hardware_power_on_cmd"],
+                         recover_timeout=controller["power_on_timeout"])
+        time.sleep(controller["power_off_timeout"])
+
+    def power_off_main_controller(self):
+        pass
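
The scenario-side run_command above speaks the same tiny HTTP contract as
dummy_shaker_agent.py: POST a JSON body with a "command" key to /run_command
and read the command's stdout back as text. A minimal client sketch, assuming
the dummy agent listens on Flask's default port 5000 and that `uptime` exists
on the target node (both are assumptions, not part of the patch):

    import json

    import requests

    def run_remote(agent_host, command):
        # The ":5000" suffix is an assumption for the dummy agent; in the
        # patch the node id itself is formatted into the URL.
        r = requests.post("http://{0}:5000/run_command".format(agent_host),
                          headers={"Content-Type": "application/json"},
                          data=json.dumps({"command": command}))
        return r.text

    print(run_remote("localhost", "uptime"))
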
diff --git a/rally-scenarios/rabbitmq_disaster_scenarios.py b/rally-scenarios/rabbitmq_disaster_scenarios.py
index c83c1cf..a52772d 100644
--- a/rally-scenarios/rabbitmq_disaster_scenarios.py
+++ b/rally-scenarios/rabbitmq_disaster_scenarios.py
@@ -1,35 +1,40 @@
+import random
+import time
+
 import base_disaster_scenario
 
 from rally.benchmark.scenarios import base
 
 
-class BaseDisasterScenario(base_disaster_scenario.BaseDisasterScenario):
+class RabbitMQDisasterScenarios(base_disaster_scenario.BaseDisasterScenario):
 
     @base.scenario()
-    def test_rabbitmq_failover01(self):
-        """ Test Scenario:
+    def power_off_one_controller(self):
+        """Power off one controller and verify the cloud.
 
-        1. Deploy OpenStack cloud with 3 controllers
-        2. Stop RabbitMQ services on all controllers
-        3. Start RabbitMQ on one controller
-        4. Create VM 10 times, create networks, volumes, upload images,
-           create users and etc.
-        5. Start all RabbitMQ services and repeat step #4
+        Setup:
+            OpenStack cloud with at least 3 controllers.
+
+        Scenario:
+            1. Power off one controller
+            2. Verify the cloud: create a VM 10 times, create networks
+               and volumes, upload images
         """
-        for i in xrange(0, 3):
-            self.run_disaster_command(self.context["controllers"][i],
-                                      "stop rabbitmq service")
+        controller_id = random.randrange(len(self.context["controllers"]))
+        self.power_off_controller(controller_id)
 
-        self.run_command(self.context["controllers"][0],
-                         "/etc/init.d/rabbitmq-server start")
+        vm_list = []
+        for i in range(10):
+            vm_list.append(self.boot_vm("test{0}".format(i)))
 
-        # (tnurlygayanov): TODO:
-        # Need to write the functions which will verify that cloud
-        # works fine: create/delete several VMs, networks, images,
-        # volumes and etc.
-        if i in xrange(0, 10):
-            self.boot_vm("test{0}".format(i))
+        timeout = 300
+        active_vms = []
+        while timeout > 0 and len(active_vms) < 10:
+            for vm in vm_list:
+                vm.get()  # refresh the server record from the API
+            active_vms = [vm for vm in vm_list if vm.status == "ACTIVE"]
+            time.sleep(1)
+            timeout -= 1
 
-        for i in xrange(0, 3):
-            self.run_command(self.context["controllers"][i],
-                             "/etc/init.d/rabbitmq-server start")
\ No newline at end of file
+        if len(active_vms) < 10:
+            raise Exception("Failed to boot 10 active VMs in time")
diff --git a/samples/tasks/scenarios/SampleScenario.json b/samples/tasks/scenarios/SampleScenario.json
index 33b3697..1cf1fc1 100644
--- a/samples/tasks/scenarios/SampleScenario.json
+++ b/samples/tasks/scenarios/SampleScenario.json
@@ -1,13 +1,41 @@
 {
-    "shaker_controller.sample_print": [
+    "RabbitMQDisasterScenarios.power_off_one_controller": [
         {
             "runner": {
                 "type": "serial",
                 "times": 5
             },
             "context": {
+                "recover_cloud": {
+                    "checks": ["rabbitmq_cluster_status"]
+                },
                 "cloud_nodes": {
-                    "controllers": ["1", "2", "3"]
+                    "controllers": [
+                        {
+                            "shaker_agent_id": "1",
+                            "hardware_power_on_cmd": "VBoxManage startvm fuel-slave-1",
+                            "hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-1 poweroff",
+                            "power_off_timeout": 20,
+                            "power_on_timeout": 30
+                        },
+                        {
+                            "shaker_agent_id": "2",
+                            "hardware_power_on_cmd": "VBoxManage startvm fuel-slave-2",
+                            "hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-2 poweroff",
+                            "power_off_timeout": 20,
+                            "power_on_timeout": 30
+                        },
+                        {
+                            "shaker_agent_id": "3",
+                            "hardware_power_on_cmd": "VBoxManage startvm fuel-slave-3",
+                            "hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-3 poweroff",
+                            "power_off_timeout": 20,
+                            "power_on_timeout": 30
+                        }
+                    ],
+                    "power_control_node": {
+                        "shaker_agent_id": "localhost"
+                    }
                 }
             }
         }
     ]
 }
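
The sample task drives VirtualBox VMs through VBoxManage, but nothing in the
contexts is VirtualBox-specific: the power commands are opaque strings handed
to the power-control agent, so a bare-metal lab can plug in ipmitool instead,
as the commit message suggests. A sketch of building such controller entries;
the BMC addresses, credentials, and timeouts are illustrative placeholders:

    # Controller entries for the cloud_nodes context using ipmitool.
    # BMC IPs, credentials, and timeouts are made-up examples.
    IPMI = ("ipmitool -I lanplus -H {bmc} -U admin -P admin "
            "chassis power {action}")

    def ipmi_controller(agent_id, bmc):
        return {
            "shaker_agent_id": agent_id,
            "hardware_power_on_cmd": IPMI.format(bmc=bmc, action="on"),
            "hardware_power_off_cmd": IPMI.format(bmc=bmc, action="off"),
            "power_off_timeout": 20,
            "power_on_timeout": 120,  # bare metal boots slower than a VM
        }

    controllers = [ipmi_controller(str(i), "10.20.0.{0}".format(i))
                   for i in (1, 2, 3)]
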