From 223c950f457947869059fb254dbee7acbbf146d4 Mon Sep 17 00:00:00 2001
From: Timur Nurlygayanov
Date: Sun, 22 Mar 2015 16:27:38 +0300
Subject: [PATCH] Added hardware reboot

We need to control services via shaker, and we also need the ability to
manage power via IPMI / VM power-control commands.
---
 dummy_shaker_agent.py                       |  4 +-
 rally-contexts/cloud_nodes_context.py       |  8 ++-
 rally-contexts/recover_cloud_context.py     | 68 +++++++++++++------
 rally-scenarios/base_disaster_scenario.py   | 30 +++++---
 .../rabbitmq_disaster_scenarios.py          | 47 ++++++-------
 samples/tasks/scenarios/SampleScenario.json | 32 ++++++++-
 6 files changed, 128 insertions(+), 61 deletions(-)

diff --git a/dummy_shaker_agent.py b/dummy_shaker_agent.py
index 5d51651..649c3dd 100644
--- a/dummy_shaker_agent.py
+++ b/dummy_shaker_agent.py
@@ -10,9 +10,9 @@ def run_command():
     r = request.get_json(force=True)
     process = subprocess.Popen(r["command"].split(),
                                stdout=subprocess.PIPE)
     output = process.communicate()[0]
-    
+
     return output
 
 if __name__ == '__main__':
-    app.run(host="0.0.0.0", debug=True)
\ No newline at end of file
+    app.run(host="0.0.0.0", debug=True)
diff --git a/rally-contexts/cloud_nodes_context.py b/rally-contexts/cloud_nodes_context.py
index 893564e..fd76288 100644
--- a/rally-contexts/cloud_nodes_context.py
+++ b/rally-contexts/cloud_nodes_context.py
@@ -6,7 +6,7 @@ from rally import osclients
 
 LOG = logging.getLogger(__name__)
 
-@base.context(name="cloud_nodes", order=1000)
+@base.context(name="cloud_nodes", order=800)
 class CloudNodesContext(base.Context):
     """This context allows us to define the list of nodes in the cloud."""
 
@@ -18,6 +18,10 @@ class CloudNodesContext(base.Context):
             "controllers": {
                 "type": "array",
                 "default": []
+            },
+            "power_control_node": {
+                "type": "object",
+                "default": {}
             }
         }
     }
@@ -25,6 +29,8 @@ class CloudNodesContext(base.Context):
     def setup(self):
         """This method is called before the task starts."""
         self.context["controllers"] = self.config.get("controllers", [])
+        power_control_node = self.config.get("power_control_node", {})
+        self.context["power_control_node"] = power_control_node
 
     def cleanup(self):
         """This method is called after the task finishes."""
diff --git a/rally-contexts/recover_cloud_context.py b/rally-contexts/recover_cloud_context.py
index 5ded4c6..f89384b 100644
--- a/rally-contexts/recover_cloud_context.py
+++ b/rally-contexts/recover_cloud_context.py
@@ -6,7 +6,11 @@ from rally import osclients
 
+import json
+import requests
+import time
+
 LOG = logging.getLogger(__name__)
 
-@base.context(name="recover_cloud", order=999)
+@base.context(name="recover_cloud", order=900)
 class CloudNodesContext(base.Context):
     """This context allows us to recover the cloud after disaster tests."""
 
@@ -14,32 +18,58 @@ class CloudNodesContext(base.Context):
     CONFIG = {
         "type": "object",
         "$schema": consts.JSON_SCHEMA,
         "additionalProperties": False,
-        "properties": {}
-    }
-
-    ACTIONS = {
-        "stop rabbitmq service": {
-            "do": "/etc/init.d/rabbitmq-server stop",
-            "undo": "/etc/init.d/rabbitmq-server start"
-        },
-        "ban rabbitmq service with pcs": {
-            "do": "pcs resource ban rabbitmq",
-            "undo": "pcs resource clear rabbitmq"
+        "properties": {
+            "checks": {
+                "type": "array",
+                "default": []
+            }
         }
     }
 
+    def check_rabbitmq_cluster_status(self, controllers):
+        command = "rabbitmqctl cluster_status"
+
+        for controller in controllers:
+            output = self.run_command(controller["shaker_agent_id"], command)
+            nodes, active_nodes = [], []
+            for line in output.splitlines():
+                if "nodes" in line and "running_nodes" not in line:
+                    nodes = [node for node in line.split("'")
+                             if "rabbit" in node]
+                if "running_nodes" in line:
+                    active_nodes = [node for node in line.split("'")
+                                    if "rabbit" in node]
+            for node in nodes:
+                if node not in active_nodes:
+                    return False
+        return True
+
+    def run_command(self, node, command, recover_command=None,
+                    recover_timeout=0):
+        if recover_command is not None:
+            action = {"node": node, "command": recover_command,
+                      "timeout": recover_timeout}
+            self.context["recover_commands"].append(action)
+
+        r = requests.post("http://{0}/run_command".format(node),
+                          headers={"Content-Type": "application/json"},
+                          data=json.dumps({"command": command}))
+
+        return r.text
+
     def setup(self):
         """This method is called before the task starts."""
-        self.context["actions"] = self.ACTIONS
-
-        # done_actions contains information about name of shaker_id
-        # and action name which were executed, example:
-        # self.context["done_actions"] = [{"name": "node-1", "command": "ls"}]
-        self.context["done_actions"] = []
+        self.context["recover_commands"] = []
+        self.context["checks"] = self.config.get("checks", [])
 
     def cleanup(self):
         """This method is called after the task finishes."""
-        for action in self.context["done_actions"]:
-            ## we need to import shaker somehow :)
-            shaker.run_command_on_node(action["node"],
-                                       ACTIONS[action["command"]]["undo"])
+        for action in self.context["recover_commands"]:
+            self.run_command(action["node"], action["command"])
+            time.sleep(action.get("timeout", 0))
+
+        controllers = self.context["controllers"]
+        if "rabbitmq_cluster_status" in self.context["checks"]:
+            if not self.check_rabbitmq_cluster_status(controllers):
+                raise Exception("RabbitMQ cluster was not recovered")
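
For context on the check above: `rabbitmqctl cluster_status` prints Erlang
terms, and the code recovers node names by splitting each line on single
quotes and keeping tokens that mention "rabbit". A minimal standalone sketch
of that parsing, with sample output assumed from RabbitMQ 3.x (the exact
format varies between versions):

    # Hypothetical `rabbitmqctl cluster_status` output; the real format
    # depends on the RabbitMQ version.
    SAMPLE = ("Cluster status of node 'rabbit@node-1' ...\n"
              "[{nodes,[{disc,['rabbit@node-1','rabbit@node-2',"
              "'rabbit@node-3']}]},\n"
              " {running_nodes,['rabbit@node-1','rabbit@node-3']}]\n")

    def parse_cluster_status(output):
        nodes, active_nodes = [], []
        for line in output.splitlines():
            # the "nodes" line lists cluster members, "running_nodes"
            # lists the members that are actually up
            if "nodes" in line and "running_nodes" not in line:
                nodes = [t for t in line.split("'") if "rabbit" in t]
            elif "running_nodes" in line:
                active_nodes = [t for t in line.split("'") if "rabbit" in t]
        return nodes, active_nodes

    nodes, active_nodes = parse_cluster_status(SAMPLE)
    # node-2 is in the cluster but not running, so the check would fail here
    assert set(nodes) - set(active_nodes) == set(["rabbit@node-2"])
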
diff --git a/rally-scenarios/base_disaster_scenario.py b/rally-scenarios/base_disaster_scenario.py
index f569b49..a1e9620 100644
--- a/rally-scenarios/base_disaster_scenario.py
+++ b/rally-scenarios/base_disaster_scenario.py
@@ -1,5 +1,7 @@
 import json
 import requests
+import time
+
 from rally.benchmark.scenarios import base
 
 
@@ -11,22 +13,28 @@ class BaseDisasterScenario(base.Scenario):
     def boot_vm(self, name):
         vm = self.admin_clients("nova").servers.create(
             name=name,
             image=self.context["shaker_image"],
             flavor=self.context["default_flavor"],
             **{"auto_assign_nic": True})
+        return vm
+
+    def run_command(self, node, command, recover_command=None,
+                    recover_timeout=0):
+        if recover_command is not None:
+            action = {"node": node, "command": recover_command,
+                      "timeout": recover_timeout}
+            self.context["recover_commands"].append(action)
 
-    def execute_command_on_shaker_node(self, node, command):
-        cmd = {"command": command}
         r = requests.post("http://{0}/run_command".format(node),
                           headers={"Content-Type": "application/json"},
-                          data=json.dumps(cmd))
+                          data=json.dumps({"command": command}))
         return r.text
 
-    def run_command(self, node, command):
-        return self.execute_command_on_shaker_node(node, command)
+    def power_off_controller(self, controller_id):
+        control_node = self.context["power_control_node"]
+        controller = self.context["controllers"][controller_id]
 
-    def run_disaster_command(self, node, command):
-        do = self.context["actions"][command]["do"]
-
-        done = {"node": node, "command": command}
-        self.context["done_actions"].append(done)
-
-        self.execute_command_on_shaker_node(node, command)
\ No newline at end of file
+        self.run_command(control_node["shaker_agent_id"],
+                         command=controller["hardware_power_off_cmd"],
+                         recover_command=controller["hardware_power_on_cmd"],
+                         recover_timeout=controller["power_on_timeout"])
+        time.sleep(controller["power_off_timeout"])
+
+    def power_off_main_controller(self):
+        pass
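
The scenario-side run_command above speaks the same tiny HTTP contract as
dummy_shaker_agent.py: POST a JSON body with a "command" key to /run_command
and read the command's stdout back as text. A minimal client sketch, assuming
the dummy agent listens on Flask's default port 5000 and that `uptime` exists
on the target node (both are assumptions, not part of the patch):

    import json

    import requests

    def run_remote(agent_host, command):
        # The ":5000" suffix is an assumption for the dummy agent; in the
        # patch the node id itself is formatted into the URL.
        r = requests.post("http://{0}:5000/run_command".format(agent_host),
                          headers={"Content-Type": "application/json"},
                          data=json.dumps({"command": command}))
        return r.text

    print(run_remote("localhost", "uptime"))
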
diff --git a/rally-scenarios/rabbitmq_disaster_scenarios.py b/rally-scenarios/rabbitmq_disaster_scenarios.py
index c83c1cf..a52772d 100644
--- a/rally-scenarios/rabbitmq_disaster_scenarios.py
+++ b/rally-scenarios/rabbitmq_disaster_scenarios.py
@@ -1,35 +1,40 @@
+import random
+import time
+
 import base_disaster_scenario
 
 from rally.benchmark.scenarios import base
 
 
-class BaseDisasterScenario(base_disaster_scenario.BaseDisasterScenario):
+class RabbitMQDisasterScenarios(base_disaster_scenario.BaseDisasterScenario):
 
     @base.scenario()
-    def test_rabbitmq_failover01(self):
-        """ Test Scenario:
+    def power_off_one_controller(self):
+        """Power off one controller and verify the cloud.
 
-        1. Deploy OpenStack cloud with 3 controllers
-        2. Stop RabbitMQ services on all controllers
-        3. Start RabbitMQ on one controller
-        4. Create VM 10 times, create networks, volumes, upload images,
-           create users and etc.
-        5. Start all RabbitMQ services and repeat step #4
+        Setup:
+            OpenStack cloud with at least 3 controllers.
+
+        Scenario:
+            1. Power off one controller
+            2. Verify the cloud: create a VM 10 times, create networks
+               and volumes, upload images
         """
-        for i in xrange(0, 3):
-            self.run_disaster_command(self.context["controllers"][i],
-                                      "stop rabbitmq service")
+        controller_id = random.randrange(len(self.context["controllers"]))
+        self.power_off_controller(controller_id)
 
-        self.run_command(self.context["controllers"][0],
-                         "/etc/init.d/rabbitmq-server start")
+        vm_list = []
+        for i in range(10):
+            vm_list.append(self.boot_vm("test{0}".format(i)))
 
-        # (tnurlygayanov): TODO:
-        # Need to write the functions which will verify that cloud
-        # works fine: create/delete several VMs, networks, images,
-        # volumes and etc.
-        if i in xrange(0, 10):
-            self.boot_vm("test{0}".format(i))
+        timeout = 300
+        active_vms = []
+        while timeout > 0 and len(active_vms) < 10:
+            for vm in vm_list:
+                vm.get()  # refresh the server record from the API
+            active_vms = [vm for vm in vm_list if vm.status == "ACTIVE"]
+            time.sleep(1)
+            timeout -= 1
 
-        for i in xrange(0, 3):
-            self.run_command(self.context["controllers"][i],
-                             "/etc/init.d/rabbitmq-server start")
\ No newline at end of file
+        if len(active_vms) < 10:
+            raise Exception("Failed to boot 10 active VMs in time")
diff --git a/samples/tasks/scenarios/SampleScenario.json b/samples/tasks/scenarios/SampleScenario.json
index 33b3697..1cf1fc1 100644
--- a/samples/tasks/scenarios/SampleScenario.json
+++ b/samples/tasks/scenarios/SampleScenario.json
@@ -1,13 +1,41 @@
 {
-    "shaker_controller.sample_print": [
+    "RabbitMQDisasterScenarios.power_off_one_controller": [
         {
             "runner": {
                 "type": "serial",
                 "times": 5
             },
             "context": {
+                "recover_cloud": {
+                    "checks": ["rabbitmq_cluster_status"]
+                },
                 "cloud_nodes": {
-                    "controllers": ["1", "2", "3"]
+                    "controllers": [
+                        {
+                            "shaker_agent_id": "1",
+                            "hardware_power_on_cmd": "VBoxManage startvm fuel-slave-1",
+                            "hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-1 poweroff",
+                            "power_off_timeout": 20,
+                            "power_on_timeout": 30
+                        },
+                        {
+                            "shaker_agent_id": "2",
+                            "hardware_power_on_cmd": "VBoxManage startvm fuel-slave-2",
+                            "hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-2 poweroff",
+                            "power_off_timeout": 20,
+                            "power_on_timeout": 30
+                        },
+                        {
+                            "shaker_agent_id": "3",
+                            "hardware_power_on_cmd": "VBoxManage startvm fuel-slave-3",
+                            "hardware_power_off_cmd": "VBoxManage controlvm fuel-slave-3 poweroff",
+                            "power_off_timeout": 20,
+                            "power_on_timeout": 30
+                        }
+                    ],
+                    "power_control_node": {
+                        "shaker_agent_id": "localhost"
+                    }
                 }
             }
         }
     ]
 }
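
The sample task drives VirtualBox VMs through VBoxManage, but nothing in the
contexts is VirtualBox-specific: the power commands are opaque strings handed
to the power-control agent, so a bare-metal lab can plug in ipmitool instead,
as the commit message suggests. A sketch of building such controller entries;
the BMC addresses, credentials, and timeouts are illustrative placeholders:

    # Controller entries for the cloud_nodes context using ipmitool.
    # BMC IPs, credentials, and timeouts are made-up examples.
    IPMI = ("ipmitool -I lanplus -H {bmc} -U admin -P admin "
            "chassis power {action}")

    def ipmi_controller(agent_id, bmc):
        return {
            "shaker_agent_id": agent_id,
            "hardware_power_on_cmd": IPMI.format(bmc=bmc, action="on"),
            "hardware_power_off_cmd": IPMI.format(bmc=bmc, action="off"),
            "power_off_timeout": 20,
            "power_on_timeout": 120,  # bare metal boots slower than a VM
        }

    controllers = [ipmi_controller(str(i), "10.20.0.{0}".format(i))
                   for i in (1, 2, 3)]
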