From 1be42880706dd45d15686bc3eabeb58af546bfb0 Mon Sep 17 00:00:00 2001
From: dagnello
Date: Wed, 11 Nov 2015 12:50:10 -0800
Subject: [PATCH] Adding support for error details on cluster creation

Cue's current implementation does not provide any additional
information to the user related to why a cluster went into ERROR state
during cluster creation. This patch adds an error_detail field to the
clusters table and exposes this field through the API. The
update_cluster_task now extracts failure flow details and forwards this
information when the cluster record is updated.

Partial-Bug: 1508730 (updates also required to Cue client and dashboard)
Closes-bug: 1516735
Change-Id: I440fec30bc3d051189585479b8d2c1219a98cf40
---
 cue/api/controllers/v1/cluster.py              |  3 ++
 ...8e0479e_add_cluster_error_detail_column.py  | 41 ++++++++++++++
 cue/db/sqlalchemy/models.py                    |  1 +
 cue/objects/cluster.py                         |  1 +
 cue/taskflow/flow/create_cluster.py            |  1 +
 cue/taskflow/flow/create_cluster_node.py       |  1 +
 .../task/update_cluster_record_task.py         |  6 +++
 .../db/manage/database/test_migration.py       |  2 +
 cue/tests/functional/db/test_models.py         |  7 +++
 .../taskflow/flow/test_create_cluster.py       | 53 ++++++++++++++++++-
 os_tasklib/common/check_for.py                 | 10 +++-
 11 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py

diff --git a/cue/api/controllers/v1/cluster.py b/cue/api/controllers/v1/cluster.py
index 2cb0c88b..ba235ba1 100644
--- a/cue/api/controllers/v1/cluster.py
+++ b/cue/api/controllers/v1/cluster.py
@@ -118,6 +118,9 @@ class Cluster(base.APIBase):
     authentication = wtypes.wsattr(AuthenticationCredential)
     "Authentication for accessing message brokers"
 
+    error_detail = wsme.wsattr(wtypes.text, mandatory=False)
+    "Error detail(s) associated with cluster"
+
 
 def get_complete_cluster(context, cluster_id):
     """Helper to retrieve the api-compatible full structure of a cluster."""
diff --git a/cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py b/cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py
new file mode 100644
index 00000000..0e33dcd3
--- /dev/null
+++ b/cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py
@@ -0,0 +1,41 @@
+# -*- encoding: utf-8 -*-
+#
+# Copyright 2015 Hewlett-Packard Development Company, L.P.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""add error_detail column in clusters
+
+Revision ID: 17c428e0479e
+Revises: 244aa473e595
+Create Date: 2015-11-11 12:01:10.769280
+
+"""
+
+# revision identifiers, used by Alembic.
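+# 'revision' names this migration; 'down_revision' points at the previous
+# migration ('244aa473e595') so Alembic knows the order in which to apply
+# upgrades and downgrades.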
+revision = '17c428e0479e'
+down_revision = '244aa473e595'
+
+from alembic import op
+from oslo_config import cfg
+import sqlalchemy as sa
+
+
+def upgrade():
+    op.add_column('clusters', sa.Column('error_detail', sa.Text))
+
+
+def downgrade():
+    db_connection = cfg.CONF.database.connection
+    if db_connection != "sqlite://":  # pragma: nocover
+        op.drop_column('clusters', 'error_detail')
\ No newline at end of file
diff --git a/cue/db/sqlalchemy/models.py b/cue/db/sqlalchemy/models.py
index 50ccc79f..0456f8a7 100644
--- a/cue/db/sqlalchemy/models.py
+++ b/cue/db/sqlalchemy/models.py
@@ -73,6 +73,7 @@ class Cluster(base.BASE, base.IdMixin, models.TimestampMixin,
     flavor = sa.Column(sa.String(50), nullable=False)
     size = sa.Column(sa.Integer(), default=1, nullable=False)
     volume_size = sa.Column(sa.Integer(), nullable=True)
+    error_detail = sa.Column(sa.Text(), nullable=True)
 
     sa.Index("clusters_cluster_id_idx", "cluster_id", unique=True)
 
diff --git a/cue/objects/cluster.py b/cue/objects/cluster.py
index 8cb68295..a3f3db8e 100644
--- a/cue/objects/cluster.py
+++ b/cue/objects/cluster.py
@@ -38,6 +38,7 @@ class Cluster(base.CueObject):
        'created_at': obj_utils.datetime_or_str_or_none,
        'updated_at': obj_utils.datetime_or_str_or_none,
        'deleted_at': obj_utils.datetime_or_str_or_none,
+       'error_detail': obj_utils.str_or_none,
     }
 
     @staticmethod
diff --git a/cue/taskflow/flow/create_cluster.py b/cue/taskflow/flow/create_cluster.py
index 2a77638d..b6bbf270 100644
--- a/cue/taskflow/flow/create_cluster.py
+++ b/cue/taskflow/flow/create_cluster.py
@@ -74,6 +74,7 @@ def create_cluster(cluster_id, node_ids, user_network_id,
                            inject={'proto': 'http'}),
         os_common.CheckFor(
             name="check cluster status",
+            details="waiting for RabbitMQ clustered status",
             rebind={'check_var': "clustering_status"},
             check_value='OK',
             retry_delay_seconds=10),
diff --git a/cue/taskflow/flow/create_cluster_node.py b/cue/taskflow/flow/create_cluster_node.py
index 775f0856..adfeee5e 100644
--- a/cue/taskflow/flow/create_cluster_node.py
+++ b/cue/taskflow/flow/create_cluster_node.py
@@ -153,6 +153,7 @@ def create_cluster_node(cluster_id, node_number, node_id, graph_flow,
                            provides="vm_status_%d" % node_number),
         os_common.CheckFor(
             name="check vm status %s" % node_name,
+            details="waiting for ACTIVE VM status",
             rebind={'check_var': "vm_status_%d" % node_number},
             check_value='ACTIVE',
             retry_delay_seconds=10),
diff --git a/cue/taskflow/task/update_cluster_record_task.py b/cue/taskflow/task/update_cluster_record_task.py
index 2d2ed8d2..3327c9a4 100644
--- a/cue/taskflow/task/update_cluster_record_task.py
+++ b/cue/taskflow/task/update_cluster_record_task.py
@@ -68,5 +68,11 @@ class UpdateClusterRecord(task.Task):
         else:
             cluster_values['status'] = models.Status.ERROR
 
+        # Extract exception information
+        if 'flow_failures' in kwargs:
+            cluster_values['error_detail'] = '\n'.join(
+                [str(value) for value in
+                 kwargs['flow_failures'].values()])
+
         cluster = objects.Cluster(**cluster_values)
         cluster.update(request_context, cluster_id)
\ No newline at end of file
diff --git a/cue/tests/functional/db/manage/database/test_migration.py b/cue/tests/functional/db/manage/database/test_migration.py
index d26ea0c3..57516e9e 100644
--- a/cue/tests/functional/db/manage/database/test_migration.py
+++ b/cue/tests/functional/db/manage/database/test_migration.py
@@ -53,6 +53,8 @@ class MigrationFunctionalTests(base.BaseTestCase):
         ('3917e931a55a', ['clusters', 'endpoints', 'nodes', 'broker',
                           'broker_metadata']),
         ('244aa473e595', ['clusters', 'endpoints', 'nodes', 'broker',
+                          'broker_metadata']),
+        ('17c428e0479e', ['clusters', 'endpoints', 'nodes', 'broker',
                           'broker_metadata'])
     ])
 
diff --git a/cue/tests/functional/db/test_models.py b/cue/tests/functional/db/test_models.py
index f1916296..4fc4b3da 100644
--- a/cue/tests/functional/db/test_models.py
+++ b/cue/tests/functional/db/test_models.py
@@ -46,6 +46,7 @@ class ModelsTests(base.FunctionalTestCase):
             "created_at": timeutils.utcnow(),
             "updated_at": timeutils.utcnow(),
             "deleted_at": timeutils.utcnow(),
+            "error_detail": "My cluster's error(s) detail",
         }
 
         cluster = models.Cluster()
@@ -75,6 +76,9 @@ class ModelsTests(base.FunctionalTestCase):
                          "Invalid updated_at value")
         self.assertEqual(cluster_values["deleted_at"], cluster.deleted_at,
                          "Invalid deleted_at value")
+        self.assertEqual(cluster_values["error_detail"],
+                         cluster.error_detail,
+                         "Invalid error_detail value")
 
         db_session = sql_api.get_session()
         cluster.save(db_session)
@@ -107,6 +111,9 @@ class ModelsTests(base.FunctionalTestCase):
                          "Invalid updated_at value")
         self.assertEqual(cluster_values["deleted_at"], cluster_db.deleted_at,
                          "Invalid deleted_at value")
+        self.assertEqual(cluster_values["error_detail"],
+                         cluster_db.error_detail,
+                         "Invalid error_detail value")
 
     def test_create_node_model(self):
         """Verifies a new cluster record is created in DB."""
diff --git a/cue/tests/functional/taskflow/flow/test_create_cluster.py b/cue/tests/functional/taskflow/flow/test_create_cluster.py
index cb174141..7dee71f3 100644
--- a/cue/tests/functional/taskflow/flow/test_create_cluster.py
+++ b/cue/tests/functional/taskflow/flow/test_create_cluster.py
@@ -93,7 +93,7 @@ class CreateClusterTests(base.FunctionalTestCase):
         cluster_values = {
             "project_id": self.context.tenant_id,
             "name": "RabbitCluster",
-            "network_id": str(uuid.uuid4()),
+            "network_id": self.valid_network['id'],
             "flavor": "1",
             "size": 3,
         }
@@ -165,7 +165,7 @@
         cluster_values = {
             "project_id": self.context.tenant_id,
             "name": "RabbitCluster",
-            "network_id": str(uuid.uuid4()),
+            "network_id": self.valid_network['id'],
             "flavor": "1",
             "size": 10,
         }
@@ -191,6 +191,55 @@
         self.assertEqual(vm_list, self.nova_client.servers.list())
         self.assertEqual(port_list, self.neutron_client.list_ports())
 
+    def test_create_cluster_invalid_user_network(self):
+        invalid_network_id = str(uuid.uuid4())
+        cluster_size = 3
+
+        flow_store = {
+            'image': self.valid_image.id,
+            'flavor': self.valid_flavor.id,
+            "port": self.port,
+            "context": self.context.to_dict(),
+            "erlang_cookie": str(uuid.uuid4()),
+            "default_rabbit_user": 'rabbit',
+            "default_rabbit_pass": str(uuid.uuid4()),
+        }
+
+        cluster_values = {
+            "project_id": self.context.tenant_id,
+            "name": "RabbitCluster",
+            "network_id": invalid_network_id,
+            "flavor": "1",
+            "size": cluster_size,
+        }
+
+        new_cluster = objects.Cluster(**cluster_values)
+        new_cluster.create(self.context)
+
+        nodes = objects.Node.get_nodes_by_cluster_id(self.context,
+                                                     new_cluster.id)
+
+        node_ids = []
+        for node in nodes:
+            node_ids.append(node.id)
+
+        flow = create_cluster(new_cluster.id,
+                              node_ids,
+                              invalid_network_id,
+                              self.management_network['id'])
+
+        try:
+            engines.run(flow, store=flow_store)
+        except taskflow_exc.WrappedFailure as err:
+            cluster_ref = objects.Cluster.get_cluster_by_id(self.context,
+                                                            new_cluster.id)
+            self.assertEqual(cluster_size, len(err._causes))
+            for failure in err._causes:
+                self.assertEqual(cluster_ref.error_detail,
+                                 failure.__str__())
+        else:
+            self.fail("Expected taskflow_exc.WrappedFailure exception.")
+
     def tearDown(self):
         for vm_id in self.new_vm_list:
             self.nova_client.servers.delete(vm_id)
diff --git a/os_tasklib/common/check_for.py b/os_tasklib/common/check_for.py
index 056e39a6..848e944a 100644
--- a/os_tasklib/common/check_for.py
+++ b/os_tasklib/common/check_for.py
@@ -24,11 +24,13 @@ class CheckFor(task.Task):
                  retry_delay_seconds=None,
                  retry_delay_ms=None,
                  name=None,
+                 details=None,
                  **kwargs):
         super(CheckFor, self).__init__(name=name, **kwargs)
 
         self.check_value = check_value
         self.sleep_time = 0
+        self.details = details
 
         if retry_delay_seconds:
             self.sleep_time = retry_delay_seconds
@@ -39,8 +41,12 @@
         if check_var == self.check_value:
             return self.check_value
         else:
-            raise AssertionError("expected %s, got %s" %
-                                 (self.check_value, check_var))
+            error_string = "expected %s, got %s" % (self.check_value,
+                                                    check_var)
+            if self.details is not None:
+                error_string += ", message: %s" % self.details
+
+            raise AssertionError(error_string)
 
     def revert(self, check_var, *args, **kwargs):
         if self.sleep_time != 0:
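
Illustrative sketch, separate from the patch itself: the helper names below
are invented, but they mimic the two behaviours the change introduces.
CheckFor appends its optional details string to the AssertionError message,
and the cluster record update joins the per-task failures into the single
error_detail string that the API now exposes.

    # sketch.py - standalone approximation of the error_detail flow
    def check_for(check_var, check_value, details=None):
        """Raise AssertionError on mismatch, like CheckFor.execute()."""
        if check_var == check_value:
            return check_value
        error_string = "expected %s, got %s" % (check_value, check_var)
        if details is not None:
            error_string += ", message: %s" % details
        raise AssertionError(error_string)


    def build_error_detail(flow_failures):
        """Join failures into one text blob, as stored in clusters.error_detail."""
        return '\n'.join(str(value) for value in flow_failures.values())


    if __name__ == '__main__':
        failures = {}
        for node in ('node_0', 'node_1', 'node_2'):
            try:
                # Simulate a VM that never reached ACTIVE status.
                check_for('ERROR', 'ACTIVE', details="waiting for ACTIVE VM status")
            except AssertionError as exc:
                failures[node] = exc

        # In Cue this string would be written to the cluster record and then
        # returned in the API's error_detail field.
        print(build_error_detail(failures))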