From 1be42880706dd45d15686bc3eabeb58af546bfb0 Mon Sep 17 00:00:00 2001
From: dagnello
Date: Wed, 11 Nov 2015 12:50:10 -0800
Subject: [PATCH] Adding support for error details on cluster creation

Cue's current implementation does not provide any additional
information to the user related to why a cluster went into ERROR state
during cluster creation. This patch adds an error_detail field to the
clusters table and exposes this field through the API. The
update_cluster_task now extracts failure flow details and forwards this
information when the cluster record is updated.

Partial-Bug: 1508730 (updates also required to Cue client and dashboard)
Closes-bug: 1516735
Change-Id: I440fec30bc3d051189585479b8d2c1219a98cf40
---
 cue/api/controllers/v1/cluster.py              |  3 ++
 ...8e0479e_add_cluster_error_detail_column.py  | 41 ++++++++++++++
 cue/db/sqlalchemy/models.py                    |  1 +
 cue/objects/cluster.py                         |  1 +
 cue/taskflow/flow/create_cluster.py            |  1 +
 cue/taskflow/flow/create_cluster_node.py       |  1 +
 .../task/update_cluster_record_task.py         |  6 +++
 .../db/manage/database/test_migration.py       |  2 +
 cue/tests/functional/db/test_models.py         |  7 +++
 .../taskflow/flow/test_create_cluster.py       | 53 ++++++++++++++++++-
 os_tasklib/common/check_for.py                 | 10 +++-
 11 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py

diff --git a/cue/api/controllers/v1/cluster.py b/cue/api/controllers/v1/cluster.py
index 2cb0c88b..ba235ba1 100644
--- a/cue/api/controllers/v1/cluster.py
+++ b/cue/api/controllers/v1/cluster.py
@@ -118,6 +118,9 @@ class Cluster(base.APIBase):
     authentication = wtypes.wsattr(AuthenticationCredential)
     "Authentication for accessing message brokers"
 
+    error_detail = wsme.wsattr(wtypes.text, mandatory=False)
+    "Error detail(s) associated with cluster"
+
 
 def get_complete_cluster(context, cluster_id):
     """Helper to retrieve the api-compatible full structure of a cluster."""
diff --git a/cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py b/cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py
new file mode 100644
index 00000000..0e33dcd3
--- /dev/null
+++ b/cue/db/sqlalchemy/alembic/versions/17c428e0479e_add_cluster_error_detail_column.py
@@ -0,0 +1,41 @@
+# -*- encoding: utf-8 -*-
+#
+# Copyright 2015 Hewlett-Packard Development Company, L.P.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""add error_detail column in clusters
+
+Revision ID: 17c428e0479e
+Revises: 244aa473e595
+Create Date: 2015-11-11 12:01:10.769280
+
+"""
+
+# revision identifiers, used by Alembic.
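+# 'revision' names this migration; 'down_revision' points at the previous
+# migration ('244aa473e595') so Alembic knows the order in which to apply
+# upgrades and downgrades.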
+revision = '17c428e0479e'
+down_revision = '244aa473e595'
+
+from alembic import op
+from oslo_config import cfg
+import sqlalchemy as sa
+
+
+def upgrade():
+    op.add_column('clusters', sa.Column('error_detail', sa.Text))
+
+
+def downgrade():
+    db_connection = cfg.CONF.database.connection
+    if db_connection != "sqlite://":  # pragma: nocover
+        op.drop_column('clusters', 'error_detail')
\ No newline at end of file
diff --git a/cue/db/sqlalchemy/models.py b/cue/db/sqlalchemy/models.py
index 50ccc79f..0456f8a7 100644
--- a/cue/db/sqlalchemy/models.py
+++ b/cue/db/sqlalchemy/models.py
@@ -73,6 +73,7 @@ class Cluster(base.BASE, base.IdMixin, models.TimestampMixin,
     flavor = sa.Column(sa.String(50), nullable=False)
     size = sa.Column(sa.Integer(), default=1, nullable=False)
     volume_size = sa.Column(sa.Integer(), nullable=True)
+    error_detail = sa.Column(sa.Text(), nullable=True)
 
     sa.Index("clusters_cluster_id_idx", "cluster_id", unique=True)
 
diff --git a/cue/objects/cluster.py b/cue/objects/cluster.py
index 8cb68295..a3f3db8e 100644
--- a/cue/objects/cluster.py
+++ b/cue/objects/cluster.py
@@ -38,6 +38,7 @@ class Cluster(base.CueObject):
        'created_at': obj_utils.datetime_or_str_or_none,
        'updated_at': obj_utils.datetime_or_str_or_none,
        'deleted_at': obj_utils.datetime_or_str_or_none,
+       'error_detail': obj_utils.str_or_none,
     }
 
     @staticmethod
diff --git a/cue/taskflow/flow/create_cluster.py b/cue/taskflow/flow/create_cluster.py
index 2a77638d..b6bbf270 100644
--- a/cue/taskflow/flow/create_cluster.py
+++ b/cue/taskflow/flow/create_cluster.py
@@ -74,6 +74,7 @@ def create_cluster(cluster_id, node_ids, user_network_id,
                            inject={'proto': 'http'}),
         os_common.CheckFor(
             name="check cluster status",
+            details="waiting for RabbitMQ clustered status",
             rebind={'check_var': "clustering_status"},
             check_value='OK',
             retry_delay_seconds=10),
diff --git a/cue/taskflow/flow/create_cluster_node.py b/cue/taskflow/flow/create_cluster_node.py
index 775f0856..adfeee5e 100644
--- a/cue/taskflow/flow/create_cluster_node.py
+++ b/cue/taskflow/flow/create_cluster_node.py
@@ -153,6 +153,7 @@ def create_cluster_node(cluster_id, node_number, node_id, graph_flow,
                            provides="vm_status_%d" % node_number),
         os_common.CheckFor(
             name="check vm status %s" % node_name,
+            details="waiting for ACTIVE VM status",
             rebind={'check_var': "vm_status_%d" % node_number},
             check_value='ACTIVE',
             retry_delay_seconds=10),
diff --git a/cue/taskflow/task/update_cluster_record_task.py b/cue/taskflow/task/update_cluster_record_task.py
index 2d2ed8d2..3327c9a4 100644
--- a/cue/taskflow/task/update_cluster_record_task.py
+++ b/cue/taskflow/task/update_cluster_record_task.py
@@ -68,5 +68,11 @@ class UpdateClusterRecord(task.Task):
         else:
             cluster_values['status'] = models.Status.ERROR
 
+        # Extract exception information
+        if 'flow_failures' in kwargs:
+            cluster_values['error_detail'] = '\n'.join(
+                [str(value) for value in
+                 kwargs['flow_failures'].values()])
+
         cluster = objects.Cluster(**cluster_values)
         cluster.update(request_context, cluster_id)
\ No newline at end of file
diff --git a/cue/tests/functional/db/manage/database/test_migration.py b/cue/tests/functional/db/manage/database/test_migration.py
index d26ea0c3..57516e9e 100644
--- a/cue/tests/functional/db/manage/database/test_migration.py
+++ b/cue/tests/functional/db/manage/database/test_migration.py
@@ -53,6 +53,8 @@ class MigrationFunctionalTests(base.BaseTestCase):
         ('3917e931a55a', ['clusters', 'endpoints', 'nodes', 'broker',
                           'broker_metadata']),
         ('244aa473e595', ['clusters', 'endpoints', 'nodes', 'broker',
+                          'broker_metadata']),
+        ('17c428e0479e', ['clusters', 'endpoints', 'nodes', 'broker',
                           'broker_metadata'])
     ])
 
diff --git a/cue/tests/functional/db/test_models.py b/cue/tests/functional/db/test_models.py
index f1916296..4fc4b3da 100644
--- a/cue/tests/functional/db/test_models.py
+++ b/cue/tests/functional/db/test_models.py
@@ -46,6 +46,7 @@ class ModelsTests(base.FunctionalTestCase):
             "created_at": timeutils.utcnow(),
             "updated_at": timeutils.utcnow(),
             "deleted_at": timeutils.utcnow(),
+            "error_detail": "My cluster's error(s) detail",
         }
 
         cluster = models.Cluster()
@@ -75,6 +76,9 @@ class ModelsTests(base.FunctionalTestCase):
                          "Invalid updated_at value")
         self.assertEqual(cluster_values["deleted_at"], cluster.deleted_at,
                          "Invalid deleted_at value")
+        self.assertEqual(cluster_values["error_detail"],
+                         cluster.error_detail,
+                         "Invalid error_detail value")
 
         db_session = sql_api.get_session()
         cluster.save(db_session)
@@ -107,6 +111,9 @@ class ModelsTests(base.FunctionalTestCase):
                          "Invalid updated_at value")
         self.assertEqual(cluster_values["deleted_at"], cluster_db.deleted_at,
                          "Invalid deleted_at value")
+        self.assertEqual(cluster_values["error_detail"],
+                         cluster_db.error_detail,
+                         "Invalid error_detail value")
 
     def test_create_node_model(self):
         """Verifies a new cluster record is created in DB."""
diff --git a/cue/tests/functional/taskflow/flow/test_create_cluster.py b/cue/tests/functional/taskflow/flow/test_create_cluster.py
index cb174141..7dee71f3 100644
--- a/cue/tests/functional/taskflow/flow/test_create_cluster.py
+++ b/cue/tests/functional/taskflow/flow/test_create_cluster.py
@@ -93,7 +93,7 @@ class CreateClusterTests(base.FunctionalTestCase):
         cluster_values = {
             "project_id": self.context.tenant_id,
             "name": "RabbitCluster",
-            "network_id": str(uuid.uuid4()),
+            "network_id": self.valid_network['id'],
             "flavor": "1",
             "size": 3,
         }
@@ -165,7 +165,7 @@
         cluster_values = {
             "project_id": self.context.tenant_id,
             "name": "RabbitCluster",
-            "network_id": str(uuid.uuid4()),
+            "network_id": self.valid_network['id'],
             "flavor": "1",
             "size": 10,
         }
@@ -191,6 +191,55 @@
         self.assertEqual(vm_list, self.nova_client.servers.list())
         self.assertEqual(port_list, self.neutron_client.list_ports())
 
+    def test_create_cluster_invalid_user_network(self):
+        invalid_network_id = str(uuid.uuid4())
+        cluster_size = 3
+
+        flow_store = {
+            'image': self.valid_image.id,
+            'flavor': self.valid_flavor.id,
+            "port": self.port,
+            "context": self.context.to_dict(),
+            "erlang_cookie": str(uuid.uuid4()),
+            "default_rabbit_user": 'rabbit',
+            "default_rabbit_pass": str(uuid.uuid4()),
+        }
+
+        cluster_values = {
+            "project_id": self.context.tenant_id,
+            "name": "RabbitCluster",
+            "network_id": invalid_network_id,
+            "flavor": "1",
+            "size": cluster_size,
+        }
+
+        new_cluster = objects.Cluster(**cluster_values)
+        new_cluster.create(self.context)
+
+        nodes = objects.Node.get_nodes_by_cluster_id(self.context,
+                                                     new_cluster.id)
+
+        node_ids = []
+        for node in nodes:
+            node_ids.append(node.id)
+
+        flow = create_cluster(new_cluster.id,
+                              node_ids,
+                              invalid_network_id,
+                              self.management_network['id'])
+
+        try:
+            engines.run(flow, store=flow_store)
+        except taskflow_exc.WrappedFailure as err:
+            cluster_ref = objects.Cluster.get_cluster_by_id(self.context,
+                                                            new_cluster.id)
+            self.assertEqual(cluster_size, len(err._causes))
+            for failure in err._causes:
+                self.assertEqual(cluster_ref.error_detail,
+                                 failure.__str__())
+        else:
+            self.fail("Expected taskflow_exc.WrappedFailure exception.")
+
     def tearDown(self):
         for vm_id in self.new_vm_list:
             self.nova_client.servers.delete(vm_id)
diff --git a/os_tasklib/common/check_for.py b/os_tasklib/common/check_for.py
index 056e39a6..848e944a 100644
--- a/os_tasklib/common/check_for.py
+++ b/os_tasklib/common/check_for.py
@@ -24,11 +24,13 @@ class CheckFor(task.Task):
                  retry_delay_seconds=None,
                  retry_delay_ms=None,
                  name=None,
+                 details=None,
                  **kwargs):
         super(CheckFor, self).__init__(name=name, **kwargs)
 
         self.check_value = check_value
         self.sleep_time = 0
+        self.details = details
 
         if retry_delay_seconds:
             self.sleep_time = retry_delay_seconds
@@ -39,8 +41,12 @@
         if check_var == self.check_value:
             return self.check_value
         else:
-            raise AssertionError("expected %s, got %s" %
-                                 (self.check_value, check_var))
+            error_string = "expected %s, got %s" % (self.check_value,
+                                                    check_var)
+            if self.details is not None:
+                error_string += ", message: %s" % self.details
+
+            raise AssertionError(error_string)
 
     def revert(self, check_var, *args, **kwargs):
         if self.sleep_time != 0:
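
Illustrative sketch, separate from the patch itself: the helper names below
are invented, but they mimic the two behaviours the change introduces.
CheckFor appends its optional details string to the AssertionError message,
and the cluster record update joins the per-task failures into the single
error_detail string that the API now exposes.

    # sketch.py - standalone approximation of the error_detail flow
    def check_for(check_var, check_value, details=None):
        """Raise AssertionError on mismatch, like CheckFor.execute()."""
        if check_var == check_value:
            return check_value
        error_string = "expected %s, got %s" % (check_value, check_var)
        if details is not None:
            error_string += ", message: %s" % details
        raise AssertionError(error_string)


    def build_error_detail(flow_failures):
        """Join failures into one text blob, as stored in clusters.error_detail."""
        return '\n'.join(str(value) for value in flow_failures.values())


    if __name__ == '__main__':
        failures = {}
        for node in ('node_0', 'node_1', 'node_2'):
            try:
                # Simulate a VM that never reached ACTIVE status.
                check_for('ERROR', 'ACTIVE', details="waiting for ACTIVE VM status")
            except AssertionError as exc:
                failures[node] = exc

        # In Cue this string would be written to the cluster record and then
        # returned in the API's error_detail field.
        print(build_error_detail(failures))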