magnum/magnum/conductor/handlers/cluster_conductor.py

# Copyright 2014 NEC Corporation.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from heatclient import exc
from oslo_log import log as logging
from pycadf import cadftaxonomy as taxonomy
import six

from magnum.common import clients
from magnum.common import exception
from magnum.common import profiler
from magnum.conductor.handlers.common import cert_manager
from magnum.conductor.handlers.common import trust_manager
from magnum.conductor import scale_manager
from magnum.conductor import utils as conductor_utils
import magnum.conf
from magnum.drivers.common import driver
from magnum.i18n import _
from magnum import objects
from magnum.objects import fields

CONF = magnum.conf.CONF

LOG = logging.getLogger(__name__)


@profiler.trace_cls("rpc")
class Handler(object):

    def __init__(self):
        super(Handler, self).__init__()

    # Cluster Operations

    def cluster_create(self, context, cluster, master_count, node_count,
                       create_timeout):
        LOG.debug('cluster_heat cluster_create')

        osc = clients.OpenStackClients(context)

        cluster.status = fields.ClusterStatus.CREATE_IN_PROGRESS
        cluster.status_reason = None
        cluster.node_count = node_count
        cluster.master_count = master_count
        cluster.create()

        # Master nodegroup
        master_ng = conductor_utils._get_nodegroup_object(
            context, cluster, master_count, is_master=True)
        master_ng.create()
        # Minion nodegroup
        minion_ng = conductor_utils._get_nodegroup_object(
            context, cluster, node_count, is_master=False)
        minion_ng.create()

        try:
            # Create trustee/trust and set them to cluster
            trust_manager.create_trustee_and_trust(osc, cluster)
            # Generate certificate and set the cert reference to cluster
            cert_manager.generate_certificates_to_cluster(cluster,
                                                          context=context)
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_PENDING)
            # Get driver
            cluster_driver = driver.Driver.get_driver_for_cluster(context,
                                                                  cluster)
            # Create cluster
            cluster_driver.create_cluster(context, cluster, create_timeout)
            cluster.save()

        except Exception as e:
            cluster.status = fields.ClusterStatus.CREATE_FAILED
            cluster.status_reason = six.text_type(e)
            cluster.save()
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_CREATE, taxonomy.OUTCOME_FAILURE)

            if isinstance(e, exc.HTTPBadRequest):
                e = exception.InvalidParameterValue(message=six.text_type(e))

                raise e
            raise

        return cluster

    def cluster_update(self, context, cluster, node_count, rollback=False):
        LOG.debug('cluster_heat cluster_update')

        osc = clients.OpenStackClients(context)
        allow_update_status = (
            fields.ClusterStatus.CREATE_COMPLETE,
            fields.ClusterStatus.UPDATE_COMPLETE,
            fields.ClusterStatus.RESUME_COMPLETE,
            fields.ClusterStatus.RESTORE_COMPLETE,
            fields.ClusterStatus.ROLLBACK_COMPLETE,
            fields.ClusterStatus.SNAPSHOT_COMPLETE,
            fields.ClusterStatus.CHECK_COMPLETE,
            fields.ClusterStatus.ADOPT_COMPLETE
        )
        if cluster.status not in allow_update_status:
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
            operation = _('Updating a cluster when status is '
                          '"%s"') % cluster.status
            raise exception.NotSupported(operation=operation)

        # Updates will be only reflected to the default worker
        # nodegroup.
        worker_ng = cluster.default_ng_worker
        if worker_ng.node_count == node_count:
            return
        # Backup the old node count so that we can restore it
        # in case of an exception.
        old_node_count = worker_ng.node_count

        manager = scale_manager.get_scale_manager(context, osc, cluster)

        # Get driver
        ct = conductor_utils.retrieve_cluster_template(context, cluster)
        cluster_driver = driver.Driver.get_driver(ct.server_type,
                                                  ct.cluster_distro,
                                                  ct.coe)
        # Update cluster
        try:
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
            worker_ng.node_count = node_count
            worker_ng.save()
            # For now update also the cluster.node_count
            cluster.node_count = node_count
            cluster_driver.update_cluster(context, cluster, manager, rollback)
            cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
            cluster.status_reason = None
        except Exception as e:
            cluster.status = fields.ClusterStatus.UPDATE_FAILED
            cluster.status_reason = six.text_type(e)
            cluster.save()
            # Restore the node_count
            worker_ng.node_count = old_node_count
            worker_ng.save()
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
            if isinstance(e, exc.HTTPBadRequest):
                e = exception.InvalidParameterValue(message=six.text_type(e))
                raise e
            raise

        cluster.save()
        return cluster

    def cluster_delete(self, context, uuid):
        LOG.debug('cluster_conductor cluster_delete')
        osc = clients.OpenStackClients(context)
        cluster = objects.Cluster.get_by_uuid(context, uuid)
        ct = conductor_utils.retrieve_cluster_template(context, cluster)
        cluster_driver = driver.Driver.get_driver(ct.server_type,
                                                  ct.cluster_distro,
                                                  ct.coe)
        try:
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_PENDING)
            cluster_driver.delete_cluster(context, cluster)
            cluster.status = fields.ClusterStatus.DELETE_IN_PROGRESS
            cluster.status_reason = None
        except exc.HTTPNotFound:
            LOG.info('The cluster %s was not found during cluster'
                     ' deletion.', cluster.id)
            try:
                trust_manager.delete_trustee_and_trust(osc, context, cluster)
                cert_manager.delete_certificates_from_cluster(cluster,
                                                              context=context)
                # delete all cluster's nodegroups
                for ng in cluster.nodegroups:
                    ng.destroy()
                cluster.destroy()
            except exception.ClusterNotFound:
                LOG.info('The cluster %s has been deleted by others.',
                         uuid)
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_SUCCESS)
            return None
        except exc.HTTPConflict:
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
            raise exception.OperationInProgress(cluster_name=cluster.name)
        except Exception as unexp:
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_DELETE, taxonomy.OUTCOME_FAILURE)
            cluster.status = fields.ClusterStatus.DELETE_FAILED
            cluster.status_reason = six.text_type(unexp)
            cluster.save()
            raise

        cluster.save()
        return None

    def cluster_resize(self, context, cluster,
                       node_count, nodes_to_remove, nodegroup):
        LOG.debug('cluster_conductor cluster_resize')

        osc = clients.OpenStackClients(context)
        # NOTE(flwang): One of important user cases of /resize API is
        # supporting the auto scaling action triggered by Kubernetes Cluster
        # Autoscaler, so there are 2 cases may happen:
        # 1. API could be triggered very offen
        # 2. Scale up or down may fail and we would like to offer the ability
        #    that recover the cluster to allow it being resized when last
        #    update failed.
        allow_update_status = (
            fields.ClusterStatus.CREATE_COMPLETE,
            fields.ClusterStatus.UPDATE_COMPLETE,
            fields.ClusterStatus.RESUME_COMPLETE,
            fields.ClusterStatus.RESTORE_COMPLETE,
            fields.ClusterStatus.ROLLBACK_COMPLETE,
            fields.ClusterStatus.SNAPSHOT_COMPLETE,
            fields.ClusterStatus.CHECK_COMPLETE,
            fields.ClusterStatus.ADOPT_COMPLETE,
            fields.ClusterStatus.UPDATE_FAILED,
            fields.ClusterStatus.UPDATE_IN_PROGRESS,
        )
        if cluster.status not in allow_update_status:
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
            operation = _('Resizing a cluster when status is '
                          '"%s"') % cluster.status
            raise exception.NotSupported(operation=operation)

        resize_manager = scale_manager.get_scale_manager(context, osc, cluster)

        # Get driver
        ct = conductor_utils.retrieve_cluster_template(context, cluster)
        cluster_driver = driver.Driver.get_driver(ct.server_type,
                                                  ct.cluster_distro,
                                                  ct.coe)
        # Backup the old node count so that we can restore it
        # in case of an exception.
        old_node_count = nodegroup.node_count

        # Resize cluster
        try:
            nodegroup.node_count = node_count
            nodegroup.save()
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_PENDING)
            cluster_driver.resize_cluster(context, cluster, resize_manager,
                                          node_count, nodes_to_remove,
                                          nodegroup)
            cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
            cluster.status_reason = None
        except Exception as e:
            cluster.status = fields.ClusterStatus.UPDATE_FAILED
            cluster.status_reason = six.text_type(e)
            cluster.save()
            nodegroup.node_count = old_node_count
            nodegroup.save()
            conductor_utils.notify_about_cluster_operation(
                context, taxonomy.ACTION_UPDATE, taxonomy.OUTCOME_FAILURE)
            if isinstance(e, exc.HTTPBadRequest):
                e = exception.InvalidParameterValue(message=six.text_type(e))
                raise e
            raise

        cluster.save()
        return cluster