deb-sahara/sahara/service/direct_engine.py
Vitaly Gridnev 9941a8dbf8 Add context manager to assign events
In some cases (for example, when instances are not yet created) an instance
cannot be provided to assign failed or successful events to the current
cluster provisioning step. In such cases, we should use a context manager to
assign events to the cluster.

Change-Id: Id1e3b53e68892fc63afc02642b54256a40b8c19c
Implements: blueprint event-log
2015-01-22 11:22:26 +03:00
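Concretely, in this engine per-instance events are reported with cpo.event_wrapper and
cpo.add_successful_event, while _run_instance, which runs before the instance record
exists, is decorated with cpo.event_wrapper_without_instance and is called from
_start_instance under the context.InstanceInfoManager context manager, for example:

    with context.InstanceInfoManager(current_instance_info):
        instance_id = self._run_instance(
            cluster, node_group, idx,
            aa_group=aa_group, old_aa_groups=old_aa_groups)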


# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from novaclient import exceptions as nova_exceptions
from oslo.config import cfg
import six

from sahara import conductor as c
from sahara import context
from sahara import exceptions as exc
from sahara.i18n import _
from sahara.i18n import _LE
from sahara.i18n import _LI
from sahara.i18n import _LW
from sahara.openstack.common import log as logging
from sahara.service import engine as e
from sahara.service import networks
from sahara.service import volumes
from sahara.utils import cluster_progress_ops as cpo
from sahara.utils import general as g
from sahara.utils.openstack import neutron
from sahara.utils.openstack import nova


conductor = c.API

CONF = cfg.CONF

LOG = logging.getLogger(__name__)
SSH_PORT = 22
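
# DirectEngine provisions, scales, rolls back and shuts down clusters by
# calling Nova directly: it creates and deletes servers, auto security groups
# and anti-affinity server groups, and reports progress through the
# provisioning step/event helpers in cluster_progress_ops.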
class DirectEngine(e.Engine):
def get_type_and_version(self):
return "direct.1.0"
def create_cluster(self, cluster):
ctx = context.ctx()
self._update_rollback_strategy(cluster, shutdown=True)
# create all instances
cluster = g.change_cluster_status(cluster, "Spawning")
self._create_instances(cluster)
# wait until all instances are up and the networks are ready
cluster = g.change_cluster_status(cluster, "Waiting")
instances = g.get_instances(cluster)
self._await_active(cluster, instances)
self._assign_floating_ips(instances)
self._await_networks(cluster, instances)
cluster = conductor.cluster_get(ctx, cluster)
# attach volumes
volumes.attach_to_instances(g.get_instances(cluster))
# prepare all instances
cluster = g.change_cluster_status(cluster, "Preparing")
self._configure_instances(cluster)
self._update_rollback_strategy(cluster)
def scale_cluster(self, cluster, node_group_id_map):
ctx = context.ctx()
cluster = g.change_cluster_status(cluster, "Scaling")
instance_ids = self._scale_cluster_instances(cluster,
node_group_id_map)
self._update_rollback_strategy(cluster, instance_ids=instance_ids)
cluster = conductor.cluster_get(ctx, cluster)
g.clean_cluster_from_empty_ng(cluster)
cluster = conductor.cluster_get(ctx, cluster)
instances = g.get_instances(cluster, instance_ids)
self._await_active(cluster, instances)
self._assign_floating_ips(instances)
self._await_networks(cluster, instances)
cluster = conductor.cluster_get(ctx, cluster)
volumes.attach_to_instances(
g.get_instances(cluster, instance_ids))
# we should get here with a valid cluster: if instance creation
# was not successful, all extra instances will be removed above
if instance_ids:
self._configure_instances(cluster)
self._update_rollback_strategy(cluster)
return instance_ids
def rollback_cluster(self, cluster, reason):
rollback_info = cluster.rollback_info or {}
self._update_rollback_strategy(cluster)
if rollback_info.get('shutdown', False):
self._rollback_cluster_creation(cluster, reason)
return False
instance_ids = rollback_info.get('instance_ids', [])
if instance_ids:
self._rollback_cluster_scaling(
cluster, g.get_instances(cluster, instance_ids), reason)
return True
return False
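
# The rollback strategy is persisted on the cluster so that rollback_cluster()
# can tell whether to shut the whole cluster down (creation failed) or to
# remove only the listed instances (scaling failed).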
def _update_rollback_strategy(self, cluster, shutdown=False,
instance_ids=None):
rollback_info = {}
if shutdown:
rollback_info['shutdown'] = shutdown
if instance_ids:
rollback_info['instance_ids'] = instance_ids
cluster = conductor.cluster_update(
context.ctx(), cluster, {'rollback_info': rollback_info})
return cluster
# TODO(alazarev) remove when we fully switch to server groups
def _generate_anti_affinity_groups(self, cluster):
aa_groups = {}
for node_group in cluster.node_groups:
for instance in node_group.instances:
if instance.instance_id:
for process in node_group.node_processes:
if process in cluster.anti_affinity:
aa_group = aa_groups.get(process, [])
aa_group.append(instance.instance_id)
aa_groups[process] = aa_group
return aa_groups
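
# Creates auto security groups, optionally an anti-affinity server group,
# registers the "Run instances" provisioning step and then boots every
# instance of every node group.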
def _create_instances(self, cluster):
ctx = context.ctx()
cluster = self._create_auto_security_groups(cluster)
aa_group = None
if cluster.anti_affinity:
aa_group = self._create_aa_server_group(cluster)
cpo.add_provisioning_step(
cluster.id, _("Run instances"), g.count_instances(cluster))
for node_group in cluster.node_groups:
count = node_group.count
conductor.node_group_update(ctx, node_group, {'count': 0})
for idx in six.moves.xrange(1, count + 1):
self._start_instance(
cluster, node_group, idx, aa_group=aa_group)
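
# Creates a Nova server group with the 'anti-affinity' policy; fails if a
# group with the generated name already exists.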
def _create_aa_server_group(self, cluster):
server_group_name = g.generate_aa_group_name(cluster.name)
client = nova.client().server_groups
if client.findall(name=server_group_name):
raise exc.InvalidDataException(
_("Server group with name %s is already exists")
% server_group_name)
server_group = client.create(name=server_group_name,
policies=['anti-affinity'])
return server_group.id
def _delete_aa_server_group(self, cluster):
if cluster.anti_affinity:
server_group_name = g.generate_aa_group_name(cluster.name)
client = nova.client().server_groups
server_groups = client.findall(name=server_group_name)
if len(server_groups) == 1:
client.delete(server_groups[0].id)
def _find_aa_server_group(self, cluster):
server_group_name = g.generate_aa_group_name(cluster.name)
server_groups = nova.client().server_groups.findall(
name=server_group_name)
if len(server_groups) > 1:
raise exc.IncorrectStateError(
_("Several server groups with name %s found")
% server_group_name)
if len(server_groups) == 1:
return server_groups[0].id
return None
def _create_auto_security_groups(self, cluster):
ctx = context.ctx()
for node_group in cluster.node_groups:
if node_group.auto_security_group:
self._create_auto_security_group(node_group)
return conductor.cluster_get(ctx, cluster)
def _count_instances_to_scale(self, node_groups_to_enlarge,
node_group_id_map, cluster):
total_count = 0
if node_groups_to_enlarge:
for ng in cluster.node_groups:
if ng.id in node_groups_to_enlarge:
count = node_group_id_map[ng.id]
total_count += count - ng.count
return total_count
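
# Wraps _run_instance in the InstanceInfoManager context manager so that
# events emitted while the server is being created are attributed to the
# right cluster, node group and instance name even though the instance record
# does not exist yet.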
def _start_instance(self, cluster, node_group, idx, aa_group,
old_aa_groups=None):
instance_name = g.generate_instance_name(
cluster.name, node_group.name, idx)
current_instance_info = [
cluster.id, None, instance_name, node_group.id]
with context.InstanceInfoManager(current_instance_info):
instance_id = self._run_instance(
cluster, node_group, idx,
aa_group=aa_group, old_aa_groups=old_aa_groups)
return instance_id
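
# Scaling plan: compute which node groups shrink and which grow, delete and
# await the surplus instances (dropping auto security groups of emptied
# groups), then register an "Add instances" step and boot the new instances.
# Returns the ids of the added instances.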
def _scale_cluster_instances(self, cluster, node_group_id_map):
ctx = context.ctx()
aa_group = None
old_aa_groups = None
if cluster.anti_affinity:
aa_group = self._find_aa_server_group(cluster)
if not aa_group:
old_aa_groups = self._generate_anti_affinity_groups(cluster)
instances_to_delete = []
node_groups_to_enlarge = set()
node_groups_to_delete = set()
for node_group in cluster.node_groups:
new_count = node_group_id_map[node_group.id]
if new_count < node_group.count:
instances_to_delete += node_group.instances[new_count:
node_group.count]
if new_count == 0:
node_groups_to_delete.add(node_group.id)
elif new_count > node_group.count:
node_groups_to_enlarge.add(node_group.id)
if node_group.count == 0 and node_group.auto_security_group:
self._create_auto_security_group(node_group)
if instances_to_delete:
cluster = g.change_cluster_status(cluster, "Deleting Instances")
for instance in instances_to_delete:
self._shutdown_instance(instance)
self._await_deleted(cluster, instances_to_delete)
for ng in cluster.node_groups:
if ng.id in node_groups_to_delete:
self._delete_auto_security_group(ng)
cluster = conductor.cluster_get(ctx, cluster)
instances_to_add = []
if node_groups_to_enlarge:
cpo.add_provisioning_step(
cluster.id, _("Add instances"),
self._count_instances_to_scale(
node_groups_to_enlarge, node_group_id_map, cluster))
cluster = g.change_cluster_status(cluster, "Adding Instances")
for ng in cluster.node_groups:
if ng.id in node_groups_to_enlarge:
count = node_group_id_map[ng.id]
for idx in six.moves.xrange(ng.count + 1, count + 1):
instance_id = self._start_instance(
cluster, ng, idx, aa_group, old_aa_groups)
instances_to_add.append(instance_id)
return instances_to_add
def _map_security_groups(self, security_groups):
if not security_groups:
# Nothing to do here
return None
if CONF.use_neutron:
# When using Neutron, ids work fine.
return security_groups
else:
# Nova Network requires that security groups are passed by names.
# security_groups.get method accepts both ID and names, so in case
# IDs are provided they will be converted, otherwise the names will
# just map to themselves.
names = []
for group_id_or_name in security_groups:
group = nova.client().security_groups.get(group_id_or_name)
names.append(group.name)
return names
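
# The "without_instance" wrapper is needed because the instance record does
# not exist yet; with mark_successful_on_exit=True it records a successful
# event on the current provisioning step when _run_instance returns, and a
# failed event if it raises.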
@cpo.event_wrapper_without_instance(mark_successful_on_exit=True)
def _run_instance(self, cluster, node_group, idx, aa_group=None,
old_aa_groups=None):
"""Create instance using nova client and persist them into DB."""
ctx = context.ctx()
name = g.generate_instance_name(cluster.name, node_group.name, idx)
userdata = self._generate_user_data_script(node_group, name)
if old_aa_groups:
# aa_groups: node process -> instance ids
aa_ids = []
for node_process in node_group.node_processes:
aa_ids += old_aa_groups.get(node_process) or []
# create instances only on hosts that run no instances
# with aa-enabled processes
hints = {'different_host': sorted(set(aa_ids))} if aa_ids else None
else:
hints = {'group': aa_group} if (
aa_group and self._need_aa_server_group(node_group)) else None
security_groups = self._map_security_groups(node_group.security_groups)
nova_kwargs = {'scheduler_hints': hints, 'userdata': userdata,
'key_name': cluster.user_keypair_id,
'security_groups': security_groups,
'availability_zone': node_group.availability_zone}
if CONF.use_neutron:
net_id = cluster.neutron_management_network
nova_kwargs['nics'] = [{"net-id": net_id, "v4-fixed-ip": ""}]
nova_instance = nova.client().servers.create(name,
node_group.get_image_id(),
node_group.flavor_id,
**nova_kwargs)
instance_id = conductor.instance_add(ctx, node_group,
{"instance_id": nova_instance.id,
"instance_name": name})
if old_aa_groups:
# save instance id to aa_groups to support aa feature
for node_process in node_group.node_processes:
if node_process in cluster.anti_affinity:
aa_group_ids = old_aa_groups.get(node_process, [])
aa_group_ids.append(nova_instance.id)
old_aa_groups[node_process] = aa_group_ids
return instance_id
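
# Builds the per-node-group auto security group: SSH from anywhere, all
# TCP/UDP/ICMP traffic from the private network CIDRs (Neutron only) and the
# ports requested by the plugin; the new group id is appended to the node
# group's security_groups list.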
def _create_auto_security_group(self, node_group):
name = g.generate_auto_security_group_name(node_group)
nova_client = nova.client()
security_group = nova_client.security_groups.create(
name, "Auto security group created by Sahara for Node Group '%s' "
"of cluster '%s'." %
(node_group.name, node_group.cluster.name))
# the ssh remote needs the ssh port open; agents are not implemented yet
nova_client.security_group_rules.create(
security_group.id, 'tcp', SSH_PORT, SSH_PORT, "0.0.0.0/0")
# open all traffic for private networks
if CONF.use_neutron:
for cidr in neutron.get_private_network_cidrs(node_group.cluster):
for protocol in ['tcp', 'udp']:
nova_client.security_group_rules.create(
security_group.id, protocol, 1, 65535, cidr)
nova_client.security_group_rules.create(
security_group.id, 'icmp', -1, -1, cidr)
# enable ports returned by plugin
for port in node_group.open_ports:
nova_client.security_group_rules.create(
security_group.id, 'tcp', port, port, "0.0.0.0/0")
security_groups = list(node_group.security_groups or [])
security_groups.append(security_group.id)
conductor.node_group_update(context.ctx(), node_group,
{"security_groups": security_groups})
return security_groups
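
# A server group scheduler hint is only needed if the node group runs at
# least one process listed in the cluster's anti_affinity setting.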
def _need_aa_server_group(self, node_group):
for node_process in node_group.node_processes:
if node_process in node_group.cluster.anti_affinity:
return True
return False
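
# Assigns a floating IP to every instance whose node group has a floating IP
# pool configured.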
def _assign_floating_ips(self, instances):
for instance in instances:
node_group = instance.node_group
if node_group.floating_ip_pool:
networks.assign_floating_ip(instance.instance_id,
node_group.floating_ip_pool)
def _await_active(self, cluster, instances):
"""Await all instances are in Active status and available."""
if not instances:
return
cpo.add_provisioning_step(
cluster.id, _("Wait for instances to become active"),
len(instances))
active_ids = set()
while len(active_ids) != len(instances):
if not g.check_cluster_exists(cluster):
return
for instance in instances:
if instance.id not in active_ids:
if self._check_if_active(instance):
active_ids.add(instance.id)
cpo.add_successful_event(instance)
context.sleep(1)
LOG.info(_LI("Cluster '%s': all instances are active"), cluster.id)
def _await_deleted(self, cluster, instances):
"""Await all instances are deleted."""
if not instances:
return
cpo.add_provisioning_step(
cluster.id, _("Wait for instances to be deleted"), len(instances))
deleted_ids = set()
while len(deleted_ids) != len(instances):
if not g.check_cluster_exists(cluster):
return
for instance in instances:
if instance.id not in deleted_ids:
if self._check_if_deleted(instance):
LOG.debug("Instance '%s' is deleted" %
instance.instance_name)
deleted_ids.add(instance.id)
cpo.add_successful_event(instance)
context.sleep(1)
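
# mark_successful_on_exit=False: the success events for these checks are added
# explicitly by the polling loops above once an instance becomes active or
# disappears; the wrapper is only there to record failures.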
@cpo.event_wrapper(mark_successful_on_exit=False)
def _check_if_active(self, instance):
server = nova.get_instance_info(instance)
if server.status == 'ERROR':
raise exc.SystemError(_("Node %s has error status") % server.name)
return server.status == 'ACTIVE'
@cpo.event_wrapper(mark_successful_on_exit=False)
def _check_if_deleted(self, instance):
try:
nova.get_instance_info(instance)
except nova_exceptions.NotFound:
return True
return False
def _rollback_cluster_creation(self, cluster, ex):
"""Shutdown all instances and update cluster status."""
LOG.info(_LI("Cluster '%(name)s' creation rollback "
"(reason: %(reason)s)"),
{'name': cluster.name, 'reason': ex})
self.shutdown_cluster(cluster)
def _rollback_cluster_scaling(self, cluster, instances, ex):
"""Attempt to rollback cluster scaling."""
LOG.info(_LI("Cluster '%(name)s' scaling rollback "
"(reason: %(reason)s)"),
{'name': cluster.name, 'reason': ex})
for i in instances:
self._shutdown_instance(i)
cluster = conductor.cluster_get(context.ctx(), cluster)
g.clean_cluster_from_empty_ng(cluster)
def _shutdown_instances(self, cluster):
for node_group in cluster.node_groups:
for instance in node_group.instances:
self._shutdown_instance(instance)
self._await_deleted(cluster, node_group.instances)
self._delete_auto_security_group(node_group)
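
# The auto security group is always appended last to the node group's
# security_groups (see _create_auto_security_group), hence security_groups[-1]
# below; the name check guards against deleting a user-provided group.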
def _delete_auto_security_group(self, node_group):
if not node_group.auto_security_group:
return
if not node_group.security_groups:
# node group has no security groups
# nothing to delete
return
name = node_group.security_groups[-1]
try:
client = nova.client().security_groups
security_group = client.get(name)
if (security_group.name !=
g.generate_auto_security_group_name(node_group)):
LOG.warn(_LW("Auto security group for node group %s is not "
"found"), node_group.name)
return
client.delete(name)
except Exception:
LOG.exception(_LE("Failed to delete security group %s"), name)
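
# Releases the floating IP, detaches volumes and deletes the Nova server,
# tolerating resources that are already gone, then removes the instance from
# the DB.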
def _shutdown_instance(self, instance):
ctx = context.ctx()
if instance.node_group.floating_ip_pool:
try:
networks.delete_floating_ip(instance.instance_id)
except nova_exceptions.NotFound:
LOG.warn(_LW("Attempted to delete non-existent floating IP in "
"pool %(pool)s from instance %(instance)s"),
{'pool': instance.node_group.floating_ip_pool,
'instance': instance.instance_id})
try:
volumes.detach_from_instance(instance)
except Exception:
LOG.warn(_LW("Detaching volumes from instance %s failed"),
instance.instance_id)
try:
nova.client().servers.delete(instance.instance_id)
except nova_exceptions.NotFound:
LOG.warn(_LW("Attempted to delete non-existent instance %s"),
instance.instance_id)
conductor.instance_remove(ctx, instance)
def shutdown_cluster(self, cluster):
"""Shutdown specified cluster and all related resources."""
self._shutdown_instances(cluster)
self._clean_job_executions(cluster)
self._delete_aa_server_group(cluster)