# mogan/mogan/engine/manager.py

# Copyright 2016 Huawei Technologies Co.,LTD.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from oslo_log import log
import oslo_messaging as messaging
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import timeutils

from mogan.common import exception
from mogan.common import flow_utils
from mogan.common.i18n import _
from mogan.common import states
from mogan.common import utils
from mogan.conf import CONF
from mogan.engine import base_manager
from mogan.engine.flows import create_instance
from mogan.notifications import base as notifications
from mogan import objects
from mogan.objects import fields

LOG = log.getLogger(__name__)


class EngineManager(base_manager.BaseEngineManager):
"""Mogan Engine manager main class."""

    RPC_API_VERSION = '1.0'

    target = messaging.Target(version=RPC_API_VERSION)

    def _get_compute_port(self, context, port_uuid):
"""Gets compute port by the uuid."""
try:
return objects.ComputePort.get(context, port_uuid)
except exception.NotFound:
LOG.warning("No compute port record for %(port)s",
{'port': port_uuid})
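            # Fall through and return None so the caller can create a
            # new record.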

    def _get_compute_node(self, context, node_uuid):
"""Gets compute node by the uuid."""
try:
return objects.ComputeNode.get(context, node_uuid)
except exception.NotFound:
LOG.warning("No compute node record for %(node)s",
{'node': node_uuid})

    def _init_compute_port(self, context, port):
        """Initialize the compute port if it does not already exist.

        :param context: security context
        :param port: initial values
        """
        # Try to get the compute port record from the database. If one
        # exists, update it with the driver-reported values.
cp = self._get_compute_port(context, port['port_uuid'])
if cp:
cp.update_from_driver(port)
cp.save()
return

        # There was no compute port in the database, so create a new one
        # initialized with the driver-reported port values.
cp = objects.ComputePort(context)
cp.update_from_driver(port)
cp.create()

    def _init_compute_node(self, context, node):
        """Initialize the compute node if it does not already exist.

        :param context: security context
        :param node: initial values
        """
        # Try to get the compute node record from the database. If one
        # exists, update it with the driver-reported values.
cn = self._get_compute_node(context, node['node_uuid'])
if cn:
cn.update_from_driver(node)
cn.save()
else:
            # There was no compute node in the database, so create a new
            # one initialized with the driver-reported node values.
cn = objects.ComputeNode(context)
cn.update_from_driver(node)
cn.create()
# Record compute ports to db
for port in node['ports']:
# initialize the compute port object, creating it
# if it does not already exist.
self._init_compute_port(context, port)

    @periodic_task.periodic_task(
        spacing=CONF.engine.update_resources_interval,
        run_immediately=True)
    def _update_available_resources(self, context):
        """See driver.get_available_resources()

        Periodic process that keeps the engine's understanding of
        resource availability in sync with the underlying hypervisor.

        :param context: security context
        """
nodes = self.driver.get_available_resources()
compute_nodes_in_db = objects.ComputeNodeList.get_all(context)
# Record compute nodes to db
for uuid, node in nodes.items():
# initialize the compute node object, creating it
# if it does not already exist.
self._init_compute_node(context, node)

        # Delete orphan compute nodes that are no longer reported by the
        # driver but are still in the db
        for cn in compute_nodes_in_db:
            if cn.node_uuid not in nodes:
                LOG.info("Deleting orphan compute node %(id)s",
                         {'id': cn.node_uuid})
cn.destroy()

    @periodic_task.periodic_task(spacing=CONF.engine.sync_power_state_interval,
run_immediately=True)
def _sync_power_states(self, context):
"""Align power states between the database and the hypervisor."""
        # Only the necessary fields are fetched, and synchronization is
        # skipped for nodes whose target_power_state is not None.
try:
nodes = self.driver.get_nodes_power_state()
except Exception as e:
            LOG.warning("Failed to retrieve node list when synchronizing "
                        "power states: %(msg)s", {"msg": e})
            # Just return if we fail to get the nodes' real power states.
            return
node_dict = {node.instance_uuid: node for node in nodes
if node.target_power_state is None}
if not node_dict:
LOG.warning("While synchronizing instance power states, "
"found none instance with stable power state "
"on the hypervisor.")
return

        def _sync(db_instance, node_power_state):
# This must be synchronized as we query state from two separate
# sources, the driver (ironic) and the database. They are set
# (in stop_instance) and read, in sync.
@utils.synchronized(db_instance.uuid)
def sync_instance_power_state():
self._sync_instance_power_state(context, db_instance,
node_power_state)

            try:
sync_instance_power_state()
except Exception:
LOG.exception("Periodic sync_power_state task had an "
"error while processing an instance.",
instance=db_instance)
self._syncs_in_progress.pop(db_instance.uuid)

        db_instances = objects.Instance.list(context)
for db_instance in db_instances:
# process syncs asynchronously - don't want instance locking to
# block entire periodic task thread
uuid = db_instance.uuid
if uuid in self._syncs_in_progress:
LOG.debug('Sync power state already in progress for %s', uuid)
continue
if db_instance.status not in (states.ACTIVE, states.STOPPED):
if db_instance.status in states.UNSTABLE_STATES:
LOG.info("During sync_power_state the instance has a "
"pending task (%(task)s). Skip.",
{'task': db_instance.status},
instance=db_instance)
continue
if uuid not in node_dict:
continue
node_power_state = node_dict[uuid].power_state
if db_instance.power_state != node_power_state:
LOG.debug('Triggering sync for uuid %s', uuid)
self._syncs_in_progress[uuid] = True
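                # _sync_power_pool is a green thread pool; spawn_n runs
                # _sync asynchronously so a slow node does not block this
                # periodic task.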
self._sync_power_pool.spawn_n(_sync, db_instance,
node_power_state)

    def _sync_instance_power_state(self, context, db_instance,
                                   node_power_state):
        """Align instance power state between the database and hypervisor.

        The latest instance record is re-read from the database, and its
        power_state is updated to match what the hypervisor reports.
        """
        # We re-query the DB to get the latest instance info to minimize
        # (not eliminate) race conditions.
db_instance.refresh()
db_power_state = db_instance.power_state
if db_instance.status not in (states.ACTIVE, states.STOPPED):
            # On the receiving end of mogan-engine, it could happen that
            # the DB instance already reports the new state but the
            # actual BM has not yet shown up on the hypervisor. In this
            # case, let the loop continue and run the state sync in a
            # later round.
            LOG.info("During sync_power_state the instance has a "
                     "pending task (%(task)s). Skip.",
                     {'task': db_instance.status},
                     instance=db_instance)
return
if node_power_state != db_power_state:
LOG.info('During _sync_instance_power_state the DB '
'power_state (%(db_power_state)s) does not match '
'the node_power_state from the hypervisor '
'(%(node_power_state)s). Updating power_state in the '
'DB to match the hypervisor.',
{'db_power_state': db_power_state,
'node_power_state': node_power_state},
instance=db_instance)
# power_state is always updated from hypervisor to db
db_instance.power_state = node_power_state
db_instance.save()

    @periodic_task.periodic_task(spacing=CONF.engine.sync_maintenance_interval,
run_immediately=True)
def _sync_maintenance_states(self, context):
"""Align maintenance states between the database and the hypervisor."""
try:
nodes = self.driver.get_maintenance_node_list()
except Exception as e:
            LOG.warning("Failed to retrieve node list when synchronizing "
                        "maintenance states: %(msg)s", {"msg": e})
            # Just return if we fail to get the nodes' maintenance states.
            return
node_dict = {node.instance_uuid: node for node in nodes}
if not node_dict:
LOG.warning("While synchronizing instance maintenance states, "
"found none node with instance associated on the "
"hypervisor.")
return
db_instances = objects.Instance.list(context)
for instance in db_instances:
uuid = instance.uuid
            # If the instance is in an unstable state and the node goes to
            # maintenance, just skip the syncing process, as the pending
            # task should go to the error state instead.
if instance.status in states.UNSTABLE_STATES:
LOG.info("During sync_maintenance_state the instance "
"has a pending task (%(task)s). Skip.",
{'task': instance.status},
instance=instance)
continue
if uuid not in node_dict:
continue
node_maintenance = node_dict[uuid].maintenance
if instance.status == states.MAINTENANCE and not node_maintenance:
                # TODO(zhenguo): need to check whether a state machine
                # transition is needed here; currently we just move to the
                # ACTIVE state regardless of the real power state, which the
                # sync power state periodic task may need to correct later.
instance.status = states.ACTIVE
instance.save()
elif node_maintenance and instance.status != states.MAINTENANCE:
instance.status = states.MAINTENANCE
instance.save()

    def destroy_networks(self, context, instance):
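        """Delete all network ports bound to the given instance."""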
ports = instance.nics.get_port_ids()
for port in ports:
self.network_api.delete_port(context, port, instance.uuid)

    def _unplug_vifs(self, context, instance):
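        """Detach VIFs from all physical ports of the instance's node."""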
LOG.debug("unplug: instance_uuid=%(uuid)s vif=%(instance_nics)s",
{'uuid': instance.uuid,
'instance_nics': str(instance.nics)})
bm_interface = self.driver.get_ports_from_node(instance.node_uuid)
for pif in bm_interface:
self.driver.unplug_vif(pif)

    def create_instance(self, context, instance, requested_networks,
request_spec=None, filter_properties=None):
"""Perform a deployment."""
LOG.debug("Starting instance...", instance=instance)
notifications.notify_about_instance_action(
context, instance, self.host,
action=fields.NotificationAction.CREATE,
phase=fields.NotificationPhase.START)
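
        # Build a state machine from the current status; the 'error' and
        # 'done' events fired below then move the instance through valid
        # state transitions only.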
fsm = utils.get_state_machine(start_state=instance.status,
target_state=states.ACTIVE)
if filter_properties is None:
filter_properties = {}
try:
node = self.scheduler_rpcapi.select_destinations(
context, request_spec, filter_properties)
instance.node_uuid = node['node_uuid']
instance.save()
except Exception as e:
utils.process_event(fsm, instance, event='error')
LOG.error("Created instance %(uuid)s failed."
"Exception: %(exception)s",
{"uuid": instance.uuid,
"exception": e})
return
try:
flow_engine = create_instance.get_flow(
context,
self,
instance,
requested_networks,
request_spec,
filter_properties,
)
except Exception:
utils.process_event(fsm, instance, event='error')
msg = _("Create manager instance flow failed.")
LOG.exception(msg)
raise exception.MoganException(msg)

        def _run_flow():
            # This code executes the create instance flow. If something
            # goes wrong, the flow reverts all the work that was done and
            # reraises the exception. Otherwise, all data generated by the
            # flow becomes available in the flow engine's storage.
with flow_utils.DynamicLogListener(flow_engine, logger=LOG):
flow_engine.run()

        try:
_run_flow()
except Exception as e:
instance.power_state = states.NOSTATE
utils.process_event(fsm, instance, event='error')
LOG.error("Created instance %(uuid)s failed."
"Exception: %(exception)s",
{"uuid": instance.uuid,
"exception": e})
else:
# Advance the state model for the given event. Note that this
# doesn't alter the instance in any way. This may raise
# InvalidState, if this event is not allowed in the current state.
instance.power_state = self.driver.get_power_state(context,
instance.uuid)
instance.launched_at = timeutils.utcnow()
utils.process_event(fsm, instance, event='done')
LOG.info("Created instance %s successfully.", instance.uuid)
finally:
return instance

    def _delete_instance(self, context, instance):
        """Delete an instance.

        :param context: mogan request context
        :param instance: instance object
        """
# TODO(zhenguo): Add delete notification
self.driver.destroy(context, instance)

    def delete_instance(self, context, instance):
"""Delete an instance."""
LOG.debug("Deleting instance...")
fsm = utils.get_state_machine(start_state=instance.status,
target_state=states.DELETED)

        @utils.synchronized(instance.uuid)
def do_delete_instance(instance):
try:
self._delete_instance(context, instance)
self._unplug_vifs(context, instance)
except exception.InstanceNotFound:
LOG.info("Instance disappeared during terminate",
instance=instance)
except Exception:
                # As we're trying to delete, always go to ERROR if
                # something goes wrong that _delete_instance can't handle.
with excutils.save_and_reraise_exception():
LOG.exception('Setting instance status to ERROR',
instance=instance)
instance.power_state = states.NOSTATE
utils.process_event(fsm, instance, event='error')

        do_delete_instance(instance)
instance.power_state = states.NOSTATE
utils.process_event(fsm, instance, event='done')
instance.destroy()

    def set_power_state(self, context, instance, state):
"""Set power state for the specified instance."""
fsm = utils.get_state_machine(start_state=instance.status)
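
        # No explicit target state here: the resulting status depends on
        # the requested power state, so we just fire a 'done' event after
        # the driver call succeeds.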
@utils.synchronized(instance.uuid)
def do_set_power_state():
LOG.debug('Power %(state)s called for instance %(instance)s',
{'state': state,
'instance': instance})
self.driver.set_power_state(context, instance, state)
do_set_power_state()
instance.power_state = self.driver.get_power_state(context,
instance.uuid)
utils.process_event(fsm, instance, event='done')
LOG.info('Successfully set node power state: %s',
state, instance=instance)

    def _rebuild_instance(self, context, instance):
"""Perform rebuild action on the specified instance."""
        # TODO(zhenguo): Add rebuild notification
self.driver.rebuild(context, instance)

    def rebuild_instance(self, context, instance):
        """Destroy and re-make this instance.

        :param context: mogan request context
        :param instance: instance object
        """
LOG.debug('Rebuilding instance', instance=instance)
fsm = utils.get_state_machine(start_state=instance.status)
try:
self._rebuild_instance(context, instance)
except Exception as e:
utils.process_event(fsm, instance, event='error')
LOG.error("Rebuild instance %(uuid)s failed."
"Exception: %(exception)s",
{"uuid": instance.uuid,
"exception": e})
return
utils.process_event(fsm, instance, event='done')
LOG.info('Instance was successfully rebuilt', instance=instance)

    def list_availability_zones(self, context):
"""Get availability zone list."""
compute_nodes = objects.ComputeNodeList.get_all_available(context)
azs = set()
for node in compute_nodes:
            az = (node.availability_zone or
                  CONF.engine.default_availability_zone)
if az is not None:
azs.add(az)
return {'availability_zones': list(azs)}