fenix/fenix/workflow/workflows/default.py

# Copyright (c) 2018 OpenStack Foundation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
from importlib import import_module
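# Loading helper for downloaded action plug-ins: prefer importlib's
# SourceFileLoader and fall back to imp.load_source only on older
# interpreters where the deprecated imp module is the available option.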
try:
from importlib.machinery import SourceFileLoader
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = SourceFileLoader(mname, mpath).load_module()
return mi.ActionPlugin(session_instance, ap_db_instance)
except ImportError:
from imp import load_source
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = load_source(mname, mpath)
return mi.ActionPlugin(session_instance, ap_db_instance)
from novaclient import API_MAX_VERSION as nova_max_version
import novaclient.client as novaclient
from novaclient.exceptions import BadRequest
import os
from oslo_log import log as logging
import time
from fenix.db import api as db_api
from fenix.utils.time import datetime_to_str
from fenix.utils.time import is_time_after_time
from fenix.utils.time import reply_time_str
from fenix.utils.time import time_now_str
from fenix.workflow.workflow import BaseWorkflow
LOG = logging.getLogger(__name__)
class Workflow(BaseWorkflow):
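    """Default OpenStack infrastructure maintenance workflow.

    Discovers controller and compute hosts from Nova services, negotiates
    the maintenance window with projects, empties compute hosts by moving
    their instances and runs the configured action plug-ins on each host.
    """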
def __init__(self, conf, session_id, data):
super(Workflow, self).__init__(conf, session_id, data)
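        # Nova microversion 2.53 is used as the baseline: from 2.53 on,
        # services are identified by UUID (stored in host.details below).
        # If both the client and the server support something newer, the
        # client is re-created with the highest mutually supported version.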
nova_version = 2.53
self.nova = novaclient.Client(nova_version, session=self.auth_session)
max_nova_server_ver = float(self.nova.versions.get_current().version)
max_nova_client_ver = float(nova_max_version.get_string())
if max_nova_server_ver > 2.53 and max_nova_client_ver > 2.53:
if max_nova_client_ver <= max_nova_server_ver:
nova_version = max_nova_client_ver
else:
nova_version = max_nova_server_ver
self.nova = novaclient.Client(nova_version,
session=self.auth_session)
if not self.hosts:
self.hosts = self._init_hosts_by_services()
else:
self._init_update_hosts()
LOG.info("%s: initialized. Nova version %f" % (self.session_id,
nova_version))
LOG.info('%s: Execute pre action plugins' % (self.session_id))
self.maintenance_by_plugin_type("localhost", "pre")
def _init_hosts_by_services(self):
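        """Create the initial host list from Nova services.

        nova-conductor services map to 'controller' hosts and nova-compute
        services to 'compute' hosts; a service already disabled before the
        maintenance aborts the session.
        """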
LOG.info("%s: Dicovering hosts by Nova services" % self.session_id)
hosts = []
hostnames = []
controllers = self.nova.services.list(binary='nova-conductor')
for controller in controllers:
host = {}
service_host = str(controller.__dict__.get(u'host'))
if service_host in hostnames:
continue
host['hostname'] = service_host
hostnames.append(service_host)
host['type'] = 'controller'
if str(controller.__dict__.get(u'status')) == 'disabled':
LOG.error("%s: %s nova-conductor disabled before maintenance"
% (self.session_id, service_host))
raise Exception("%s: %s already disabled"
% (self.session_id, service_host))
host['disabled'] = False
host['details'] = str(controller.__dict__.get(u'id'))
host['maintained'] = False
hosts.append(host)
computes = self.nova.services.list(binary='nova-compute')
for compute in computes:
host = {}
service_host = str(compute.__dict__.get(u'host'))
host['hostname'] = service_host
host['type'] = 'compute'
if str(compute.__dict__.get(u'status')) == 'disabled':
LOG.error("%s: %s nova-compute disabled before maintenance"
% (self.session_id, service_host))
raise Exception("%s: %s already disabled"
% (self.session_id, service_host))
host['disabled'] = False
host['details'] = str(compute.__dict__.get(u'id'))
host['maintained'] = False
hosts.append(host)
return db_api.create_hosts_by_details(self.session_id, hosts)
def _init_update_hosts(self):
LOG.info("%s: Update given hosts" % self.session_id)
controllers = self.nova.services.list(binary='nova-conductor')
computes = self.nova.services.list(binary='nova-compute')
for host in self.hosts:
hostname = host.hostname
host.disabled = False
host.maintained = False
match = [compute for compute in computes if
hostname == compute.host]
if match:
host.type = 'compute'
if match[0].status == 'disabled':
LOG.error("%s: %s nova-compute disabled before maintenance"
% (self.session_id, hostname))
raise Exception("%s: %s already disabled"
% (self.session_id, hostname))
host.details = match[0].id
continue
if ([controller for controller in controllers if
hostname == controller.host]):
host.type = 'controller'
continue
host.type = 'other'
def disable_host_nova_compute(self, hostname):
LOG.info('%s: disable nova-compute on host %s' % (self.session_id,
hostname))
host = self.get_host_by_name(hostname)
try:
self.nova.services.disable_log_reason(host.details, "maintenance")
except TypeError:
LOG.debug('%s: Using old API to disable nova-compute on host %s' %
(self.session_id, hostname))
self.nova.services.disable_log_reason(hostname, "nova-compute",
"maintenance")
host.disabled = True
def enable_host_nova_compute(self, hostname):
LOG.info('%s: enable nova-compute on host %s' % (self.session_id,
hostname))
host = self.get_host_by_name(hostname)
try:
self.nova.services.enable(host.details)
except TypeError:
LOG.debug('%s: Using old API to enable nova-compute on host %s' %
(self.session_id, hostname))
self.nova.services.enable(hostname, "nova-compute")
host.disabled = False
def get_compute_hosts(self):
return [host.hostname for host in self.hosts
if host.type == 'compute']
def get_empty_computes(self):
all_computes = self.get_compute_hosts()
instance_computes = []
for instance in self.instances:
if instance.host not in instance_computes:
instance_computes.append(instance.host)
return [host for host in all_computes if host not in instance_computes]
def get_instance_details(self, instance):
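        """Return "floating_ip" if the instance has a floating ip, else None."""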
network_interfaces = next(iter(instance.addresses.values()))
for network_interface in network_interfaces:
_type = network_interface.get('OS-EXT-IPS:type')
if _type == "floating":
LOG.info('Instance with floating ip: %s %s' %
(instance.id, instance.name))
return "floating_ip"
return None
def _fenix_instance(self, project_id, instance_id, instance_name, host,
state, details, action=None, project_state=None,
action_done=False):
instance = {'session_id': self.session_id,
'instance_id': instance_id,
'action': action,
'project_id': project_id,
'project_state': project_state,
'state': state,
'instance_name': instance_name,
'action_done': action_done,
'host': host,
'details': details}
return instance
def initialize_server_info(self):
project_ids = []
instances = []
compute_hosts = self.get_compute_hosts()
opts = {'all_tenants': True}
servers = self.nova.servers.list(detailed=True, search_opts=opts)
for server in servers:
try:
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
if host not in compute_hosts:
continue
project_id = str(server.tenant_id)
instance_name = str(server.name)
instance_id = str(server.id)
details = self.get_instance_details(server)
state = str(server.__dict__.get('OS-EXT-STS:vm_state'))
except Exception:
raise Exception('can not get params from server=%s' % server)
instances.append(self._fenix_instance(project_id, instance_id,
instance_name, host, state,
details))
if project_id not in project_ids:
project_ids.append(project_id)
if len(project_ids):
self.projects = self.init_projects(project_ids)
else:
            LOG.info('%s: No projects on computes under maintenance' %
                     self.session_id)
if len(instances):
self.instances = self.add_instances(instances)
else:
            LOG.info('%s: No instances on computes under maintenance' %
                     self.session_id)
LOG.info(str(self))
def update_instance(self, project_id, instance_id, instance_name, host,
state, details):
if self.instance_id_found(instance_id):
# TBD Might need to update instance variables here if not done
# somewhere else
return
elif self.instance_name_found(instance_name):
            # Project has re-instantiated; remove the old instance, add the new
old_instance = self.instance_by_name(instance_name)
instance = self._fenix_instance(project_id, instance_id,
instance_name, host,
state, details,
old_instance.action,
old_instance.project_state,
old_instance.action_done)
self.instances.append(self.add_instance(instance))
self.remove_instance(old_instance)
else:
# Instance new, as project has added instances
instance = self._fenix_instance(project_id, instance_id,
instance_name, host,
state, details)
self.instances.append(self.add_instance(instance))
def remove_non_existing_instances(self, instance_ids):
remove_instances = [instance for instance in
self.instances if instance.instance_id not in
instance_ids]
for instance in remove_instances:
# Instance deleted, as project possibly scaled down
self.remove_instance(instance)
def update_server_info(self):
        # TBD This keeps the internal instance information up-to-date and
        # prints it out. The same could be done by updating the information
        # when it changes; anyhow this also double-checks against Nova
instance_ids = []
compute_hosts = self.get_compute_hosts()
opts = {'all_tenants': True}
servers = self.nova.servers.list(detailed=True, search_opts=opts)
for server in servers:
try:
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
if host not in compute_hosts:
continue
project_id = str(server.tenant_id)
instance_name = str(server.name)
instance_id = str(server.id)
details = self.get_instance_details(server)
state = str(server.__dict__.get('OS-EXT-STS:vm_state'))
except Exception:
raise Exception('can not get params from server=%s' % server)
self.update_instance(project_id, instance_id, instance_name, host,
state, details)
instance_ids.append(instance_id)
self.remove_non_existing_instances(instance_ids)
LOG.info(str(self))
def confirm_maintenance(self):
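        """Notify projects of the coming maintenance and wait for replies.

        Fails the session if there is not enough time for projects to
        reply before 'maintenance_at'; projects that do not reply in time
        are re-notified up to two times.
        """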
allowed_actions = []
actions_at = self.session.maintenance_at
state = 'MAINTENANCE'
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nMAINTENANCE to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
reply_at = reply_time_str(self.conf.project_maintenance_reply)
if is_time_after_time(reply_at, actions_at):
LOG.error('%s: No time for project to answer in state: %s'
% (self.session_id, state))
self.session.state = "MAINTENANCE_FAILED"
return False
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'MAINTENANCE_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance failed after retries')
break
else:
LOG.info('confirm_maintenance retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def confirm_scale_in(self):
allowed_actions = []
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
state = 'SCALE_IN'
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nSCALE_IN to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'SCALE_IN_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_scale_in failed after retries')
break
else:
LOG.info('confirm_scale_in retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def need_scale_in(self):
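        """Check whether projects need to scale in.

        Assumes every compute host under maintenance has the same vcpu
        count; no scale-in is needed if the free vcpus across those hosts
        can hold the instances of one full host.
        """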
hvisors = self.nova.hypervisors.list(detailed=True)
prev_vcpus = 0
free_vcpus = 0
prev_hostname = ''
LOG.info('checking hypervisors for VCPU capacity')
for hvisor in hvisors:
hostname = hvisor.__getattr__('hypervisor_hostname')
if hostname not in self.get_compute_hosts():
continue
vcpus = hvisor.__getattr__('vcpus')
vcpus_used = hvisor.__getattr__('vcpus_used')
if prev_vcpus != 0 and prev_vcpus != vcpus:
                raise Exception('%s: %d vcpus on %s does not match '
                                '%d vcpus on %s'
% (self.session_id, vcpus, hostname,
prev_vcpus, prev_hostname))
free_vcpus += vcpus - vcpus_used
prev_vcpus = vcpus
prev_hostname = hostname
if free_vcpus >= vcpus:
            # TBD vcpu capacity might be too scattered, so moving instances
            # from one host to another might still not succeed.
return False
else:
return True
def get_free_vcpus_by_host(self, host, hvisors):
hvisor = ([h for h in hvisors if
h.__getattr__('hypervisor_hostname') == host][0])
vcpus = hvisor.__getattr__('vcpus')
vcpus_used = hvisor.__getattr__('vcpus_used')
return vcpus - vcpus_used
def find_host_to_be_empty(self):
        # Preferably the host with the most free vcpus, no floating ip
        # instances and the fewest instances altogether
host_to_be_empty = None
host_no_fip_instances = 0
host_free_vcpus = 0
hvisors = self.nova.hypervisors.list(detailed=True)
for host in self.get_compute_hosts():
free_vcpus = self.get_free_vcpus_by_host(host, hvisors)
fip_instances = 0
no_fip_instances = 0
for project in self.project_names():
for instance in (self.instances_by_host_and_project(host,
project)):
if instance.details and "floating_ip" in instance.details:
fip_instances += 1
else:
no_fip_instances += 1
            LOG.info('%s has %d floating ip and %d other instances, %d free '
                     'vcpus' % (host, fip_instances, no_fip_instances,
                                free_vcpus))
            if fip_instances == 0:
                # We do not want to choose a host with floating ip instances
                if host_to_be_empty:
                    # We have a host candidate; see if this one is better
                    if free_vcpus > host_free_vcpus:
                        # Choose the host with the most free vcpus
                        host_to_be_empty = host
                        host_no_fip_instances = no_fip_instances
                        host_free_vcpus = free_vcpus
                    elif free_vcpus == host_free_vcpus:
                        if no_fip_instances < host_no_fip_instances:
                            # Equal free vcpus; choose the fewest instances
                            host_to_be_empty = host
                            host_no_fip_instances = no_fip_instances
                            host_free_vcpus = free_vcpus
                else:
                    # This is the first host candidate
                    host_to_be_empty = host
                    host_no_fip_instances = no_fip_instances
                    host_free_vcpus = free_vcpus
        if not host_to_be_empty:
            # No best candidate found; choose the last host in the loop
            host_to_be_empty = host
        LOG.info('host %s selected to be empty' % host_to_be_empty)
        # TBD It might still not be possible to move instances away from this
        # host if the free vcpu capacity on other hosts is scattered. It
        # should be checked that the instances on this host fit on the others
return host_to_be_empty
def confirm_host_to_be_emptied(self, host, state):
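        """Ask projects to move or remove their instances on 'host'.

        Projects can reply with MIGRATE, LIVE_MIGRATE or OWN_ACTION per
        instance; unanswered projects are re-notified up to two times.
        """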
allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']
actions_at = reply_time_str(self.conf.project_maintenance_reply)
reply_at = actions_at
self.set_projects_state_and_hosts_instances(state, [host])
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if not self.project_has_state_instances(project):
continue
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_host_to_be_emptied failed after retries')
break
else:
LOG.info('confirm_host_to_be_emptied retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def confirm_maintenance_complete(self):
state = 'MAINTENANCE_COMPLETE'
metadata = self.session.meta
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
allowed_actions = []
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance_complete failed after '
'retries')
break
else:
LOG.info('confirm_maintenance_complete retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def notify_action_done(self, project, instance):
instance_ids = [instance.instance_id]
allowed_actions = []
actions_at = None
reply_at = None
state = "INSTANCE_ACTION_DONE"
instance.project_state = state
metadata = "{}"
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
def actions_to_have_empty_host(self, host):
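        """Execute the action each project chose for its instances on 'host'.

        Returns True once the host reports no vcpus in use.
        """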
        # TBD these might be done in parallel
for project in self.proj_instance_actions.keys():
instances = (
self.instances_by_host_and_project(host, project))
for instance in instances:
instance.action = (self.instance_action_by_project_reply(
project, instance.instance_id))
LOG.info('Action %s instance %s ' % (instance.action,
instance.instance_id))
if instance.action == 'MIGRATE':
if not self.migrate_server(instance):
return False
self.notify_action_done(project, instance)
elif instance.action == 'OWN_ACTION':
pass
elif instance.action == 'LIVE_MIGRATE':
if not self.live_migrate_server(instance):
return False
self.notify_action_done(project, instance)
else:
raise Exception('%s: instance %s action '
'%s not supported' %
(self.session_id, instance.instance_id,
instance.action))
return self._wait_host_empty(host)
def _wait_host_empty(self, host):
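        """Poll the hypervisor until it reports zero vcpus in use."""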
hid = self.nova.hypervisors.search(host)[0].id
vcpus_used_last = 0
        # wait up to 4 minutes (48 * 5 s) for the host to become empty
        for _ in range(48):
hvisor = self.nova.hypervisors.get(hid)
vcpus_used = hvisor.__getattr__('vcpus_used')
if vcpus_used > 0:
if vcpus_used != vcpus_used_last or vcpus_used_last == 0:
LOG.info('%s still has %d vcpus reserved. wait...'
% (host, vcpus_used))
vcpus_used_last = vcpus_used
time.sleep(5)
else:
LOG.info('%s empty' % host)
return True
LOG.info('%s host still not empty' % host)
return False
def live_migrate_server(self, instance):
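        """Live migrate a server away from its current host.

        Polls the server and its latest migration record until the host
        changes; failed migrations are retried with an increasing back-off
        up to 'live_migration_retries' times.
        """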
server_id = instance.instance_id
server = self.nova.servers.get(server_id)
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
LOG.info('live_migrate_server %s state %s host %s' % (server_id,
instance.state,
orig_host))
orig_vm_state = instance.state
last_vm_status = str(server.__dict__.get('status'))
last_migration_status = "active"
try:
server.live_migrate()
waited = 0
migrate_retries = 0
while waited != self.conf.live_migration_wait_time:
time.sleep(1)
server = self.nova.servers.get(server_id)
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
vm_status = str(server.__dict__.get('status'))
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
instance.host = host
if vm_status != last_vm_status:
LOG.info('instance %s status changed: %s' % (server_id,
vm_status))
if instance.state == 'error':
LOG.error('instance %s live migration failed'
% server_id)
return False
elif orig_vm_state != instance.state:
LOG.info('instance %s state changed: %s' % (server_id,
instance.state))
elif host != orig_host:
LOG.info('instance %s live migrated to host %s' %
(server_id, host))
return True
migration = (
self.nova.migrations.list(instance_uuid=server_id)[0])
if migration.status == 'error':
if migrate_retries == self.conf.live_migration_retries:
LOG.error('instance %s live migration failed after '
'%d retries' %
(server_id,
self.conf.live_migration_retries))
return False
                    # When live migration fails it can fail fast after the
                    # call. Nova needs time to be ready for the next live
                    # migration, so wait before retrying; waiting longer on
                    # each retry gives a better chance of the migration
                    # finally going through.
time.sleep(2 * (migrate_retries + 5))
LOG.info('instance %s live migration failed, retry'
% server_id)
server.live_migrate()
waited = 0
migrate_retries = migrate_retries + 1
elif migration.status != last_migration_status:
LOG.info('instance %s live migration status changed: %s'
% (server_id, migration.status))
waited = waited + 1
last_migration_status = migration.status
last_vm_status = vm_status
            LOG.error('instance %s live migration did not finish in %ss, '
                      'state: %s' % (server_id, waited, instance.state))
            return False
except Exception as e:
LOG.error('server %s live migration failed, Exception=%s' %
(server_id, e))
return False
def migrate_server(self, instance):
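        """Cold migrate a server and confirm the resize.

        Retried on BadRequest, as the scheduler may have a momentarily
        inconsistent view of the instances on the host.
        """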
# TBD this method should be enhanced for errors and to have failed
# instance back to state active instead of error
server_id = instance.instance_id
server = self.nova.servers.get(server_id)
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
LOG.info('migrate_server %s state %s host %s' % (server_id,
instance.state,
orig_host))
last_vm_state = instance.state
retry_migrate = 2
while True:
try:
server.migrate()
time.sleep(5)
retries = 48
while instance.state != 'resized' and retries > 0:
# try to confirm within 4min
server = self.nova.servers.get(server_id)
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
if instance.state == 'resized':
server.confirm_resize()
LOG.info('instance %s migration resized to host %s' %
(server_id, host))
instance.host = host
return True
if last_vm_state != instance.state:
LOG.info('instance %s state changed: %s' % (server_id,
instance.state))
if instance.state == 'error':
LOG.error('instance %s migration failed, state: %s'
% (server_id, instance.state))
instance.host = host
return False
time.sleep(5)
retries = retries - 1
last_vm_state = instance.state
                # Timeout waiting for the state to change
break
except BadRequest:
if retry_migrate == 0:
LOG.error('server %s migrate failed after retries' %
server_id)
return False
                # It might take time for the scheduler to sync its
                # inconsistent instance list for the host
                # TBD Retrying doesn't seem to help; needs investigating if
                # this reproduces
retry_timeout = 150 - (retry_migrate * 60)
LOG.info('server %s migrate failed, retry in %s sec'
% (server_id, retry_timeout))
time.sleep(retry_timeout)
except Exception as e:
LOG.error('server %s migration failed, Exception=%s' %
(server_id, e))
return False
finally:
retry_migrate = retry_migrate - 1
LOG.error('instance %s migration timeout, state: %s' %
(server_id, instance.state))
return False
def maintenance_by_plugin_type(self, hostname, plugin_type):
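        """Run all action plug-ins of 'plugin_type' against 'hostname'.

        A plug-in is imported from fenix.workflow.actions; if not found
        there, it is loaded from the plug-in file downloaded under the
        session's local cache directory.
        """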
aps = self.get_action_plugins_by_type(plugin_type)
session_dir = "%s/%s" % (self.conf.engine.local_cache_dir,
self.session_id)
download_plugin_dir = session_dir + "/actions/"
if aps:
LOG.info("%s: Calling action plug-ins with type %s" %
(self.session_id, plugin_type))
for ap in aps:
ap_name = "fenix.workflow.actions.%s" % ap.plugin
LOG.info("%s: Calling action plug-in module: %s" %
(self.session_id, ap_name))
ap_db_instance = self._create_action_plugin_instance(ap.plugin,
hostname)
try:
action_plugin = getattr(import_module(ap_name),
'ActionPlugin')
ap_instance = action_plugin(self, ap_db_instance)
except ImportError:
download_plugin_file = "%s/%s.py" % (download_plugin_dir,
ap.plugin)
LOG.info("%s: Trying from: %s" % (self.session_id,
download_plugin_file))
if os.path.isfile(download_plugin_file):
ap_instance = (
mod_loader_action_instance(ap_name,
download_plugin_file,
self,
ap_db_instance))
else:
raise Exception('%s: could not find action plugin %s' %
(self.session_id, ap.plugin))
ap_instance.run()
if ap_db_instance.state:
LOG.info('%s: %s finished with %s host %s' %
(self.session_id, ap.plugin,
ap_db_instance.state, hostname))
if 'FAILED' in ap_db_instance.state:
raise Exception('%s: %s finished with %s host %s' %
(self.session_id, ap.plugin,
ap_db_instance.state, hostname))
else:
raise Exception('%s: %s reported no state for host %s' %
(self.session_id, ap.plugin, hostname))
                # Reached only on success; a failed ap_db_instance is kept
                # (an exception was raised above) so its state stays visible
db_api.remove_action_plugin_instance(ap_db_instance)
else:
LOG.info("%s: No action plug-ins with type %s" %
(self.session_id, plugin_type))
def host_maintenance(self, hostname):
host = self.get_host_by_name(hostname)
LOG.info('%s: Maintaining host %s' % (self.session_id, hostname))
for plugin_type in ["host", host.type]:
LOG.info('%s: Execute %s action plugins' % (self.session_id,
plugin_type))
self.maintenance_by_plugin_type(hostname, plugin_type)
LOG.info('%s: Maintaining host %s complete' % (self.session_id,
hostname))
def maintenance(self):
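        """Initial state: confirm maintenance and pick the next state.

        Depending on free capacity the session moves to SCALE_IN,
        PREPARE_MAINTENANCE or START_MAINTENANCE, then waits for the
        maintenance window to start.
        """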
LOG.info("%s: maintenance called" % self.session_id)
self.initialize_server_info()
if not self.projects_listen_alarm('maintenance.scheduled'):
self.session.state = 'MAINTENANCE_FAILED'
return
if not self.confirm_maintenance():
self.session.state = 'MAINTENANCE_FAILED'
return
maintenance_empty_hosts = self.get_empty_computes()
if len(maintenance_empty_hosts) == 0:
if self.need_scale_in():
LOG.info('%s: Need to scale in to get capacity for '
'empty host' % (self.session_id))
self.session.state = 'SCALE_IN'
else:
LOG.info('%s: Free capacity, but need empty host' %
(self.session_id))
self.session.state = 'PREPARE_MAINTENANCE'
else:
LOG.info('Empty host found')
self.session.state = 'START_MAINTENANCE'
if self.session.maintenance_at > datetime.datetime.utcnow():
time_now = time_now_str()
LOG.info('Time now: %s maintenance starts: %s....' %
(time_now, datetime_to_str(self.session.maintenance_at)))
td = self.session.maintenance_at - datetime.datetime.utcnow()
self.start_timer(td.total_seconds(), 'MAINTENANCE_START_TIMEOUT')
while not self.is_timer_expired('MAINTENANCE_START_TIMEOUT'):
time.sleep(1)
time_now = time_now_str()
LOG.info('Time to start maintenance: %s' % time_now)
def scale_in(self):
LOG.info("%s: scale in" % self.session_id)
if not self.confirm_scale_in():
self.session.state = 'MAINTENANCE_FAILED'
return
        # TBD it takes time before the information about free capacity is
        # properly updated. Should make sure removed instances also have
        # their vcpus released
self.update_server_info()
maintenance_empty_hosts = self.get_empty_computes()
if len(maintenance_empty_hosts) == 0:
if self.need_scale_in():
LOG.info('%s: Need to scale in more to get capacity for '
'empty host' % (self.session_id))
self.session.state = 'SCALE_IN'
else:
LOG.info('%s: Free capacity, but need empty host' %
(self.session_id))
self.session.state = 'PREPARE_MAINTENANCE'
else:
LOG.info('Empty host found')
self.session.state = 'START_MAINTENANCE'
def prepare_maintenance(self):
LOG.info("%s: prepare_maintenance called" % self.session_id)
host = self.find_host_to_be_empty()
if not self.confirm_host_to_be_emptied(host, 'PREPARE_MAINTENANCE'):
self.session.state = 'MAINTENANCE_FAILED'
return
if not self.actions_to_have_empty_host(host):
            # TBD we found the hard way that we couldn't make the host empty
            # and need to scale in more. Things might fail after this if any
            # instance is in error, or if the Nova scheduler's cached data is
            # corrupted about which instance is on which host
LOG.info('%s: Failed to empty %s. Need to scale in more to get '
'capacity for empty host' % (self.session_id, host))
self.session.state = 'SCALE_IN'
else:
self.session.state = 'START_MAINTENANCE'
self.update_server_info()
def start_maintenance(self):
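        """Maintain the hosts that are currently empty.

        On the first round nova-compute is disabled on all computes and
        the controllers are maintained first; empty compute hosts are then
        maintained and re-enabled one by one.
        """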
LOG.info("%s: start_maintenance called" % self.session_id)
empty_hosts = self.get_empty_computes()
if not empty_hosts:
LOG.info("%s: No empty host to be maintained" % self.session_id)
self.session.state = 'MAINTENANCE_FAILED'
return
maintained_hosts = self.get_maintained_hosts_by_type('compute')
if not maintained_hosts:
computes = self.get_compute_hosts()
for compute in computes:
                # When we start to maintain the compute hosts, nova-compute
                # is disabled on all of them, so projects cannot get
                # instances scheduled onto hosts not yet maintained
self.disable_host_nova_compute(compute)
for host in self.get_controller_hosts():
LOG.info('IN_MAINTENANCE controller %s' % host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'MAINTENANCE_COMPLETE',
self.session_id)
LOG.info('MAINTENANCE_COMPLETE controller %s' % host)
self.host_maintained(host)
# First we maintain all empty hosts
for host in empty_hosts:
                # TBD we wait for the host vcpus to report correctly, but
                # this is not the correct place; it should be handled after
                # scale in. Also this could be made parallel if there is
                # more than one empty host
self._wait_host_empty(host)
LOG.info('IN_MAINTENANCE compute %s' % host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'MAINTENANCE_COMPLETE',
self.session_id)
self.enable_host_nova_compute(host)
LOG.info('MAINTENANCE_COMPLETE compute %s' % host)
self.host_maintained(host)
else:
            # Now we maintain the hosts that have gone through
            # PLANNED_MAINTENANCE
hosts = [h for h in empty_hosts if h not in maintained_hosts]
for host in hosts:
# TBD this could be made parallel if more than one empty host
self._wait_host_empty(host)
LOG.info('IN_MAINTENANCE host %s' % host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'MAINTENANCE_COMPLETE',
self.session_id)
self.enable_host_nova_compute(host)
LOG.info('MAINTENANCE_COMPLETE host %s' % host)
self.host_maintained(host)
maintained_hosts = self.get_maintained_hosts_by_type('compute')
if len(maintained_hosts) != len(self.get_compute_hosts()):
            # Not all hosts are maintained yet
self.session.state = 'PLANNED_MAINTENANCE'
else:
self.session.state = 'MAINTENANCE_COMPLETE'
def planned_maintenance(self):
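        """Empty the next not yet maintained compute host."""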
LOG.info("%s: planned_maintenance called" % self.session_id)
maintained_hosts = self.get_maintained_hosts_by_type('compute')
compute_hosts = self.get_compute_hosts()
not_maintained_hosts = ([host for host in compute_hosts if host
not in maintained_hosts])
LOG.info("%s: Not maintained hosts: %s" % (self.session_id,
not_maintained_hosts))
host = not_maintained_hosts[0]
if not self.confirm_host_to_be_emptied(host, 'PLANNED_MAINTENANCE'):
self.session.state = 'MAINTENANCE_FAILED'
return
if not self.actions_to_have_empty_host(host):
# Failure in here might indicate action to move instance failed.
# This might be as Nova VCPU capacity was not yet emptied from
# expected target hosts
self.session.state = 'MAINTENANCE_FAILED'
return
self.update_server_info()
self.session.state = 'START_MAINTENANCE'
def maintenance_complete(self):
LOG.info("%s: maintenance_complete called" % self.session_id)
LOG.info('%s: Execute post action plugins' % self.session_id)
self.maintenance_by_plugin_type("localhost", "post")
        LOG.info('Projects may still need to scale back up to full '
                 'capacity')
if not self.confirm_maintenance_complete():
self.session.state = 'MAINTENANCE_FAILED'
return
self.update_server_info()
self.session.state = 'MAINTENANCE_DONE'
def maintenance_done(self):
pass
def maintenance_failed(self):
LOG.info("%s: maintenance_failed called" % self.session_id)
def cleanup(self):
LOG.info("%s: cleanup" % self.session_id)
db_api.remove_session(self.session_id)