fenix/fenix/workflow/workflows/default.py

# Copyright (c) 2018 OpenStack Foundation.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import datetime
from importlib import import_module
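# Loading helper for downloaded action plug-ins: prefer importlib's
# SourceFileLoader and fall back to imp.load_source only on older
# interpreters where the deprecated imp module is the available option.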
try:
from importlib.machinery import SourceFileLoader
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = SourceFileLoader(mname, mpath).load_module()
return mi.ActionPlugin(session_instance, ap_db_instance)
except ImportError:
from imp import load_source
def mod_loader_action_instance(mname, mpath, session_instance,
ap_db_instance):
mi = load_source(mname, mpath)
return mi.ActionPlugin(session_instance, ap_db_instance)
from novaclient import API_MAX_VERSION as nova_max_version
import novaclient.client as novaclient
from novaclient.exceptions import BadRequest
import os
from oslo_log import log as logging
import time
from fenix.db import api as db_api
from fenix.utils.time import datetime_to_str
from fenix.utils.time import is_time_after_time
from fenix.utils.time import reply_time_str
from fenix.utils.time import time_now_str
from fenix.workflow.workflow import BaseWorkflow
LOG = logging.getLogger(__name__)
class Workflow(BaseWorkflow):
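    """Default OpenStack infrastructure maintenance workflow.

    Discovers controller and compute hosts from Nova services, negotiates
    the maintenance window with projects, empties compute hosts by moving
    their instances and runs the configured action plug-ins on each host.
    """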
def __init__(self, conf, session_id, data):
super(Workflow, self).__init__(conf, session_id, data)
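        # Nova microversion 2.53 is used as the baseline: from 2.53 on,
        # services are identified by UUID (stored in host.details below).
        # If both the client and the server support something newer, the
        # client is re-created with the highest mutually supported version.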
nova_version = 2.53
self.nova = novaclient.Client(nova_version, session=self.auth_session)
max_nova_server_ver = float(self.nova.versions.get_current().version)
max_nova_client_ver = float(nova_max_version.get_string())
if max_nova_server_ver > 2.53 and max_nova_client_ver > 2.53:
if max_nova_client_ver <= max_nova_server_ver:
nova_version = max_nova_client_ver
else:
nova_version = max_nova_server_ver
self.nova = novaclient.Client(nova_version,
session=self.auth_session)
if not self.hosts:
self.hosts = self._init_hosts_by_services()
else:
self._init_update_hosts()
LOG.info("%s: initialized. Nova version %f" % (self.session_id,
nova_version))
LOG.info('%s: Execute pre action plugins' % (self.session_id))
self.maintenance_by_plugin_type("localhost", "pre")
def _init_hosts_by_services(self):
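        """Create the initial host list from Nova services.

        nova-conductor services map to 'controller' hosts and nova-compute
        services to 'compute' hosts; a service already disabled before the
        maintenance aborts the session.
        """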
LOG.info("%s: Dicovering hosts by Nova services" % self.session_id)
hosts = []
hostnames = []
controllers = self.nova.services.list(binary='nova-conductor')
for controller in controllers:
host = {}
service_host = str(controller.__dict__.get(u'host'))
if service_host in hostnames:
continue
host['hostname'] = service_host
hostnames.append(service_host)
host['type'] = 'controller'
if str(controller.__dict__.get(u'status')) == 'disabled':
LOG.error("%s: %s nova-conductor disabled before maintenance"
% (self.session_id, service_host))
raise Exception("%s: %s already disabled"
% (self.session_id, service_host))
host['disabled'] = False
host['details'] = str(controller.__dict__.get(u'id'))
host['maintained'] = False
hosts.append(host)
computes = self.nova.services.list(binary='nova-compute')
for compute in computes:
host = {}
service_host = str(compute.__dict__.get(u'host'))
host['hostname'] = service_host
host['type'] = 'compute'
if str(compute.__dict__.get(u'status')) == 'disabled':
LOG.error("%s: %s nova-compute disabled before maintenance"
% (self.session_id, service_host))
raise Exception("%s: %s already disabled"
% (self.session_id, service_host))
host['disabled'] = False
host['details'] = str(compute.__dict__.get(u'id'))
host['maintained'] = False
hosts.append(host)
return db_api.create_hosts_by_details(self.session_id, hosts)
def _init_update_hosts(self):
LOG.info("%s: Update given hosts" % self.session_id)
controllers = self.nova.services.list(binary='nova-conductor')
computes = self.nova.services.list(binary='nova-compute')
for host in self.hosts:
hostname = host.hostname
host.disabled = False
host.maintained = False
match = [compute for compute in computes if
hostname == compute.host]
if match:
host.type = 'compute'
if match[0].status == 'disabled':
LOG.error("%s: %s nova-compute disabled before maintenance"
% (self.session_id, hostname))
raise Exception("%s: %s already disabled"
% (self.session_id, hostname))
host.details = match[0].id
continue
if ([controller for controller in controllers if
hostname == controller.host]):
host.type = 'controller'
continue
host.type = 'other'
def disable_host_nova_compute(self, hostname):
LOG.info('%s: disable nova-compute on host %s' % (self.session_id,
hostname))
host = self.get_host_by_name(hostname)
try:
self.nova.services.disable_log_reason(host.details, "maintenance")
except TypeError:
LOG.debug('%s: Using old API to disable nova-compute on host %s' %
(self.session_id, hostname))
self.nova.services.disable_log_reason(hostname, "nova-compute",
"maintenance")
host.disabled = True
def enable_host_nova_compute(self, hostname):
LOG.info('%s: enable nova-compute on host %s' % (self.session_id,
hostname))
host = self.get_host_by_name(hostname)
try:
self.nova.services.enable(host.details)
except TypeError:
LOG.debug('%s: Using old API to enable nova-compute on host %s' %
(self.session_id, hostname))
self.nova.services.enable(hostname, "nova-compute")
host.disabled = False
def get_compute_hosts(self):
return [host.hostname for host in self.hosts
if host.type == 'compute']
def get_empty_computes(self):
all_computes = self.get_compute_hosts()
instance_computes = []
for instance in self.instances:
if instance.host not in instance_computes:
instance_computes.append(instance.host)
return [host for host in all_computes if host not in instance_computes]
def get_instance_details(self, instance):
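        """Return "floating_ip" if the instance has a floating ip, else None."""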
network_interfaces = next(iter(instance.addresses.values()))
for network_interface in network_interfaces:
_type = network_interface.get('OS-EXT-IPS:type')
if _type == "floating":
LOG.info('Instance with floating ip: %s %s' %
(instance.id, instance.name))
return "floating_ip"
return None
def _fenix_instance(self, project_id, instance_id, instance_name, host,
state, details, action=None, project_state=None,
action_done=False):
instance = {'session_id': self.session_id,
'instance_id': instance_id,
'action': action,
'project_id': project_id,
'project_state': project_state,
'state': state,
'instance_name': instance_name,
'action_done': action_done,
'host': host,
'details': details}
return instance
def initialize_server_info(self):
project_ids = []
instances = []
compute_hosts = self.get_compute_hosts()
opts = {'all_tenants': True}
servers = self.nova.servers.list(detailed=True, search_opts=opts)
for server in servers:
try:
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
if host not in compute_hosts:
continue
project_id = str(server.tenant_id)
instance_name = str(server.name)
instance_id = str(server.id)
details = self.get_instance_details(server)
state = str(server.__dict__.get('OS-EXT-STS:vm_state'))
except Exception:
raise Exception('can not get params from server=%s' % server)
instances.append(self._fenix_instance(project_id, instance_id,
instance_name, host, state,
details))
if project_id not in project_ids:
project_ids.append(project_id)
if len(project_ids):
self.projects = self.init_projects(project_ids)
else:
            LOG.info('%s: No projects on computes under maintenance' %
                     self.session_id)
if len(instances):
self.instances = self.add_instances(instances)
else:
            LOG.info('%s: No instances on computes under maintenance' %
                     self.session_id)
LOG.info(str(self))
def update_instance(self, project_id, instance_id, instance_name, host,
state, details):
if self.instance_id_found(instance_id):
# TBD Might need to update instance variables here if not done
# somewhere else
return
elif self.instance_name_found(instance_name):
            # Project has re-instantiated; remove the old instance, add the new
old_instance = self.instance_by_name(instance_name)
instance = self._fenix_instance(project_id, instance_id,
instance_name, host,
state, details,
old_instance.action,
old_instance.project_state,
old_instance.action_done)
self.instances.append(self.add_instance(instance))
self.remove_instance(old_instance)
else:
# Instance new, as project has added instances
instance = self._fenix_instance(project_id, instance_id,
instance_name, host,
state, details)
self.instances.append(self.add_instance(instance))
def remove_non_existing_instances(self, instance_ids):
remove_instances = [instance for instance in
self.instances if instance.instance_id not in
instance_ids]
for instance in remove_instances:
# Instance deleted, as project possibly scaled down
self.remove_instance(instance)
def update_server_info(self):
        # TBD This keeps the internal instance information up-to-date and
        # prints it out. The same could be done by updating the information
        # when it changes; anyhow this also double-checks against Nova
instance_ids = []
compute_hosts = self.get_compute_hosts()
opts = {'all_tenants': True}
servers = self.nova.servers.list(detailed=True, search_opts=opts)
for server in servers:
try:
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
if host not in compute_hosts:
continue
project_id = str(server.tenant_id)
instance_name = str(server.name)
instance_id = str(server.id)
details = self.get_instance_details(server)
state = str(server.__dict__.get('OS-EXT-STS:vm_state'))
except Exception:
raise Exception('can not get params from server=%s' % server)
self.update_instance(project_id, instance_id, instance_name, host,
state, details)
instance_ids.append(instance_id)
self.remove_non_existing_instances(instance_ids)
LOG.info(str(self))
def confirm_maintenance(self):
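        """Notify projects of the coming maintenance and wait for replies.

        Fails the session if there is not enough time for projects to
        reply before 'maintenance_at'; projects that do not reply in time
        are re-notified up to two times.
        """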
allowed_actions = []
actions_at = self.session.maintenance_at
state = 'MAINTENANCE'
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nMAINTENANCE to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
reply_at = reply_time_str(self.conf.project_maintenance_reply)
if is_time_after_time(reply_at, actions_at):
LOG.error('%s: No time for project to answer in state: %s'
% (self.session_id, state))
self.session.state = "MAINTENANCE_FAILED"
return False
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'MAINTENANCE_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance failed after retries')
break
else:
LOG.info('confirm_maintenance retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def confirm_scale_in(self):
allowed_actions = []
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
state = 'SCALE_IN'
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('\nSCALE_IN to project %s\n' % project)
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'SCALE_IN_TIMEOUT')
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_scale_in failed after retries')
break
else:
LOG.info('confirm_scale_in retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def need_scale_in(self):
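        """Check whether projects need to scale in.

        Assumes every compute host under maintenance has the same vcpu
        count; no scale-in is needed if the free vcpus across those hosts
        can hold the instances of one full host.
        """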
hvisors = self.nova.hypervisors.list(detailed=True)
prev_vcpus = 0
free_vcpus = 0
prev_hostname = ''
LOG.info('checking hypervisors for VCPU capacity')
for hvisor in hvisors:
hostname = hvisor.__getattr__('hypervisor_hostname')
if hostname not in self.get_compute_hosts():
continue
vcpus = hvisor.__getattr__('vcpus')
vcpus_used = hvisor.__getattr__('vcpus_used')
if prev_vcpus != 0 and prev_vcpus != vcpus:
                raise Exception('%s: %d vcpus on %s does not match '
                                '%d vcpus on %s'
% (self.session_id, vcpus, hostname,
prev_vcpus, prev_hostname))
free_vcpus += vcpus - vcpus_used
prev_vcpus = vcpus
prev_hostname = hostname
if free_vcpus >= vcpus:
            # TBD vcpu capacity might be too scattered, so moving instances
            # from one host to another might still not succeed.
return False
else:
return True
def get_free_vcpus_by_host(self, host, hvisors):
hvisor = ([h for h in hvisors if
h.__getattr__('hypervisor_hostname') == host][0])
vcpus = hvisor.__getattr__('vcpus')
vcpus_used = hvisor.__getattr__('vcpus_used')
return vcpus - vcpus_used
def find_host_to_be_empty(self):
        # Preferably the host with the most free vcpus, no floating ip
        # instances and the fewest instances altogether
host_to_be_empty = None
host_no_fip_instances = 0
host_free_vcpus = 0
hvisors = self.nova.hypervisors.list(detailed=True)
for host in self.get_compute_hosts():
free_vcpus = self.get_free_vcpus_by_host(host, hvisors)
fip_instances = 0
no_fip_instances = 0
for project in self.project_names():
for instance in (self.instances_by_host_and_project(host,
project)):
if instance.details and "floating_ip" in instance.details:
fip_instances += 1
else:
no_fip_instances += 1
            LOG.info('%s has %d floating ip and %d other instances, %d free '
                     'vcpus' % (host, fip_instances, no_fip_instances,
                                free_vcpus))
            if fip_instances == 0:
                # We do not want to choose a host with floating ip instances
                if host_to_be_empty:
                    # We have a host candidate; see if this one is better
                    if free_vcpus > host_free_vcpus:
                        # Choose the host with the most free vcpus
                        host_to_be_empty = host
                        host_no_fip_instances = no_fip_instances
                        host_free_vcpus = free_vcpus
                    elif free_vcpus == host_free_vcpus:
                        if no_fip_instances < host_no_fip_instances:
                            # Equal free vcpus; choose the fewest instances
                            host_to_be_empty = host
                            host_no_fip_instances = no_fip_instances
                            host_free_vcpus = free_vcpus
                else:
                    # This is the first host candidate
                    host_to_be_empty = host
                    host_no_fip_instances = no_fip_instances
                    host_free_vcpus = free_vcpus
        if not host_to_be_empty:
            # No best candidate found; choose the last host in the loop
            host_to_be_empty = host
        LOG.info('host %s selected to be empty' % host_to_be_empty)
        # TBD It might still not be possible to move instances away from this
        # host if the free vcpu capacity on other hosts is scattered. It
        # should be checked that the instances on this host fit on the others
return host_to_be_empty
def confirm_host_to_be_emptied(self, host, state):
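        """Ask projects to move or remove their instances on 'host'.

        Projects can reply with MIGRATE, LIVE_MIGRATE or OWN_ACTION per
        instance; unanswered projects are re-notified up to two times.
        """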
allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']
actions_at = reply_time_str(self.conf.project_maintenance_reply)
reply_at = actions_at
self.set_projects_state_and_hosts_instances(state, [host])
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if not self.project_has_state_instances(project):
continue
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
metadata = self.session.meta
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_maintenance_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_host_to_be_emptied failed after retries')
break
else:
LOG.info('confirm_host_to_be_emptied retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def confirm_maintenance_complete(self):
state = 'MAINTENANCE_COMPLETE'
metadata = self.session.meta
actions_at = reply_time_str(self.conf.project_scale_in_reply)
reply_at = actions_at
self.set_projets_state(state)
all_replied = False
project_not_replied = None
retry = 2
while not all_replied:
for project in self.project_names():
if (project_not_replied is not None and project not in
project_not_replied):
continue
LOG.info('%s to project %s' % (state, project))
instance_ids = '%s/v1/maintenance/%s/%s' % (self.url,
self.session_id,
project)
allowed_actions = []
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
self.start_timer(self.conf.project_scale_in_reply,
'%s_TIMEOUT' % state)
all_replied = self.wait_projects_state(state, '%s_TIMEOUT' % state)
if not all_replied:
if retry == 0:
LOG.info('confirm_maintenance_complete failed after '
'retries')
break
else:
LOG.info('confirm_maintenance_complete retry')
projects = self.get_projects_with_state()
project_not_replied = (
self._project_names_in_state(projects, state))
retry -= 1
return all_replied
def notify_action_done(self, project, instance):
instance_ids = [instance.instance_id]
allowed_actions = []
actions_at = None
reply_at = None
state = "INSTANCE_ACTION_DONE"
instance.project_state = state
metadata = "{}"
self._project_notify(project, instance_ids, allowed_actions,
actions_at, reply_at, state, metadata)
def actions_to_have_empty_host(self, host):
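        """Execute the action each project chose for its instances on 'host'.

        Returns True once the host reports no vcpus in use.
        """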
        # TBD these might be done in parallel
for project in self.proj_instance_actions.keys():
instances = (
self.instances_by_host_and_project(host, project))
for instance in instances:
instance.action = (self.instance_action_by_project_reply(
project, instance.instance_id))
LOG.info('Action %s instance %s ' % (instance.action,
instance.instance_id))
if instance.action == 'MIGRATE':
if not self.migrate_server(instance):
return False
self.notify_action_done(project, instance)
elif instance.action == 'OWN_ACTION':
pass
elif instance.action == 'LIVE_MIGRATE':
if not self.live_migrate_server(instance):
return False
self.notify_action_done(project, instance)
else:
raise Exception('%s: instance %s action '
'%s not supported' %
(self.session_id, instance.instance_id,
instance.action))
return self._wait_host_empty(host)
def _wait_host_empty(self, host):
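        """Poll the hypervisor until it reports zero vcpus in use."""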
hid = self.nova.hypervisors.search(host)[0].id
vcpus_used_last = 0
        # wait up to 4 minutes (48 * 5 s) for the host to become empty
        for _ in range(48):
hvisor = self.nova.hypervisors.get(hid)
vcpus_used = hvisor.__getattr__('vcpus_used')
if vcpus_used > 0:
if vcpus_used != vcpus_used_last or vcpus_used_last == 0:
LOG.info('%s still has %d vcpus reserved. wait...'
% (host, vcpus_used))
vcpus_used_last = vcpus_used
time.sleep(5)
else:
LOG.info('%s empty' % host)
return True
LOG.info('%s host still not empty' % host)
return False
def live_migrate_server(self, instance):
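        """Live migrate a server away from its current host.

        Polls the server and its latest migration record until the host
        changes; failed migrations are retried with an increasing back-off
        up to 'live_migration_retries' times.
        """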
server_id = instance.instance_id
server = self.nova.servers.get(server_id)
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
LOG.info('live_migrate_server %s state %s host %s' % (server_id,
instance.state,
orig_host))
orig_vm_state = instance.state
last_vm_status = str(server.__dict__.get('status'))
last_migration_status = "active"
try:
server.live_migrate()
waited = 0
migrate_retries = 0
while waited != self.conf.live_migration_wait_time:
time.sleep(1)
server = self.nova.servers.get(server_id)
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
vm_status = str(server.__dict__.get('status'))
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
instance.host = host
if vm_status != last_vm_status:
LOG.info('instance %s status changed: %s' % (server_id,
vm_status))
if instance.state == 'error':
LOG.error('instance %s live migration failed'
% server_id)
return False
elif orig_vm_state != instance.state:
LOG.info('instance %s state changed: %s' % (server_id,
instance.state))
elif host != orig_host:
LOG.info('instance %s live migrated to host %s' %
(server_id, host))
return True
migration = (
self.nova.migrations.list(instance_uuid=server_id)[0])
if migration.status == 'error':
if migrate_retries == self.conf.live_migration_retries:
LOG.error('instance %s live migration failed after '
'%d retries' %
(server_id,
self.conf.live_migration_retries))
return False
                    # When live migration fails it can fail fast after the
                    # call. Nova needs time to be ready for the next live
                    # migration, so wait before retrying; waiting longer on
                    # each retry gives a better chance of the migration
                    # finally going through.
time.sleep(2 * (migrate_retries + 5))
LOG.info('instance %s live migration failed, retry'
% server_id)
server.live_migrate()
waited = 0
migrate_retries = migrate_retries + 1
elif migration.status != last_migration_status:
LOG.info('instance %s live migration status changed: %s'
% (server_id, migration.status))
waited = waited + 1
last_migration_status = migration.status
last_vm_status = vm_status
            LOG.error('instance %s live migration did not finish in %ss, '
                      'state: %s' % (server_id, waited, instance.state))
            return False
except Exception as e:
LOG.error('server %s live migration failed, Exception=%s' %
(server_id, e))
return False
def migrate_server(self, instance):
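        """Cold migrate a server and confirm the resize.

        Retried on BadRequest, as the scheduler may have a momentarily
        inconsistent view of the instances on the host.
        """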
# TBD this method should be enhanced for errors and to have failed
# instance back to state active instead of error
server_id = instance.instance_id
server = self.nova.servers.get(server_id)
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
orig_host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
LOG.info('migrate_server %s state %s host %s' % (server_id,
instance.state,
orig_host))
last_vm_state = instance.state
retry_migrate = 2
while True:
try:
server.migrate()
time.sleep(5)
retries = 48
while instance.state != 'resized' and retries > 0:
# try to confirm within 4min
server = self.nova.servers.get(server_id)
host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
instance.state = server.__dict__.get('OS-EXT-STS:vm_state')
if instance.state == 'resized':
server.confirm_resize()
LOG.info('instance %s migration resized to host %s' %
(server_id, host))
instance.host = host
return True
if last_vm_state != instance.state:
LOG.info('instance %s state changed: %s' % (server_id,
instance.state))
if instance.state == 'error':
LOG.error('instance %s migration failed, state: %s'
% (server_id, instance.state))
instance.host = host
return False
time.sleep(5)
retries = retries - 1
last_vm_state = instance.state
                # Timeout waiting for the state to change
break
except BadRequest:
if retry_migrate == 0:
LOG.error('server %s migrate failed after retries' %
server_id)
return False
                # It might take time for the scheduler to sync its
                # inconsistent instance list for the host
                # TBD Retrying doesn't seem to help; needs investigating if
                # this reproduces
retry_timeout = 150 - (retry_migrate * 60)
LOG.info('server %s migrate failed, retry in %s sec'
% (server_id, retry_timeout))
time.sleep(retry_timeout)
except Exception as e:
LOG.error('server %s migration failed, Exception=%s' %
(server_id, e))
return False
finally:
retry_migrate = retry_migrate - 1
LOG.error('instance %s migration timeout, state: %s' %
(server_id, instance.state))
return False
def maintenance_by_plugin_type(self, hostname, plugin_type):
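        """Run all action plug-ins of 'plugin_type' against 'hostname'.

        A plug-in is imported from fenix.workflow.actions; if not found
        there, it is loaded from the plug-in file downloaded under the
        session's local cache directory.
        """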
aps = self.get_action_plugins_by_type(plugin_type)
session_dir = "%s/%s" % (self.conf.engine.local_cache_dir,
self.session_id)
download_plugin_dir = session_dir + "/actions/"
if aps:
LOG.info("%s: Calling action plug-ins with type %s" %
(self.session_id, plugin_type))
for ap in aps:
ap_name = "fenix.workflow.actions.%s" % ap.plugin
LOG.info("%s: Calling action plug-in module: %s" %
(self.session_id, ap_name))
ap_db_instance = self._create_action_plugin_instance(ap.plugin,
hostname)
try:
action_plugin = getattr(import_module(ap_name),
'ActionPlugin')
ap_instance = action_plugin(self, ap_db_instance)
except ImportError:
download_plugin_file = "%s/%s.py" % (download_plugin_dir,
ap.plugin)
LOG.info("%s: Trying from: %s" % (self.session_id,
download_plugin_file))
if os.path.isfile(download_plugin_file):
ap_instance = (
mod_loader_action_instance(ap_name,
download_plugin_file,
self,
ap_db_instance))
else:
raise Exception('%s: could not find action plugin %s' %
(self.session_id, ap.plugin))
ap_instance.run()
if ap_db_instance.state:
LOG.info('%s: %s finished with %s host %s' %
(self.session_id, ap.plugin,
ap_db_instance.state, hostname))
if 'FAILED' in ap_db_instance.state:
raise Exception('%s: %s finished with %s host %s' %
(self.session_id, ap.plugin,
ap_db_instance.state, hostname))
else:
raise Exception('%s: %s reported no state for host %s' %
(self.session_id, ap.plugin, hostname))
                # Reached only on success; a failed ap_db_instance is kept
                # (an exception was raised above) so its state stays visible
db_api.remove_action_plugin_instance(ap_db_instance)
else:
LOG.info("%s: No action plug-ins with type %s" %
(self.session_id, plugin_type))
def host_maintenance(self, hostname):
host = self.get_host_by_name(hostname)
LOG.info('%s: Maintaining host %s' % (self.session_id, hostname))
for plugin_type in ["host", host.type]:
LOG.info('%s: Execute %s action plugins' % (self.session_id,
plugin_type))
self.maintenance_by_plugin_type(hostname, plugin_type)
LOG.info('%s: Maintaining host %s complete' % (self.session_id,
hostname))
def maintenance(self):
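        """Initial state: confirm maintenance and pick the next state.

        Depending on free capacity the session moves to SCALE_IN,
        PREPARE_MAINTENANCE or START_MAINTENANCE, then waits for the
        maintenance window to start.
        """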
LOG.info("%s: maintenance called" % self.session_id)
self.initialize_server_info()
if not self.projects_listen_alarm('maintenance.scheduled'):
self.session.state = 'MAINTENANCE_FAILED'
return
if not self.confirm_maintenance():
self.session.state = 'MAINTENANCE_FAILED'
return
maintenance_empty_hosts = self.get_empty_computes()
if len(maintenance_empty_hosts) == 0:
if self.need_scale_in():
LOG.info('%s: Need to scale in to get capacity for '
'empty host' % (self.session_id))
self.session.state = 'SCALE_IN'
else:
LOG.info('%s: Free capacity, but need empty host' %
(self.session_id))
self.session.state = 'PREPARE_MAINTENANCE'
else:
LOG.info('Empty host found')
self.session.state = 'START_MAINTENANCE'
if self.session.maintenance_at > datetime.datetime.utcnow():
time_now = time_now_str()
LOG.info('Time now: %s maintenance starts: %s....' %
(time_now, datetime_to_str(self.session.maintenance_at)))
td = self.session.maintenance_at - datetime.datetime.utcnow()
self.start_timer(td.total_seconds(), 'MAINTENANCE_START_TIMEOUT')
while not self.is_timer_expired('MAINTENANCE_START_TIMEOUT'):
time.sleep(1)
time_now = time_now_str()
LOG.info('Time to start maintenance: %s' % time_now)
def scale_in(self):
LOG.info("%s: scale in" % self.session_id)
if not self.confirm_scale_in():
self.session.state = 'MAINTENANCE_FAILED'
return
        # TBD it takes time before the information about free capacity is
        # properly updated. Should make sure removed instances also have
        # their vcpus released
self.update_server_info()
maintenance_empty_hosts = self.get_empty_computes()
if len(maintenance_empty_hosts) == 0:
if self.need_scale_in():
LOG.info('%s: Need to scale in more to get capacity for '
'empty host' % (self.session_id))
self.session.state = 'SCALE_IN'
else:
LOG.info('%s: Free capacity, but need empty host' %
(self.session_id))
self.session.state = 'PREPARE_MAINTENANCE'
else:
LOG.info('Empty host found')
self.session.state = 'START_MAINTENANCE'
def prepare_maintenance(self):
LOG.info("%s: prepare_maintenance called" % self.session_id)
host = self.find_host_to_be_empty()
if not self.confirm_host_to_be_emptied(host, 'PREPARE_MAINTENANCE'):
self.session.state = 'MAINTENANCE_FAILED'
return
if not self.actions_to_have_empty_host(host):
            # TBD we found the hard way that we couldn't make the host empty
            # and need to scale in more. Things might fail after this if any
            # instance is in error, or if the Nova scheduler's cached data is
            # corrupted about which instance is on which host
LOG.info('%s: Failed to empty %s. Need to scale in more to get '
'capacity for empty host' % (self.session_id, host))
self.session.state = 'SCALE_IN'
else:
self.session.state = 'START_MAINTENANCE'
self.update_server_info()
def start_maintenance(self):
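        """Maintain the hosts that are currently empty.

        On the first round nova-compute is disabled on all computes and
        the controllers are maintained first; empty compute hosts are then
        maintained and re-enabled one by one.
        """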
LOG.info("%s: start_maintenance called" % self.session_id)
empty_hosts = self.get_empty_computes()
if not empty_hosts:
LOG.info("%s: No empty host to be maintained" % self.session_id)
self.session.state = 'MAINTENANCE_FAILED'
return
maintained_hosts = self.get_maintained_hosts_by_type('compute')
if not maintained_hosts:
computes = self.get_compute_hosts()
for compute in computes:
                # When we start to maintain the compute hosts, nova-compute
                # is disabled on all of them, so projects cannot get
                # instances scheduled onto hosts not yet maintained
self.disable_host_nova_compute(compute)
for host in self.get_controller_hosts():
LOG.info('IN_MAINTENANCE controller %s' % host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'MAINTENANCE_COMPLETE',
self.session_id)
LOG.info('MAINTENANCE_COMPLETE controller %s' % host)
self.host_maintained(host)
# First we maintain all empty hosts
for host in empty_hosts:
                # TBD we wait for the host vcpus to report correctly, but
                # this is not the correct place; it should be handled after
                # scale in. Also this could be made parallel if there is
                # more than one empty host
self._wait_host_empty(host)
LOG.info('IN_MAINTENANCE compute %s' % host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'MAINTENANCE_COMPLETE',
self.session_id)
self.enable_host_nova_compute(host)
LOG.info('MAINTENANCE_COMPLETE compute %s' % host)
self.host_maintained(host)
else:
            # Now we maintain the hosts that have gone through
            # PLANNED_MAINTENANCE
hosts = [h for h in empty_hosts if h not in maintained_hosts]
for host in hosts:
# TBD this could be made parallel if more than one empty host
self._wait_host_empty(host)
LOG.info('IN_MAINTENANCE host %s' % host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'IN_MAINTENANCE',
self.session_id)
self.host_maintenance(host)
self._admin_notify(self.conf.service_user.os_project_name,
host,
'MAINTENANCE_COMPLETE',
self.session_id)
self.enable_host_nova_compute(host)
LOG.info('MAINTENANCE_COMPLETE host %s' % host)
self.host_maintained(host)
maintained_hosts = self.get_maintained_hosts_by_type('compute')
if len(maintained_hosts) != len(self.get_compute_hosts()):
            # Not all hosts are maintained yet
self.session.state = 'PLANNED_MAINTENANCE'
else:
self.session.state = 'MAINTENANCE_COMPLETE'
def planned_maintenance(self):
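        """Empty the next not yet maintained compute host."""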
LOG.info("%s: planned_maintenance called" % self.session_id)
maintained_hosts = self.get_maintained_hosts_by_type('compute')
compute_hosts = self.get_compute_hosts()
not_maintained_hosts = ([host for host in compute_hosts if host
not in maintained_hosts])
LOG.info("%s: Not maintained hosts: %s" % (self.session_id,
not_maintained_hosts))
host = not_maintained_hosts[0]
if not self.confirm_host_to_be_emptied(host, 'PLANNED_MAINTENANCE'):
self.session.state = 'MAINTENANCE_FAILED'
return
if not self.actions_to_have_empty_host(host):
# Failure in here might indicate action to move instance failed.
# This might be as Nova VCPU capacity was not yet emptied from
# expected target hosts
self.session.state = 'MAINTENANCE_FAILED'
return
self.update_server_info()
self.session.state = 'START_MAINTENANCE'
def maintenance_complete(self):
LOG.info("%s: maintenance_complete called" % self.session_id)
LOG.info('%s: Execute post action plugins' % self.session_id)
self.maintenance_by_plugin_type("localhost", "post")
        LOG.info('Projects may still need to scale back up to full '
                 'capacity')
if not self.confirm_maintenance_complete():
self.session.state = 'MAINTENANCE_FAILED'
return
self.update_server_info()
self.session.state = 'MAINTENANCE_DONE'
def maintenance_done(self):
pass
def maintenance_failed(self):
LOG.info("%s: maintenance_failed called" % self.session_id)
def cleanup(self):
LOG.info("%s: cleanup" % self.session_id)
db_api.remove_session(self.session_id)