# Copyright 2016 Huawei Technologies Co.,LTD.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

from oslo_log import log
import oslo_messaging as messaging
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import timeutils

from mogan.common import exception
from mogan.common import flow_utils
from mogan.common.i18n import _
from mogan.common import states
from mogan.common import utils
from mogan.conf import CONF
from mogan.engine import base_manager
from mogan.engine.flows import create_instance
from mogan.notifications import base as notifications
from mogan import objects
from mogan.objects import fields

LOG = log.getLogger(__name__)


class EngineManager(base_manager.BaseEngineManager):
    """Mogan Engine manager main class."""

    RPC_API_VERSION = '1.0'

    target = messaging.Target(version=RPC_API_VERSION)

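    # NOTE: the public methods below are invoked over RPC through the
    # messaging.Target declared above; RPC_API_VERSION tracks the remote
    # interface version exposed to callers.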
    def _get_compute_port(self, context, port_uuid):
        """Get a compute port by its UUID."""
        try:
            return objects.ComputePort.get(context, port_uuid)
        except exception.NotFound:
            LOG.warning("No compute port record for %(port)s",
                        {'port': port_uuid})

    def _get_compute_node(self, context, node_uuid):
        """Get a compute node by its UUID."""
        try:
            return objects.ComputeNode.get(context, node_uuid)
        except exception.NotFound:
            LOG.warning("No compute node record for %(node)s",
                        {'node': node_uuid})

    def _init_compute_port(self, context, port):
        """Initialize the compute port if it does not already exist.

        :param context: security context
        :param port: initial values
        """

        # Try to fetch the compute port record from the database; if one
        # exists, refresh it with the values reported by the driver.
        cp = self._get_compute_port(context, port['port_uuid'])
        if cp:
            cp.update_from_driver(port)
            cp.save()
            return

        # There was no compute port in the database, so create a new one
        # initialized with the driver-reported values.
        cp = objects.ComputePort(context)
        cp.update_from_driver(port)
        cp.create()

    def _init_compute_node(self, context, node):
        """Initialize the compute node if it does not already exist.

        :param context: security context
        :param node: initial values
        """

        # Try to fetch the compute node record from the database; if one
        # exists, refresh it with the values reported by the driver.
        cn = self._get_compute_node(context, node['node_uuid'])
        if cn:
            cn.update_from_driver(node)
            cn.save()
        else:
            # There was no compute node in the database, so create a new
            # one initialized with the driver-reported node values.
            cn = objects.ComputeNode(context)
            cn.update_from_driver(node)
            cn.create()

        # Record compute ports to db
        for port in node['ports']:
            # Initialize the compute port object, creating it
            # if it does not already exist.
            self._init_compute_port(context, port)

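    # Periodic task: the interval comes from
    # CONF.engine.update_resources_interval, and run_immediately=True makes
    # the first run happen when the periodic worker starts rather than after
    # a full interval has elapsed.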
    @periodic_task.periodic_task(
        spacing=CONF.engine.update_resources_interval,
        run_immediately=True)
    def _update_available_resources(self, context):
        """See driver.get_available_resource()

        Periodic process that keeps the engine's understanding of
        resource availability in sync with the underlying hypervisor.

        :param context: security context
        """
        nodes = self.driver.get_available_resources()
        compute_nodes_in_db = objects.ComputeNodeList.get_all(context)

        # Record compute nodes to db
        for uuid, node in nodes.items():
            # Initialize the compute node object, creating it
            # if it does not already exist.
            self._init_compute_node(context, node)

        # Delete orphan compute nodes which are no longer reported by the
        # driver but are still present in the db.
        for cn in compute_nodes_in_db:
            if cn.node_uuid not in nodes:
                LOG.info("Deleting orphan compute node %(id)s",
                         {'id': cn.node_uuid})
                cn.destroy()

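    # Power-state sync strategy: fetch all node power states from the driver
    # in a single call, then spawn one greenthread per out-of-sync instance
    # on self._sync_power_pool, using self._syncs_in_progress to avoid
    # queueing duplicate syncs for the same instance.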
    @periodic_task.periodic_task(spacing=CONF.engine.sync_power_state_interval,
                                 run_immediately=True)
    def _sync_power_states(self, context):
        """Align power states between the database and the hypervisor."""

        # Only fetch the necessary fields, and skip synchronizing nodes
        # whose target_power_state is not None.
        try:
            nodes = self.driver.get_nodes_power_state()
        except Exception as e:
            LOG.warning(
                "Failed to retrieve node list when synchronizing power "
                "states: %(msg)s", {"msg": e})
            # Just return if we fail to get the nodes' real power state.
            return

        node_dict = {node.instance_uuid: node for node in nodes
                     if node.target_power_state is None}

        if not node_dict:
            LOG.warning("While synchronizing instance power states, "
                        "found no instance with a stable power state "
                        "on the hypervisor.")
            return

        def _sync(db_instance, node_power_state):
            # This must be synchronized as we query state from two separate
            # sources, the driver (ironic) and the database. They are set
            # (in stop_instance) and read, in sync.
            @utils.synchronized(db_instance.uuid)
            def sync_instance_power_state():
                self._sync_instance_power_state(context, db_instance,
                                                node_power_state)

            try:
                sync_instance_power_state()
            except Exception:
                LOG.exception("Periodic sync_power_state task had an "
                              "error while processing an instance.",
                              instance=db_instance)

            self._syncs_in_progress.pop(db_instance.uuid)

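        # Walk every instance in the DB and spawn an async sync for any
        # whose cached power_state disagrees with what the node reports.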
        db_instances = objects.Instance.list(context)
        for db_instance in db_instances:
            # Process syncs asynchronously - we don't want instance locking
            # to block the entire periodic task thread.
            uuid = db_instance.uuid
            if uuid in self._syncs_in_progress:
                LOG.debug('Sync power state already in progress for %s', uuid)
                continue

            if db_instance.status not in (states.ACTIVE, states.STOPPED):
                if db_instance.status in states.UNSTABLE_STATES:
                    LOG.info("During sync_power_state the instance has a "
                             "pending task (%(task)s). Skip.",
                             {'task': db_instance.status},
                             instance=db_instance)
                    continue

            if uuid not in node_dict:
                continue

            node_power_state = node_dict[uuid].power_state
            if db_instance.power_state != node_power_state:
                LOG.debug('Triggering sync for uuid %s', uuid)
                self._syncs_in_progress[uuid] = True
                self._sync_power_pool.spawn_n(_sync, db_instance,
                                              node_power_state)

    def _sync_instance_power_state(self, context, db_instance,
                                   node_power_state):
        """Align instance power state between the database and hypervisor.

        If the instance is not found on the hypervisor, but is in the
        database, then a stop() API will be called on the instance.
        """

        # We re-query the DB to get the latest instance info to minimize
        # (not eliminate) race conditions.
        db_instance.refresh()
        db_power_state = db_instance.power_state

        if db_instance.status not in (states.ACTIVE, states.STOPPED):
            # On the receiving end of mogan-engine, it can happen that the
            # DB instance already reports its new resident node while the
            # actual BM node has not shown up on the hypervisor yet. In this
            # case, let the loop continue and run the state sync in a later
            # round.
            LOG.info("During sync_power_state the instance has a "
                     "pending task (%(task)s). Skip.",
                     {'task': db_instance.task_state},
                     instance=db_instance)
            return

        if node_power_state != db_power_state:
            LOG.info('During _sync_instance_power_state the DB '
                     'power_state (%(db_power_state)s) does not match '
                     'the node_power_state from the hypervisor '
                     '(%(node_power_state)s). Updating power_state in the '
                     'DB to match the hypervisor.',
                     {'db_power_state': db_power_state,
                      'node_power_state': node_power_state},
                     instance=db_instance)
            # power_state is always updated from hypervisor to db
            db_instance.power_state = node_power_state
            db_instance.save()

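    # Maintenance sync: when a node is put into maintenance on the
    # hypervisor, the corresponding instance is moved to MAINTENANCE status,
    # and moved back to ACTIVE once maintenance is cleared; the power-state
    # sync above corrects the power state afterwards if needed.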
    @periodic_task.periodic_task(spacing=CONF.engine.sync_maintenance_interval,
                                 run_immediately=True)
    def _sync_maintenance_states(self, context):
        """Align maintenance states between the database and hypervisor."""

        try:
            nodes = self.driver.get_maintenance_node_list()
        except Exception as e:
            LOG.warning(
                "Failed to retrieve node list when synchronizing "
                "maintenance states: %(msg)s", {"msg": e})
            # Just return if we fail to get the nodes' maintenance state.
            return

        node_dict = {node.instance_uuid: node for node in nodes}

        if not node_dict:
            LOG.warning("While synchronizing instance maintenance states, "
                        "found no node with an instance associated on the "
                        "hypervisor.")
            return

        db_instances = objects.Instance.list(context)
        for instance in db_instances:
            uuid = instance.uuid

            # If the instance is in an unstable state and the node goes to
            # maintenance, just skip the syncing process as the pending task
            # should go to the error state instead.
            if instance.status in states.UNSTABLE_STATES:
                LOG.info("During sync_maintenance_state the instance "
                         "has a pending task (%(task)s). Skip.",
                         {'task': instance.status},
                         instance=instance)
                continue

            if uuid not in node_dict:
                continue

            node_maintenance = node_dict[uuid].maintenance

            if instance.status == states.MAINTENANCE and not node_maintenance:
                # TODO(zhenguo): need to check whether we need a state
                # machine transition here; currently we just move to the
                # ACTIVE state regardless of the real power state, which may
                # need the sync power state periodic task to correct it.
                instance.status = states.ACTIVE
                instance.save()
            elif node_maintenance and instance.status != states.MAINTENANCE:
                instance.status = states.MAINTENANCE
                instance.save()

    def destroy_networks(self, context, instance):
        ports = instance.nics.get_port_ids()
        for port in ports:
            self.network_api.delete_port(context, port, instance.uuid)

    def _unplug_vifs(self, context, instance):
        LOG.debug("unplug: instance_uuid=%(uuid)s vif=%(instance_nics)s",
                  {'uuid': instance.uuid,
                   'instance_nics': str(instance.nics)})

        bm_interface = self.driver.get_ports_from_node(instance.node_uuid)

        for pif in bm_interface:
            self.driver.unplug_vif(pif)

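    # Instance creation path: emit a start notification, build the
    # provisioning state machine, ask the scheduler for a node, then hand
    # the actual provisioning to the taskflow-based create_instance flow.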
    def create_instance(self, context, instance, requested_networks,
                        request_spec=None, filter_properties=None):
        """Perform a deployment."""
        LOG.debug("Starting instance...", instance=instance)
        notifications.notify_about_instance_action(
            context, instance, self.host,
            action=fields.NotificationAction.CREATE,
            phase=fields.NotificationPhase.START)

        fsm = utils.get_state_machine(start_state=instance.status,
                                      target_state=states.ACTIVE)

        if filter_properties is None:
            filter_properties = {}

        try:
            node = self.scheduler_rpcapi.select_destinations(
                context, request_spec, filter_properties)
            instance.node_uuid = node['node_uuid']
            instance.save()
        except Exception as e:
            utils.process_event(fsm, instance, event='error')
            LOG.error("Creating instance %(uuid)s failed. "
                      "Exception: %(exception)s",
                      {"uuid": instance.uuid,
                       "exception": e})
            return

        try:
            flow_engine = create_instance.get_flow(
                context,
                self,
                instance,
                requested_networks,
                request_spec,
                filter_properties,
            )
        except Exception:
            utils.process_event(fsm, instance, event='error')
            msg = _("Create manager instance flow failed.")
            LOG.exception(msg)
            raise exception.MoganException(msg)

        def _run_flow():
            # This executes the create instance flow. If something goes
            # wrong, the flow reverts all the work that was done and
            # re-raises the exception. Otherwise, all data generated by the
            # flow becomes available in the flow engine's storage.
            with flow_utils.DynamicLogListener(flow_engine, logger=LOG):
                flow_engine.run()

        try:
            _run_flow()
        except Exception as e:
            instance.power_state = states.NOSTATE
            utils.process_event(fsm, instance, event='error')
            LOG.error("Creating instance %(uuid)s failed. "
                      "Exception: %(exception)s",
                      {"uuid": instance.uuid,
                       "exception": e})
        else:
            # Advance the state model for the given event. Note that this
            # doesn't alter the instance in any way. This may raise
            # InvalidState, if this event is not allowed in the current state.
            instance.power_state = self.driver.get_power_state(context,
                                                               instance.uuid)
            instance.launched_at = timeutils.utcnow()
            utils.process_event(fsm, instance, event='done')
            LOG.info("Created instance %s successfully.", instance.uuid)
        finally:
            return instance

    def _delete_instance(self, context, instance):
        """Delete an instance

        :param context: mogan request context
        :param instance: instance object
        """
        # TODO(zhenguo): Add delete notification

        self.driver.destroy(context, instance)

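    # Deletion tears down the backing node first (_delete_instance), then
    # unplugs the VIFs. InstanceNotFound is treated as already gone; any
    # other failure flips the instance to ERROR before re-raising.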
    def delete_instance(self, context, instance):
        """Delete an instance."""
        LOG.debug("Deleting instance...")

        fsm = utils.get_state_machine(start_state=instance.status,
                                      target_state=states.DELETED)

        @utils.synchronized(instance.uuid)
        def do_delete_instance(instance):
            try:
                self._delete_instance(context, instance)
                self._unplug_vifs(context, instance)
            except exception.InstanceNotFound:
                LOG.info("Instance disappeared during terminate",
                         instance=instance)
            except Exception:
                # As we're trying to delete always go to Error if something
                # goes wrong that _delete_instance can't handle.
                with excutils.save_and_reraise_exception():
                    LOG.exception('Setting instance status to ERROR',
                                  instance=instance)
                    instance.power_state = states.NOSTATE
                    utils.process_event(fsm, instance, event='error')

        do_delete_instance(instance)

        instance.power_state = states.NOSTATE
        utils.process_event(fsm, instance, event='done')
        instance.destroy()

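    # The actual power call below is serialized per instance uuid via
    # utils.synchronized, matching the lock taken in delete_instance, so a
    # power action and a delete on the same instance cannot interleave.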
    def set_power_state(self, context, instance, state):
        """Set power state for the specified instance."""

        fsm = utils.get_state_machine(start_state=instance.status)

        @utils.synchronized(instance.uuid)
        def do_set_power_state():
            LOG.debug('Power %(state)s called for instance %(instance)s',
                      {'state': state,
                       'instance': instance})
            self.driver.set_power_state(context, instance, state)

        do_set_power_state()
        instance.power_state = self.driver.get_power_state(context,
                                                           instance.uuid)
        utils.process_event(fsm, instance, event='done')
        LOG.info('Successfully set node power state: %s',
                 state, instance=instance)

    def _rebuild_instance(self, context, instance):
        """Perform rebuild action on the specified instance."""

        # TODO(zhenguo): Add delete notification

        self.driver.rebuild(context, instance)

    def rebuild_instance(self, context, instance):
        """Destroy and re-make this instance.

        :param context: mogan request context
        :param instance: instance object
        """

        LOG.debug('Rebuilding instance', instance=instance)

        fsm = utils.get_state_machine(start_state=instance.status)

        try:
            self._rebuild_instance(context, instance)
        except Exception as e:
            utils.process_event(fsm, instance, event='error')
            LOG.error("Rebuilding instance %(uuid)s failed. "
                      "Exception: %(exception)s",
                      {"uuid": instance.uuid,
                       "exception": e})
            return

        utils.process_event(fsm, instance, event='done')
        LOG.info('Instance was successfully rebuilt', instance=instance)

    def list_availability_zones(self, context):
        """Get availability zone list."""
        compute_nodes = objects.ComputeNodeList.get_all_available(context)

        azs = set()
        for node in compute_nodes:
            az = node.availability_zone \
                or CONF.engine.default_availability_zone
            if az is not None:
                azs.add(az)

        return {'availability_zones': list(azs)}