deb-sahara/sahara/service/direct_engine.py
Vitaly Gridnev 9941a8dbf8 Add context manager to assign events
In some cases (for example, when instances are not yet created) an instance
cannot be provided to assign failed or successful events to the current
cluster provisioning step. In such cases, we should use a context manager to
assign events to the cluster.

Change-Id: Id1e3b53e68892fc63afc02642b54256a40b8c19c
Implements: blueprint event-log
2015-01-22 11:22:26 +03:00
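Concretely, in this engine per-instance events are reported with cpo.event_wrapper and
cpo.add_successful_event, while _run_instance, which runs before the instance record
exists, is decorated with cpo.event_wrapper_without_instance and is called from
_start_instance under the context.InstanceInfoManager context manager, for example:

    with context.InstanceInfoManager(current_instance_info):
        instance_id = self._run_instance(
            cluster, node_group, idx,
            aa_group=aa_group, old_aa_groups=old_aa_groups)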


# Copyright (c) 2013 Mirantis Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from novaclient import exceptions as nova_exceptions
from oslo.config import cfg
import six

from sahara import conductor as c
from sahara import context
from sahara import exceptions as exc
from sahara.i18n import _
from sahara.i18n import _LE
from sahara.i18n import _LI
from sahara.i18n import _LW
from sahara.openstack.common import log as logging
from sahara.service import engine as e
from sahara.service import networks
from sahara.service import volumes
from sahara.utils import cluster_progress_ops as cpo
from sahara.utils import general as g
from sahara.utils.openstack import neutron
from sahara.utils.openstack import nova


conductor = c.API

CONF = cfg.CONF

LOG = logging.getLogger(__name__)
SSH_PORT = 22
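
# DirectEngine provisions, scales, rolls back and shuts down clusters by
# calling Nova directly: it creates and deletes servers, auto security groups
# and anti-affinity server groups, and reports progress through the
# provisioning step/event helpers in cluster_progress_ops.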
class DirectEngine(e.Engine):
def get_type_and_version(self):
return "direct.1.0"
def create_cluster(self, cluster):
ctx = context.ctx()
self._update_rollback_strategy(cluster, shutdown=True)
# create all instances
cluster = g.change_cluster_status(cluster, "Spawning")
self._create_instances(cluster)
# wait until all instances are up and the networks are ready
cluster = g.change_cluster_status(cluster, "Waiting")
instances = g.get_instances(cluster)
self._await_active(cluster, instances)
self._assign_floating_ips(instances)
self._await_networks(cluster, instances)
cluster = conductor.cluster_get(ctx, cluster)
# attach volumes
volumes.attach_to_instances(g.get_instances(cluster))
# prepare all instances
cluster = g.change_cluster_status(cluster, "Preparing")
self._configure_instances(cluster)
self._update_rollback_strategy(cluster)
def scale_cluster(self, cluster, node_group_id_map):
ctx = context.ctx()
cluster = g.change_cluster_status(cluster, "Scaling")
instance_ids = self._scale_cluster_instances(cluster,
node_group_id_map)
self._update_rollback_strategy(cluster, instance_ids=instance_ids)
cluster = conductor.cluster_get(ctx, cluster)
g.clean_cluster_from_empty_ng(cluster)
cluster = conductor.cluster_get(ctx, cluster)
instances = g.get_instances(cluster, instance_ids)
self._await_active(cluster, instances)
self._assign_floating_ips(instances)
self._await_networks(cluster, instances)
cluster = conductor.cluster_get(ctx, cluster)
volumes.attach_to_instances(
g.get_instances(cluster, instance_ids))
# we should get here with a valid cluster: if instance creation
# was not successful, all extra instances will be removed above
if instance_ids:
self._configure_instances(cluster)
self._update_rollback_strategy(cluster)
return instance_ids
def rollback_cluster(self, cluster, reason):
rollback_info = cluster.rollback_info or {}
self._update_rollback_strategy(cluster)
if rollback_info.get('shutdown', False):
self._rollback_cluster_creation(cluster, reason)
return False
instance_ids = rollback_info.get('instance_ids', [])
if instance_ids:
self._rollback_cluster_scaling(
cluster, g.get_instances(cluster, instance_ids), reason)
return True
return False
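
# The rollback strategy is persisted on the cluster so that rollback_cluster()
# can tell whether to shut the whole cluster down (creation failed) or to
# remove only the listed instances (scaling failed).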
def _update_rollback_strategy(self, cluster, shutdown=False,
instance_ids=None):
rollback_info = {}
if shutdown:
rollback_info['shutdown'] = shutdown
if instance_ids:
rollback_info['instance_ids'] = instance_ids
cluster = conductor.cluster_update(
context.ctx(), cluster, {'rollback_info': rollback_info})
return cluster
# TODO(alazarev) remove when we fully switch to server groups
def _generate_anti_affinity_groups(self, cluster):
aa_groups = {}
for node_group in cluster.node_groups:
for instance in node_group.instances:
if instance.instance_id:
for process in node_group.node_processes:
if process in cluster.anti_affinity:
aa_group = aa_groups.get(process, [])
aa_group.append(instance.instance_id)
aa_groups[process] = aa_group
return aa_groups
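
# Creates auto security groups, optionally an anti-affinity server group,
# registers the "Run instances" provisioning step and then boots every
# instance of every node group.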
def _create_instances(self, cluster):
ctx = context.ctx()
cluster = self._create_auto_security_groups(cluster)
aa_group = None
if cluster.anti_affinity:
aa_group = self._create_aa_server_group(cluster)
cpo.add_provisioning_step(
cluster.id, _("Run instances"), g.count_instances(cluster))
for node_group in cluster.node_groups:
count = node_group.count
conductor.node_group_update(ctx, node_group, {'count': 0})
for idx in six.moves.xrange(1, count + 1):
self._start_instance(
cluster, node_group, idx, aa_group=aa_group)
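
# Creates a Nova server group with the 'anti-affinity' policy; fails if a
# group with the generated name already exists.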
def _create_aa_server_group(self, cluster):
server_group_name = g.generate_aa_group_name(cluster.name)
client = nova.client().server_groups
if client.findall(name=server_group_name):
raise exc.InvalidDataException(
_("Server group with name %s is already exists")
% server_group_name)
server_group = client.create(name=server_group_name,
policies=['anti-affinity'])
return server_group.id
def _delete_aa_server_group(self, cluster):
if cluster.anti_affinity:
server_group_name = g.generate_aa_group_name(cluster.name)
client = nova.client().server_groups
server_groups = client.findall(name=server_group_name)
if len(server_groups) == 1:
client.delete(server_groups[0].id)
def _find_aa_server_group(self, cluster):
server_group_name = g.generate_aa_group_name(cluster.name)
server_groups = nova.client().server_groups.findall(
name=server_group_name)
if len(server_groups) > 1:
raise exc.IncorrectStateError(
_("Several server groups with name %s found")
% server_group_name)
if len(server_groups) == 1:
return server_groups[0].id
return None
def _create_auto_security_groups(self, cluster):
ctx = context.ctx()
for node_group in cluster.node_groups:
if node_group.auto_security_group:
self._create_auto_security_group(node_group)
return conductor.cluster_get(ctx, cluster)
def _count_instances_to_scale(self, node_groups_to_enlarge,
node_group_id_map, cluster):
total_count = 0
if node_groups_to_enlarge:
for ng in cluster.node_groups:
if ng.id in node_groups_to_enlarge:
count = node_group_id_map[ng.id]
total_count += count - ng.count
return total_count
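
# Wraps _run_instance in the InstanceInfoManager context manager so that
# events emitted while the server is being created are attributed to the
# right cluster, node group and instance name even though the instance record
# does not exist yet.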
def _start_instance(self, cluster, node_group, idx, aa_group,
old_aa_groups=None):
instance_name = g.generate_instance_name(
cluster.name, node_group.name, idx)
current_instance_info = [
cluster.id, None, instance_name, node_group.id]
with context.InstanceInfoManager(current_instance_info):
instance_id = self._run_instance(
cluster, node_group, idx,
aa_group=aa_group, old_aa_groups=old_aa_groups)
return instance_id
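
# Scaling plan: compute which node groups shrink and which grow, delete and
# await the surplus instances (dropping auto security groups of emptied
# groups), then register an "Add instances" step and boot the new instances.
# Returns the ids of the added instances.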
def _scale_cluster_instances(self, cluster, node_group_id_map):
ctx = context.ctx()
aa_group = None
old_aa_groups = None
if cluster.anti_affinity:
aa_group = self._find_aa_server_group(cluster)
if not aa_group:
old_aa_groups = self._generate_anti_affinity_groups(cluster)
instances_to_delete = []
node_groups_to_enlarge = set()
node_groups_to_delete = set()
for node_group in cluster.node_groups:
new_count = node_group_id_map[node_group.id]
if new_count < node_group.count:
instances_to_delete += node_group.instances[new_count:
node_group.count]
if new_count == 0:
node_groups_to_delete.add(node_group.id)
elif new_count > node_group.count:
node_groups_to_enlarge.add(node_group.id)
if node_group.count == 0 and node_group.auto_security_group:
self._create_auto_security_group(node_group)
if instances_to_delete:
cluster = g.change_cluster_status(cluster, "Deleting Instances")
for instance in instances_to_delete:
self._shutdown_instance(instance)
self._await_deleted(cluster, instances_to_delete)
for ng in cluster.node_groups:
if ng.id in node_groups_to_delete:
self._delete_auto_security_group(ng)
cluster = conductor.cluster_get(ctx, cluster)
instances_to_add = []
if node_groups_to_enlarge:
cpo.add_provisioning_step(
cluster.id, _("Add instances"),
self._count_instances_to_scale(
node_groups_to_enlarge, node_group_id_map, cluster))
cluster = g.change_cluster_status(cluster, "Adding Instances")
for ng in cluster.node_groups:
if ng.id in node_groups_to_enlarge:
count = node_group_id_map[ng.id]
for idx in six.moves.xrange(ng.count + 1, count + 1):
instance_id = self._start_instance(
cluster, ng, idx, aa_group, old_aa_groups)
instances_to_add.append(instance_id)
return instances_to_add
def _map_security_groups(self, security_groups):
if not security_groups:
# Nothing to do here
return None
if CONF.use_neutron:
# When using Neutron, ids work fine.
return security_groups
else:
# Nova Network requires that security groups are passed by names.
# security_groups.get method accepts both ID and names, so in case
# IDs are provided they will be converted, otherwise the names will
# just map to themselves.
names = []
for group_id_or_name in security_groups:
group = nova.client().security_groups.get(group_id_or_name)
names.append(group.name)
return names
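
# The "without_instance" wrapper is needed because the instance record does
# not exist yet; with mark_successful_on_exit=True it records a successful
# event on the current provisioning step when _run_instance returns, and a
# failed event if it raises.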
@cpo.event_wrapper_without_instance(mark_successful_on_exit=True)
def _run_instance(self, cluster, node_group, idx, aa_group=None,
old_aa_groups=None):
"""Create instance using nova client and persist them into DB."""
ctx = context.ctx()
name = g.generate_instance_name(cluster.name, node_group.name, idx)
userdata = self._generate_user_data_script(node_group, name)
if old_aa_groups:
# aa_groups: node process -> instance ids
aa_ids = []
for node_process in node_group.node_processes:
aa_ids += old_aa_groups.get(node_process) or []
# create instances only on hosts that run no instances
# with aa-enabled processes
hints = {'different_host': sorted(set(aa_ids))} if aa_ids else None
else:
hints = {'group': aa_group} if (
aa_group and self._need_aa_server_group(node_group)) else None
security_groups = self._map_security_groups(node_group.security_groups)
nova_kwargs = {'scheduler_hints': hints, 'userdata': userdata,
'key_name': cluster.user_keypair_id,
'security_groups': security_groups,
'availability_zone': node_group.availability_zone}
if CONF.use_neutron:
net_id = cluster.neutron_management_network
nova_kwargs['nics'] = [{"net-id": net_id, "v4-fixed-ip": ""}]
nova_instance = nova.client().servers.create(name,
node_group.get_image_id(),
node_group.flavor_id,
**nova_kwargs)
instance_id = conductor.instance_add(ctx, node_group,
{"instance_id": nova_instance.id,
"instance_name": name})
if old_aa_groups:
# save instance id to aa_groups to support aa feature
for node_process in node_group.node_processes:
if node_process in cluster.anti_affinity:
aa_group_ids = old_aa_groups.get(node_process, [])
aa_group_ids.append(nova_instance.id)
old_aa_groups[node_process] = aa_group_ids
return instance_id
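
# Builds the per-node-group auto security group: SSH from anywhere, all
# TCP/UDP/ICMP traffic from the private network CIDRs (Neutron only) and the
# ports requested by the plugin; the new group id is appended to the node
# group's security_groups list.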
def _create_auto_security_group(self, node_group):
name = g.generate_auto_security_group_name(node_group)
nova_client = nova.client()
security_group = nova_client.security_groups.create(
name, "Auto security group created by Sahara for Node Group '%s' "
"of cluster '%s'." %
(node_group.name, node_group.cluster.name))
# the ssh remote needs the ssh port open; agents are not implemented yet
nova_client.security_group_rules.create(
security_group.id, 'tcp', SSH_PORT, SSH_PORT, "0.0.0.0/0")
# open all traffic for private networks
if CONF.use_neutron:
for cidr in neutron.get_private_network_cidrs(node_group.cluster):
for protocol in ['tcp', 'udp']:
nova_client.security_group_rules.create(
security_group.id, protocol, 1, 65535, cidr)
nova_client.security_group_rules.create(
security_group.id, 'icmp', -1, -1, cidr)
# enable ports returned by plugin
for port in node_group.open_ports:
nova_client.security_group_rules.create(
security_group.id, 'tcp', port, port, "0.0.0.0/0")
security_groups = list(node_group.security_groups or [])
security_groups.append(security_group.id)
conductor.node_group_update(context.ctx(), node_group,
{"security_groups": security_groups})
return security_groups
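
# A server group scheduler hint is only needed if the node group runs at
# least one process listed in the cluster's anti_affinity setting.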
def _need_aa_server_group(self, node_group):
for node_process in node_group.node_processes:
if node_process in node_group.cluster.anti_affinity:
return True
return False
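
# Assigns a floating IP to every instance whose node group has a floating IP
# pool configured.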
def _assign_floating_ips(self, instances):
for instance in instances:
node_group = instance.node_group
if node_group.floating_ip_pool:
networks.assign_floating_ip(instance.instance_id,
node_group.floating_ip_pool)
def _await_active(self, cluster, instances):
"""Await all instances are in Active status and available."""
if not instances:
return
cpo.add_provisioning_step(
cluster.id, _("Wait for instances to become active"),
len(instances))
active_ids = set()
while len(active_ids) != len(instances):
if not g.check_cluster_exists(cluster):
return
for instance in instances:
if instance.id not in active_ids:
if self._check_if_active(instance):
active_ids.add(instance.id)
cpo.add_successful_event(instance)
context.sleep(1)
LOG.info(_LI("Cluster '%s': all instances are active"), cluster.id)
def _await_deleted(self, cluster, instances):
"""Await all instances are deleted."""
if not instances:
return
cpo.add_provisioning_step(
cluster.id, _("Wait for instances to be deleted"), len(instances))
deleted_ids = set()
while len(deleted_ids) != len(instances):
if not g.check_cluster_exists(cluster):
return
for instance in instances:
if instance.id not in deleted_ids:
if self._check_if_deleted(instance):
LOG.debug("Instance '%s' is deleted" %
instance.instance_name)
deleted_ids.add(instance.id)
cpo.add_successful_event(instance)
context.sleep(1)
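
# mark_successful_on_exit=False: the success events for these checks are added
# explicitly by the polling loops above once an instance becomes active or
# disappears; the wrapper is only there to record failures.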
@cpo.event_wrapper(mark_successful_on_exit=False)
def _check_if_active(self, instance):
server = nova.get_instance_info(instance)
if server.status == 'ERROR':
raise exc.SystemError(_("Node %s has error status") % server.name)
return server.status == 'ACTIVE'
@cpo.event_wrapper(mark_successful_on_exit=False)
def _check_if_deleted(self, instance):
try:
nova.get_instance_info(instance)
except nova_exceptions.NotFound:
return True
return False
def _rollback_cluster_creation(self, cluster, ex):
"""Shutdown all instances and update cluster status."""
LOG.info(_LI("Cluster '%(name)s' creation rollback "
"(reason: %(reason)s)"),
{'name': cluster.name, 'reason': ex})
self.shutdown_cluster(cluster)
def _rollback_cluster_scaling(self, cluster, instances, ex):
"""Attempt to rollback cluster scaling."""
LOG.info(_LI("Cluster '%(name)s' scaling rollback "
"(reason: %(reason)s)"),
{'name': cluster.name, 'reason': ex})
for i in instances:
self._shutdown_instance(i)
cluster = conductor.cluster_get(context.ctx(), cluster)
g.clean_cluster_from_empty_ng(cluster)
def _shutdown_instances(self, cluster):
for node_group in cluster.node_groups:
for instance in node_group.instances:
self._shutdown_instance(instance)
self._await_deleted(cluster, node_group.instances)
self._delete_auto_security_group(node_group)
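
# The auto security group is always appended last to the node group's
# security_groups (see _create_auto_security_group), hence security_groups[-1]
# below; the name check guards against deleting a user-provided group.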
def _delete_auto_security_group(self, node_group):
if not node_group.auto_security_group:
return
if not node_group.security_groups:
# node group has no security groups
# nothing to delete
return
name = node_group.security_groups[-1]
try:
client = nova.client().security_groups
security_group = client.get(name)
if (security_group.name !=
g.generate_auto_security_group_name(node_group)):
LOG.warn(_LW("Auto security group for node group %s is not "
"found"), node_group.name)
return
client.delete(name)
except Exception:
LOG.exception(_LE("Failed to delete security group %s"), name)
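
# Releases the floating IP, detaches volumes and deletes the Nova server,
# tolerating resources that are already gone, then removes the instance from
# the DB.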
def _shutdown_instance(self, instance):
ctx = context.ctx()
if instance.node_group.floating_ip_pool:
try:
networks.delete_floating_ip(instance.instance_id)
except nova_exceptions.NotFound:
LOG.warn(_LW("Attempted to delete non-existent floating IP in "
"pool %(pool)s from instance %(instance)s"),
{'pool': instance.node_group.floating_ip_pool,
'instance': instance.instance_id})
try:
volumes.detach_from_instance(instance)
except Exception:
LOG.warn(_LW("Detaching volumes from instance %s failed"),
instance.instance_id)
try:
nova.client().servers.delete(instance.instance_id)
except nova_exceptions.NotFound:
LOG.warn(_LW("Attempted to delete non-existent instance %s"),
instance.instance_id)
conductor.instance_remove(ctx, instance)
def shutdown_cluster(self, cluster):
"""Shutdown specified cluster and all related resources."""
self._shutdown_instances(cluster)
self._clean_job_executions(cluster)
self._delete_aa_server_group(cluster)