diff --git a/magnum/drivers/heat/driver.py b/magnum/drivers/heat/driver.py
index b3fb1e39e6..4c85d5b59a 100755
--- a/magnum/drivers/heat/driver.py
+++ b/magnum/drivers/heat/driver.py
@@ -11,6 +11,7 @@
 # under the License.
 
 import abc
+import collections
 import os
 
 import six
@@ -38,6 +39,10 @@ from magnum.objects import fields
 
 LOG = logging.getLogger(__name__)
 
+NodeGroupStatus = collections.namedtuple('NodeGroupStatus',
+                                         'status reason is_default')
+
+
 @six.add_metaclass(abc.ABCMeta)
 class HeatDriver(driver.Driver):
     """Base Driver class for using Heat
@@ -58,12 +63,14 @@ class HeatDriver(driver.Driver):
             scale_manager=scale_manager)
 
     def _extract_template_definition(self, context, cluster,
-                                     scale_manager=None):
+                                     scale_manager=None,
+                                     nodegroups=None):
         cluster_template = conductor_utils.retrieve_cluster_template(context,
                                                                      cluster)
         definition = self.get_template_definition()
         return definition.extract_definition(context, cluster_template,
                                              cluster,
+                                             nodegroups=nodegroups,
                                              scale_manager=scale_manager)
 
     def _get_env_files(self, template_path, env_rel_paths):
@@ -135,7 +142,12 @@ class HeatDriver(driver.Driver):
         self.pre_delete_cluster(context, cluster)
 
         LOG.info("Starting to delete cluster %s", cluster.uuid)
-        self._delete_stack(context, clients.OpenStackClients(context), cluster)
+        osc = clients.OpenStackClients(context)
+        for ng in cluster.nodegroups:
+            if ng.is_default:
+                continue
+            self._delete_stack(context, osc, ng.stack_id)
+        self._delete_stack(context, osc, cluster.default_ng_master.stack_id)
 
     def resize_cluster(self, context, cluster, resize_manager,
                        node_count, nodes_to_remove, nodegroup=None,
@@ -174,6 +186,8 @@ class HeatDriver(driver.Driver):
             # no cluster_create_timeout value was passed in to the request
             # so falling back on configuration file value
             heat_timeout = cfg.CONF.cluster_heat.create_timeout
+
+        heat_params['is_cluster_stack'] = True
         fields = {
             'stack_name': stack_name,
             'parameters': heat_params,
@@ -222,10 +236,10 @@ class HeatDriver(driver.Driver):
 
         # Find what changed checking the stack params
         # against the ones in the template_def.
-        stack = osc.heat().stacks.get(cluster.stack_id,
+        stack = osc.heat().stacks.get(nodegroup.stack_id,
                                       resolve_outputs=True)
         stack_params = stack.parameters
-        definition.add_nodegroup_params(cluster)
+        definition.add_nodegroup_params(cluster, nodegroups=[nodegroup])
         heat_params = definition.get_stack_diff(context, stack_params, cluster)
         LOG.debug('Updating stack with these params: %s', heat_params)
 
@@ -241,10 +255,10 @@ class HeatDriver(driver.Driver):
         }
 
         osc = clients.OpenStackClients(context)
-        osc.heat().stacks.update(cluster.stack_id, **fields)
+        osc.heat().stacks.update(nodegroup.stack_id, **fields)
 
-    def _delete_stack(self, context, osc, cluster):
-        osc.heat().stacks.delete(cluster.stack_id)
+    def _delete_stack(self, context, osc, stack_id):
+        osc.heat().stacks.delete(stack_id)
 
 
 class HeatPoller(object):
@@ -260,39 +274,79 @@ class HeatPoller(object):
     def poll_and_check(self):
         # TODO(yuanying): temporary implementation to update api_address,
         # node_addresses and cluster status
+        ng_statuses = list()
+        for nodegroup in self.cluster.nodegroups:
+            self.nodegroup = nodegroup
+            status = self.extract_nodegroup_status()
+            ng_statuses.append(status)
+        self.aggregate_nodegroup_statuses(ng_statuses)
+
+    def extract_nodegroup_status(self):
         try:
             # Do not resolve outputs by default. Resolving all
             # node IPs is expensive on heat.
             stack = self.openstack_client.heat().stacks.get(
-                self.cluster.stack_id, resolve_outputs=False)
+                self.nodegroup.stack_id, resolve_outputs=False)
+
+            # poll_and_check is detached and polling long time to check
+            # status, so another user/client can call delete cluster/stack.
+            if stack.stack_status == fields.ClusterStatus.DELETE_COMPLETE:
+                if self.nodegroup.is_default:
+                    if self.nodegroup.role == 'master':
+                        # Delete all cluster related info
+                        self._delete_complete()
+                else:
+                    self.nodegroup.destroy()
+
+            if stack.stack_status in (fields.ClusterStatus.CREATE_COMPLETE,
+                                      fields.ClusterStatus.UPDATE_COMPLETE):
+                # Resolve all outputs if the stack is COMPLETE
+                stack = self.openstack_client.heat().stacks.get(
+                    self.nodegroup.stack_id, resolve_outputs=True)
+
+                self._sync_cluster_and_template_status(stack)
+            elif stack.stack_status != self.nodegroup.status:
+                self.template_def.nodegroup_output_mappings = list()
+                self.template_def.update_outputs(
+                    stack, self.cluster_template, self.cluster,
+                    nodegroups=[self.nodegroup])
+                self._sync_cluster_status(stack)
+
+            if stack.stack_status in (fields.ClusterStatus.CREATE_FAILED,
+                                      fields.ClusterStatus.DELETE_FAILED,
+                                      fields.ClusterStatus.UPDATE_FAILED,
+                                      fields.ClusterStatus.ROLLBACK_COMPLETE,
+                                      fields.ClusterStatus.ROLLBACK_FAILED):
+                self._sync_cluster_and_template_status(stack)
+                self._cluster_failed(stack)
         except heatexc.NotFound:
             self._sync_missing_heat_stack()
-            return
+        return NodeGroupStatus(status=self.nodegroup.status,
+                               is_default=self.nodegroup.is_default,
+                               reason=self.nodegroup.status_reason)
 
-        # poll_and_check is detached and polling long time to check status,
-        # so another user/client can call delete cluster/stack.
-        if stack.stack_status == fields.ClusterStatus.DELETE_COMPLETE:
-            self._delete_complete()
+    def aggregate_nodegroup_statuses(self, ng_statuses):
+        # NOTE(ttsiouts): Aggregate the nodegroup statuses and set the
+        # cluster overall status.
+        FAIL = '_FAILED'
+        IN_PROGRESS = '_IN_PROGRESS'
+        COMPLETE = '_COMPLETE'
 
-        if stack.stack_status in (fields.ClusterStatus.CREATE_COMPLETE,
-                                  fields.ClusterStatus.UPDATE_COMPLETE):
-            # Resolve all outputs if the stack is COMPLETE
-            stack = self.openstack_client.heat().stacks.get(
-                self.cluster.stack_id, resolve_outputs=True)
-
-            self._sync_cluster_and_template_status(stack)
-        elif stack.stack_status != self.cluster.status:
-            self.template_def.update_outputs(stack, self.cluster_template,
-                                             self.cluster)
-            self._sync_cluster_status(stack)
-
-        if stack.stack_status in (fields.ClusterStatus.CREATE_FAILED,
-                                  fields.ClusterStatus.DELETE_FAILED,
-                                  fields.ClusterStatus.UPDATE_FAILED,
-                                  fields.ClusterStatus.ROLLBACK_COMPLETE,
-                                  fields.ClusterStatus.ROLLBACK_FAILED):
-            self._sync_cluster_and_template_status(stack)
-            self._cluster_failed(stack)
+        CREATE = 'CREATE'
+        DELETE = 'DELETE'
+        UPDATE = 'UPDATE'
+        # Keep priority to the states below
+        for state in (IN_PROGRESS, FAIL, COMPLETE):
+            if any(ns.status.endswith(state) for ns in ng_statuses):
+                status = getattr(fields.ClusterStatus, UPDATE+state)
+                self.cluster.status = status
+                for action in (CREATE, DELETE):
+                    if all(ns.status == action+state
+                           for ns in ng_statuses if ns.is_default):
+                        status = getattr(fields.ClusterStatus, action+state)
+                        self.cluster.status = status
+                break
+        self.cluster.save()
 
     def _delete_complete(self):
         LOG.info('Cluster has been deleted, stack_id: %s',
@@ -311,9 +365,9 @@ class HeatPoller(object):
                  self.cluster.uuid)
 
     def _sync_cluster_status(self, stack):
-        self.cluster.status = stack.stack_status
-        self.cluster.status_reason = stack.stack_status_reason
-        self.cluster.save()
+        self.nodegroup.status = stack.stack_status
+        self.nodegroup.status_reason = stack.stack_status_reason
+        self.nodegroup.save()
 
     def get_version_info(self, stack):
         stack_param = self.template_def.get_heat_param(
@@ -330,8 +384,10 @@ class HeatPoller(object):
         self.cluster.container_version = container_version
 
     def _sync_cluster_and_template_status(self, stack):
+        self.template_def.nodegroup_output_mappings = list()
         self.template_def.update_outputs(stack, self.cluster_template,
-                                         self.cluster)
+                                         self.cluster,
+                                         nodegroups=[self.nodegroup])
         self.get_version_info(stack)
         self._sync_cluster_status(stack)
 
@@ -345,19 +401,20 @@ class HeatPoller(object):
 
     def _sync_missing_heat_stack(self):
         if self.cluster.status == fields.ClusterStatus.DELETE_IN_PROGRESS:
-            self._delete_complete()
+            if self.nodegroup.is_default and self.nodegroup.role == 'master':
+                self._delete_complete()
         elif self.cluster.status == fields.ClusterStatus.CREATE_IN_PROGRESS:
             self._sync_missing_stack(fields.ClusterStatus.CREATE_FAILED)
         elif self.cluster.status == fields.ClusterStatus.UPDATE_IN_PROGRESS:
             self._sync_missing_stack(fields.ClusterStatus.UPDATE_FAILED)
 
     def _sync_missing_stack(self, new_status):
-        self.cluster.status = new_status
-        self.cluster.status_reason = _("Stack with id %s not found in "
-                                       "Heat.") % self.cluster.stack_id
-        self.cluster.save()
-        LOG.info("Cluster with id %(id)s has been set to "
+        self.nodegroup.status = new_status
+        self.nodegroup.status_reason = _("Stack with id %s not found in "
+                                         "Heat.") % self.cluster.stack_id
+        self.nodegroup.save()
+        LOG.info("Nodegroup with id %(id)s has been set to "
                  "%(status)s due to stack with id %(sid)s "
                  "not found in Heat.",
-                 {'id': self.cluster.id, 'status': self.cluster.status,
-                  'sid': self.cluster.stack_id})
+                 {'id': self.nodegroup.uuid, 'status': self.nodegroup.status,
+                  'sid': self.nodegroup.stack_id})
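
Note on the HeatPoller change above: poll_and_check now collects one NodeGroupStatus per nodegroup and derives the overall cluster status from them. Below is a minimal, self-contained sketch of that priority logic; the Status class is only a stand-in for magnum.objects.fields.ClusterStatus and exists purely for illustration.

    import collections

    NodeGroupStatus = collections.namedtuple('NodeGroupStatus',
                                             'status reason is_default')

    class Status(object):
        # stand-in for magnum.objects.fields.ClusterStatus
        CREATE_IN_PROGRESS = 'CREATE_IN_PROGRESS'
        CREATE_FAILED = 'CREATE_FAILED'
        CREATE_COMPLETE = 'CREATE_COMPLETE'
        DELETE_IN_PROGRESS = 'DELETE_IN_PROGRESS'
        DELETE_FAILED = 'DELETE_FAILED'
        DELETE_COMPLETE = 'DELETE_COMPLETE'
        UPDATE_IN_PROGRESS = 'UPDATE_IN_PROGRESS'
        UPDATE_FAILED = 'UPDATE_FAILED'
        UPDATE_COMPLETE = 'UPDATE_COMPLETE'

    def aggregate(ng_statuses):
        cluster_status = None
        # IN_PROGRESS outranks FAILED, which outranks COMPLETE
        for state in ('_IN_PROGRESS', '_FAILED', '_COMPLETE'):
            if any(ns.status.endswith(state) for ns in ng_statuses):
                cluster_status = getattr(Status, 'UPDATE' + state)
                # report CREATE_*/DELETE_* only if every default nodegroup agrees
                for action in ('CREATE', 'DELETE'):
                    if all(ns.status == action + state
                           for ns in ng_statuses if ns.is_default):
                        cluster_status = getattr(Status, action + state)
                break
        return cluster_status

    # a default master still creating plus a non-default worker already done
    print(aggregate([NodeGroupStatus('CREATE_IN_PROGRESS', '', True),
                     NodeGroupStatus('CREATE_COMPLETE', '', False)]))
    # -> CREATE_IN_PROGRESS

In short: any in-progress nodegroup keeps the cluster in an *_IN_PROGRESS state, failures come next, and CREATE/DELETE variants are reported only when the default nodegroups agree; otherwise the UPDATE variant is used.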
diff --git a/magnum/drivers/k8s_fedora_atomic_v1/driver.py b/magnum/drivers/k8s_fedora_atomic_v1/driver.py
index d5311a17ca..df63b17647 100644
--- a/magnum/drivers/k8s_fedora_atomic_v1/driver.py
+++ b/magnum/drivers/k8s_fedora_atomic_v1/driver.py
@@ -12,13 +12,20 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+from oslo_config import cfg
 from oslo_log import log as logging
 from pbr.version import SemanticVersion as SV
 
+from string import ascii_letters
+from string import digits
+
+from heatclient.common import template_utils
+
 from magnum.common import clients
 from magnum.common import exception
 from magnum.common import keystone
 from magnum.common import octavia
+from magnum.common import short_id
 from magnum.drivers.common import k8s_monitor
 from magnum.drivers.heat import driver
 from magnum.drivers.k8s_fedora_atomic_v1 import template_def
@@ -109,3 +116,71 @@ class Driver(driver.HeatDriver):
             'disable_rollback': not rollback
         }
         osc.heat().stacks.update(cluster.stack_id, **fields)
+
+    def create_nodegroup(self, context, cluster, nodegroup):
+        osc = clients.OpenStackClients(context)
+        template_path, stack_params, env_files = (
+            self._extract_template_definition(context, cluster,
+                                              nodegroups=[nodegroup]))
+
+        tpl_files, template = template_utils.get_template_contents(
+            template_path)
+
+        environment_files, env_map = self._get_env_files(template_path,
+                                                         env_files)
+        tpl_files.update(env_map)
+
+        # Make sure we end up with a valid hostname
+        valid_chars = set(ascii_letters + digits + '-')
+
+        # valid hostnames are 63 chars long, leaving enough room
+        # to add the random id (for uniqueness)
+        stack_name = "%s-%s" % (cluster.name[:20], nodegroup.name[:9])
+        stack_name = stack_name.replace('_', '-')
+        stack_name = stack_name.replace('.', '-')
+        stack_name = ''.join(filter(valid_chars.__contains__, stack_name))
+
+        # Make sure no duplicate stack name
+        stack_name = '%s-%s' % (stack_name, short_id.generate_id())
+        stack_name = stack_name.lower()
+        if cluster.create_timeout:
+            heat_timeout = cluster.create_timeout
+        else:
+            heat_timeout = cfg.CONF.cluster_heat.create_timeout
+
+        stack_params['is_cluster_stack'] = False
+
+        first_master = cluster.master_addresses[0]
+        network = osc.heat().resources.get(cluster.stack_id, 'network')
+        secgroup = osc.heat().resources.get(cluster.stack_id,
+                                            'secgroup_kube_minion')
+        stack_params['existing_master_private_ip'] = first_master
+        stack_params['existing_security_group'] = secgroup.attributes['id']
+        stack_params['fixed_network'] = network.attributes['fixed_network']
+        stack_params['fixed_subnet'] = network.attributes['fixed_subnet']
+        stack_params['trust_id'] = cluster.trust_id
+        stack_params['trustee_password'] = cluster.trustee_password
+
+        for label in nodegroup.labels:
+            stack_params[label] = nodegroup.labels[label]
+
+        fields = {
+            'stack_name': stack_name,
+            'parameters': stack_params,
+            'environment_files': environment_files,
+            'template': template,
+            'files': tpl_files,
+            'timeout_mins': heat_timeout
+        }
+        created_stack = osc.heat().stacks.create(**fields)
+        nodegroup.stack_id = created_stack['stack']['id']
+        return created_stack
+
+    def delete_nodegroup(self, context, cluster, nodegroup):
+        # Default nodegroups share stack_id so it will be deleted
+        # as soon as the cluster gets destroyed
+        if not nodegroup.stack_id:
+            nodegroup.destroy()
+        else:
+            osc = clients.OpenStackClients(context)
+            self._delete_stack(context, osc, nodegroup.stack_id)
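
Note on create_nodegroup above: the Heat stack name is built from the cluster and nodegroup names, so it has to stay a valid hostname label with room left for a random suffix. A small sketch of that sanitisation follows, with uuid.uuid4() standing in for magnum.common.short_id.generate_id(), which is not reproduced here.

    import uuid
    from string import ascii_letters, digits

    def nodegroup_stack_name(cluster_name, nodegroup_name):
        valid_chars = set(ascii_letters + digits + '-')
        # keep the name short enough to stay a valid hostname after the suffix
        name = "%s-%s" % (cluster_name[:20], nodegroup_name[:9])
        name = name.replace('_', '-').replace('.', '-')
        name = ''.join(filter(valid_chars.__contains__, name))
        # random suffix so two nodegroups never share a stack name
        return ('%s-%s' % (name, uuid.uuid4().hex[:12])).lower()

    print(nodegroup_stack_name('my_prod.cluster', 'gpu_workers'))
    # e.g. my-prod-cluster-gpu-worke-3f2c1a0d9b8e

The driver itself uses short_id for the suffix and then feeds the name, parameters, and timeout into the Heat stack create call as shown in the diff.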
diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml
index 34afa89c4b..3285e32d9d 100644
--- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml
+++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml
@@ -1,12 +1,53 @@
-heat_template_version: 2014-10-16
+heat_template_version: queens
 
 description: >
   This template will boot a Kubernetes cluster with one or more
   minions (as specified by the number_of_minions parameter, which
   defaults to 1).
 
+conditions:
+  master_only:
+    or:
+    - equals:
+      - get_param: role
+      - "master"
+    - equals:
+      - get_param: is_cluster_stack
+      - true
+
+  worker_only:
+    or:
+    - equals:
+      - get_param: role
+      - "worker"
+    - equals:
+      - get_param: is_cluster_stack
+      - true
+
+  create_cluster_resources:
+    equals:
+    - get_param: is_cluster_stack
+    - true
+
 parameters:
+  # needs to become a list if we want to join master nodes?
+  existing_master_private_ip:
+    type: string
+    default: ""
+
+  is_cluster_stack:
+    type: boolean
+    default: false
+
+  role:
+    type: string
+    default: ""
+
+  existing_security_group:
+    type: string
+    default: ""
+
   ssh_key_name:
     type: string
     description: name of ssh key to be provisioned on our server
@@ -33,10 +74,16 @@ parameters:
   master_image:
     type: string
     description: glance image used to boot the server
+    # When creating a new minion nodegroup this will not
+    # be provided by magnum. So make it default to ""
+    default: ""
 
   minion_image:
     type: string
     description: glance image used to boot the server
+    # When creating a new master nodegroup this will not
+    # be provided by magnum. So make it default to ""
+    default: ""
 
   master_flavor:
     type: string
@@ -660,6 +707,7 @@ resources:
   #
 
   network:
+    condition: create_cluster_resources
     type: ../../common/templates/network.yaml
    properties:
       existing_network: {get_param: fixed_network}
@@ -670,6 +718,7 @@ resources:
       private_network_name: private
 
   api_lb:
+    condition: create_cluster_resources
     type: ../../common/templates/lb_api.yaml
     properties:
       fixed_subnet: {get_attr: [network, fixed_subnet]}
@@ -678,6 +727,7 @@ resources:
       port: {get_param: kubernetes_port}
 
   etcd_lb:
+    condition: create_cluster_resources
     type: ../../common/templates/lb_etcd.yaml
     properties:
       fixed_subnet: {get_attr: [network, fixed_subnet]}
@@ -691,6 +741,7 @@ resources:
   #
 
   secgroup_kube_master:
+    condition: create_cluster_resources
     type: OS::Neutron::SecurityGroup
     properties:
       rules:
@@ -727,6 +778,7 @@ resources:
           port_range_max: 8472
 
   secgroup_kube_minion:
+    condition: create_cluster_resources
     type: OS::Neutron::SecurityGroup
     properties:
       rules:
@@ -756,6 +808,7 @@ resources:
 
   # allow any traffic between worker nodes
   secgroup_rule_tcp_kube_minion:
+    condition: create_cluster_resources
     type: OS::Neutron::SecurityGroupRule
     properties:
       protocol: tcp
@@ -764,6 +817,7 @@ resources:
       security_group: {get_resource: secgroup_kube_minion}
       remote_group: {get_resource: secgroup_kube_minion}
   secgroup_rule_udp_kube_minion:
+    condition: create_cluster_resources
     type: OS::Neutron::SecurityGroupRule
     properties:
       protocol: udp
@@ -779,6 +833,7 @@ resources:
   #
 
   api_address_lb_switch:
+    condition: create_cluster_resources
     type: Magnum::ApiGatewaySwitcher
     properties:
       pool_public_ip: {get_attr: [api_lb, floating_address]}
@@ -787,6 +842,7 @@ resources:
       master_private_ip: {get_attr: [kube_masters, resource.0.kube_master_ip]}
 
   etcd_address_lb_switch:
+    condition: create_cluster_resources
     type: Magnum::ApiGatewaySwitcher
     properties:
       pool_private_ip: {get_attr: [etcd_lb, address]}
@@ -799,6 +855,7 @@ resources:
   #
 
   api_address_floating_switch:
+    condition: create_cluster_resources
     type: Magnum::FloatingIPAddressSwitcher
     properties:
       public_ip: {get_attr: [api_address_lb_switch, public_ip]}
@@ -811,11 +868,13 @@ resources:
   #
 
   master_nodes_server_group:
+    condition: master_only
     type: OS::Nova::ServerGroup
     properties:
       policies: [{get_param: nodes_affinity_policy}]
 
   worker_nodes_server_group:
+    condition: worker_only
     type: OS::Nova::ServerGroup
     properties:
       policies: [{get_param: nodes_affinity_policy}]
@@ -827,6 +886,7 @@ resources:
   #
 
   kube_masters:
+    condition: master_only
     type: OS::Heat::ResourceGroup
     depends_on:
       - network
@@ -943,6 +1003,7 @@ resources:
           max_node_count: {get_param: max_node_count}
 
   kube_cluster_config:
+    condition: create_cluster_resources
     type: OS::Heat::SoftwareConfig
     properties:
       group: script
@@ -985,6 +1046,7 @@ resources:
             - get_file: ../../common/templates/kubernetes/fragments/install-helm-modules.sh
 
   kube_cluster_deploy:
+    condition: create_cluster_resources
     type: OS::Heat::SoftwareDeployment
     properties:
       actions: ['CREATE']
@@ -1002,6 +1064,7 @@ resources:
   #
 
   kube_minions:
+    condition: worker_only
     type: OS::Heat::ResourceGroup
     depends_on:
       - network
@@ -1021,12 +1084,32 @@ resources:
           ssh_key_name: {get_param: ssh_key_name}
           server_image: {get_param: minion_image}
           minion_flavor: {get_param: minion_flavor}
-          fixed_network: {get_attr: [network, fixed_network]}
-          fixed_subnet: {get_attr: [network, fixed_subnet]}
+#          fixed_network: {get_param: fixed_network}
+          fixed_network:
+            if:
+              - create_cluster_resources
+              - get_attr: [network, fixed_network]
+              - get_param: fixed_network
+#          fixed_subnet: {get_param: fixed_subnet}
+          fixed_subnet:
+            if:
+              - create_cluster_resources
+              - get_attr: [network, fixed_subnet]
+              - get_param: fixed_subnet
           network_driver: {get_param: network_driver}
           flannel_network_cidr: {get_param: flannel_network_cidr}
-          kube_master_ip: {get_attr: [api_address_lb_switch, private_ip]}
-          etcd_server_ip: {get_attr: [etcd_address_lb_switch, private_ip]}
+          #kube_master_ip: {get_param: existing_master_private_ip}
+          kube_master_ip:
+            if:
+              - create_cluster_resources
+              - get_attr: [api_address_lb_switch, private_ip]
+              - get_param: existing_master_private_ip
+          #etcd_server_ip: {get_param: existing_master_private_ip}
+          etcd_server_ip:
+            if:
+              - create_cluster_resources
+              - get_attr: [etcd_address_lb_switch, private_ip]
+              - get_param: existing_master_private_ip
           external_network: {get_param: external_network}
           kube_allow_priv: {get_param: kube_allow_priv}
           docker_volume_size: {get_param: docker_volume_size}
@@ -1050,7 +1133,12 @@ resources:
           kubernetes_port: {get_param: kubernetes_port}
           tls_disabled: {get_param: tls_disabled}
           verify_ca: {get_param: verify_ca}
-          secgroup_kube_minion_id: {get_resource: secgroup_kube_minion}
+#          secgroup_kube_minion_id: {get_param: existing_security_group}
+          secgroup_kube_minion_id:
+            if:
+              - create_cluster_resources
+              - get_resource: secgroup_kube_minion
+              - get_param: existing_security_group
           http_proxy: {get_param: http_proxy}
           https_proxy: {get_param: https_proxy}
           no_proxy: {get_param: no_proxy}
@@ -1079,6 +1167,7 @@ resources:
 outputs:
 
   api_address:
+    condition: create_cluster_resources
     value:
       str_replace:
         template: api_ip_address
@@ -1089,6 +1178,7 @@ outputs:
       the Kubernetes API.
 
   registry_address:
+    condition: create_cluster_resources
     value:
       str_replace:
         template: localhost:port
@@ -1099,22 +1189,26 @@ outputs:
       images.
 
   kube_masters_private:
+    condition: master_only
     value: {get_attr: [kube_masters, kube_master_ip]}
     description: >
       This is a list of the "private" IP addresses of all the Kubernetes
       masters.
 
   kube_masters:
+    condition: master_only
     value: {get_attr: [kube_masters, kube_master_external_ip]}
     description: >
       This is a list of the "public" IP addresses of all the Kubernetes
       masters. Use these IP addresses to log in to the Kubernetes masters
       via ssh.
 
   kube_minions_private:
+    condition: worker_only
     value: {get_attr: [kube_minions, kube_minion_ip]}
     description: >
       This is a list of the "private" IP addresses of all the Kubernetes
       minions.
 
   kube_minions:
+    condition: worker_only
     value: {get_attr: [kube_minions, kube_minion_external_ip]}
     description: >
       This is a list of the "public" IP addresses of all the Kubernetes
       minions.
diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml
index fb48d6b06f..01acef07a0 100644
--- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml
+++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubemaster.yaml
@@ -1,4 +1,4 @@
-heat_template_version: 2014-10-16
+heat_template_version: queens
 
 description: >
   This is a nested stack that defines a single Kubernetes master, This stack is
diff --git a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml
index 8c86996841..6ac2861fd0 100644
--- a/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml
+++ b/magnum/drivers/k8s_fedora_atomic_v1/templates/kubeminion.yaml
@@ -1,4 +1,4 @@
-heat_template_version: 2014-10-16
+heat_template_version: queens
 
 description: >
   This is a nested stack that defines a single Kubernetes minion, This stack is
diff --git a/magnum/service/periodic.py b/magnum/service/periodic.py
index 5ea21f2b0a..d8a095acca 100755
--- a/magnum/service/periodic.py
+++ b/magnum/service/periodic.py
@@ -164,7 +164,9 @@ class MagnumPeriodicTasks(periodic_task.PeriodicTasks):
                   objects.fields.ClusterStatus.DELETE_IN_PROGRESS,
                   objects.fields.ClusterStatus.ROLLBACK_IN_PROGRESS]
         filters = {'status': status}
-        clusters = objects.Cluster.list(ctx, filters=filters)
+        nodegroups = objects.NodeGroup.list(ctx, filters=filters)
+        cluster_ids = set(ng.cluster_id for ng in nodegroups)
+        clusters = [objects.Cluster.get(ctx, cid) for cid in cluster_ids]
 
         if not clusters:
             return
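
Note on the periodic.py change above: cluster status is now tracked per nodegroup, so the sync task filters NodeGroup objects by the in-progress states and then fetches each parent cluster once. A plain-Python sketch of that de-duplication step follows; FakeNodeGroup is illustrative only and not a Magnum object.

    import collections

    FakeNodeGroup = collections.namedtuple('FakeNodeGroup', 'cluster_id status')

    IN_PROGRESS = ('CREATE_IN_PROGRESS', 'UPDATE_IN_PROGRESS',
                   'DELETE_IN_PROGRESS', 'ROLLBACK_IN_PROGRESS')

    def clusters_to_sync(nodegroups):
        # several nodegroups can belong to one cluster; sync each cluster once
        in_progress = [ng for ng in nodegroups if ng.status in IN_PROGRESS]
        return sorted(set(ng.cluster_id for ng in in_progress))

    ngs = [FakeNodeGroup('c1', 'CREATE_IN_PROGRESS'),
           FakeNodeGroup('c1', 'CREATE_COMPLETE'),
           FakeNodeGroup('c2', 'DELETE_IN_PROGRESS')]
    print(clusters_to_sync(ngs))   # -> ['c1', 'c2']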