Browse Source

ng-9: Driver for nodegroup operations

This adds support for creating and deleting worker nodegroups
using a different stack per nodegroup. In order to be backwards
compatible, default nodegroups will remain in one stack.

With this in mind, the cluster status is now calculated by
aggregating the statuses of the underlying stacks.

Change-Id: I97839ab8495ed5d860785dff1f6e3cc59b6a9ff7
changes/90/667090/18
Theodoros Tsioutsias 3 years ago
parent
commit
e52f77b299
  1. 257
      magnum/drivers/heat/driver.py
  2. 5
      magnum/drivers/heat/k8s_coreos_template_def.py
  3. 4
      magnum/drivers/heat/k8s_fedora_template_def.py
  4. 6
      magnum/drivers/heat/k8s_template_def.py
  5. 4
      magnum/drivers/heat/swarm_fedora_template_def.py
  6. 4
      magnum/drivers/heat/swarm_mode_template_def.py
  7. 11
      magnum/drivers/heat/template_def.py
  8. 16
      magnum/drivers/k8s_fedora_atomic_v1/driver.py
  9. 101
      magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml
  10. 4
      magnum/drivers/mesos_ubuntu_v1/template_def.py
  11. 10
      magnum/tests/unit/conductor/handlers/test_cluster_conductor.py
  12. 6
      magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py
  13. 654
      magnum/tests/unit/drivers/test_heat_driver.py

257
magnum/drivers/heat/driver.py

@ -11,6 +11,7 @@
# under the License.
import abc
import collections
import os
import six
@ -41,6 +42,10 @@ from magnum.objects import fields
LOG = logging.getLogger(__name__)
NodeGroupStatus = collections.namedtuple('NodeGroupStatus',
'name status reason is_default')
@six.add_metaclass(abc.ABCMeta)
class HeatDriver(driver.Driver):
"""Base Driver class for using Heat
@ -61,12 +66,14 @@ class HeatDriver(driver.Driver):
scale_manager=scale_manager)
def _extract_template_definition(self, context, cluster,
scale_manager=None):
scale_manager=None,
nodegroups=None):
cluster_template = conductor_utils.retrieve_cluster_template(context,
cluster)
definition = self.get_template_definition()
return definition.extract_definition(context, cluster_template,
cluster,
nodegroups=nodegroups,
scale_manager=scale_manager)
def _get_env_files(self, template_path, env_rel_paths):
@ -96,14 +103,20 @@ class HeatDriver(driver.Driver):
def delete_federation(self, context, federation):
return NotImplementedError("Must implement 'delete_federation'")
def create_nodegroup(self, context, cluster, nodegroup):
raise NotImplementedError("Must implement 'create_nodegroup'.")
def update_nodegroup(self, context, cluster, nodegroup):
raise NotImplementedError("Must implement 'update_nodegroup'.")
# we just need to save the nodegroup here. This is because,
# at the moment, this method is used to update min and max node
# counts.
nodegroup.save()
def delete_nodegroup(self, context, cluster, nodegroup):
raise NotImplementedError("Must implement 'delete_nodegroup'.")
# Default nodegroups share stack_id so it will be deleted
# as soon as the cluster gets destroyed
if not nodegroup.stack_id:
nodegroup.destroy()
else:
osc = clients.OpenStackClients(context)
self._delete_stack(context, osc, nodegroup.stack_id)
def update_cluster_status(self, context, cluster):
if cluster.stack_id is None:
@ -128,6 +141,16 @@ class HeatDriver(driver.Driver):
rollback=False):
self._update_stack(context, cluster, scale_manager, rollback)
def create_nodegroup(self, context, cluster, nodegroup):
    """Create a dedicated heat stack for the given (worker) nodegroup.

    The new stack id is stored on the nodegroup so subsequent
    operations (update/delete/status polling) can address it.
    """
    osc = clients.OpenStackClients(context)
    new_stack = self._create_stack(context, osc, cluster,
                                   cluster.create_timeout,
                                   nodegroup=nodegroup)
    nodegroup.stack_id = new_stack['stack']['id']
def get_nodegroup_extra_params(self, cluster, osc):
    """Return extra heat params needed to create a nodegroup stack.

    Concrete drivers must override this: only they know which
    resources/outputs of the cluster stack a nodegroup stack reuses.
    """
    raise NotImplementedError(
        "Must implement 'get_nodegroup_extra_params'")
@abc.abstractmethod
def upgrade_cluster(self, context, cluster, cluster_template,
max_batch_size, nodegroup, scale_manager=None,
@ -138,7 +161,14 @@ class HeatDriver(driver.Driver):
self.pre_delete_cluster(context, cluster)
LOG.info("Starting to delete cluster %s", cluster.uuid)
self._delete_stack(context, clients.OpenStackClients(context), cluster)
osc = clients.OpenStackClients(context)
for ng in cluster.nodegroups:
ng.status = fields.ClusterStatus.DELETE_IN_PROGRESS
ng.save()
if ng.is_default:
continue
self._delete_stack(context, osc, ng.stack_id)
self._delete_stack(context, osc, cluster.default_ng_master.stack_id)
def resize_cluster(self, context, cluster, resize_manager,
node_count, nodes_to_remove, nodegroup=None,
@ -147,9 +177,13 @@ class HeatDriver(driver.Driver):
node_count, nodes_to_remove, nodegroup=nodegroup,
rollback=rollback)
def _create_stack(self, context, osc, cluster, cluster_create_timeout):
def _create_stack(self, context, osc, cluster, cluster_create_timeout,
nodegroup=None):
nodegroups = [nodegroup] if nodegroup else None
template_path, heat_params, env_files = (
self._extract_template_definition(context, cluster))
self._extract_template_definition(context, cluster,
nodegroups=nodegroups))
tpl_files, template = template_utils.get_template_contents(
template_path)
@ -163,7 +197,10 @@ class HeatDriver(driver.Driver):
# valid hostnames are 63 chars long, leaving enough room
# to add the random id (for uniqueness)
stack_name = cluster.name[:30]
if nodegroup is None:
stack_name = cluster.name[:30]
else:
stack_name = "%s-%s" % (cluster.name[:20], nodegroup.name[:9])
stack_name = stack_name.replace('_', '-')
stack_name = stack_name.replace('.', '-')
stack_name = ''.join(filter(valid_chars.__contains__, stack_name))
@ -177,6 +214,14 @@ class HeatDriver(driver.Driver):
# no cluster_create_timeout value was passed in to the request
# so falling back on configuration file value
heat_timeout = cfg.CONF.cluster_heat.create_timeout
heat_params['is_cluster_stack'] = nodegroup is None
if nodegroup:
# In case we are creating a new stack for a new nodegroup then
# we need to extract more params.
heat_params.update(self.get_nodegroup_extra_params(cluster, osc))
fields = {
'stack_name': stack_name,
'parameters': heat_params,
@ -225,10 +270,10 @@ class HeatDriver(driver.Driver):
# Find what changed checking the stack params
# against the ones in the template_def.
stack = osc.heat().stacks.get(cluster.stack_id,
stack = osc.heat().stacks.get(nodegroup.stack_id,
resolve_outputs=True)
stack_params = stack.parameters
definition.add_nodegroup_params(cluster)
definition.add_nodegroup_params(cluster, nodegroups=[nodegroup])
heat_params = definition.get_stack_diff(context, stack_params, cluster)
LOG.debug('Updating stack with these params: %s', heat_params)
@ -244,10 +289,10 @@ class HeatDriver(driver.Driver):
}
osc = clients.OpenStackClients(context)
osc.heat().stacks.update(cluster.stack_id, **fields)
osc.heat().stacks.update(nodegroup.stack_id, **fields)
def _delete_stack(self, context, osc, cluster):
osc.heat().stacks.delete(cluster.stack_id)
def _delete_stack(self, context, osc, stack_id):
osc.heat().stacks.delete(stack_id)
class KubernetesDriver(HeatDriver):
@ -288,39 +333,123 @@ class HeatPoller(object):
def poll_and_check(self):
# TODO(yuanying): temporary implementation to update api_address,
# node_addresses and cluster status
ng_statuses = list()
self.default_ngs = list()
for nodegroup in self.cluster.nodegroups:
self.nodegroup = nodegroup
if self.nodegroup.is_default:
self.default_ngs.append(self.nodegroup)
status = self.extract_nodegroup_status()
# In case a non-default nodegroup is deleted, None
# is returned. We shouldn't add None in the list
if status is not None:
ng_statuses.append(status)
self.aggregate_nodegroup_statuses(ng_statuses)
def extract_nodegroup_status(self):
if self.nodegroup.stack_id is None:
# There is a slight window for a race condition here. If
# a nodegroup is created and just before the stack_id is
# assigned to it, this periodic task is executed, the
# periodic task would try to find the status of the
# stack with id = None. At that time the nodegroup status
# is already set to CREATE_IN_PROGRESS by the conductor.
# Keep this status for this loop until the stack_id is assigned.
return NodeGroupStatus(name=self.nodegroup.name,
status=self.nodegroup.status,
is_default=self.nodegroup.is_default,
reason=self.nodegroup.status_reason)
try:
# Do not resolve outputs by default. Resolving all
# node IPs is expensive on heat.
stack = self.openstack_client.heat().stacks.get(
self.cluster.stack_id, resolve_outputs=False)
self.nodegroup.stack_id, resolve_outputs=False)
# poll_and_check is detached and polling long time to check
# status, so another user/client can call delete cluster/stack.
if stack.stack_status == fields.ClusterStatus.DELETE_COMPLETE:
if self.nodegroup.is_default:
self._check_delete_complete()
else:
self.nodegroup.destroy()
return
if stack.stack_status in (fields.ClusterStatus.CREATE_COMPLETE,
fields.ClusterStatus.UPDATE_COMPLETE):
# Resolve all outputs if the stack is COMPLETE
stack = self.openstack_client.heat().stacks.get(
self.nodegroup.stack_id, resolve_outputs=True)
self._sync_cluster_and_template_status(stack)
elif stack.stack_status != self.nodegroup.status:
self.template_def.nodegroup_output_mappings = list()
self.template_def.update_outputs(
stack, self.cluster_template, self.cluster,
nodegroups=[self.nodegroup])
self._sync_cluster_status(stack)
if stack.stack_status in (fields.ClusterStatus.CREATE_FAILED,
fields.ClusterStatus.DELETE_FAILED,
fields.ClusterStatus.UPDATE_FAILED,
fields.ClusterStatus.ROLLBACK_COMPLETE,
fields.ClusterStatus.ROLLBACK_FAILED):
self._sync_cluster_and_template_status(stack)
self._nodegroup_failed(stack)
except heatexc.NotFound:
self._sync_missing_heat_stack()
return NodeGroupStatus(name=self.nodegroup.name,
status=self.nodegroup.status,
is_default=self.nodegroup.is_default,
reason=self.nodegroup.status_reason)
def aggregate_nodegroup_statuses(self, ng_statuses):
# NOTE(ttsiouts): Aggregate the nodegroup statuses and set the
# cluster overall status.
FAILED = '_FAILED'
IN_PROGRESS = '_IN_PROGRESS'
COMPLETE = '_COMPLETE'
UPDATE = 'UPDATE'
previous_state = self.cluster.status
self.cluster.status_reason = None
# Both default nodegroups will have the same status so it's
# enough to check one of them.
self.cluster.status = self.cluster.default_ng_master.status
default_ng = self.cluster.default_ng_master
if (default_ng.status.endswith(IN_PROGRESS) or
default_ng.status == fields.ClusterStatus.DELETE_COMPLETE):
self.cluster.save()
return
# poll_and_check is detached and polling long time to check status,
# so another user/client can call delete cluster/stack.
if stack.stack_status == fields.ClusterStatus.DELETE_COMPLETE:
self._delete_complete()
# Keep priority to the states below
for state in (IN_PROGRESS, FAILED, COMPLETE):
if any(ns.status.endswith(state) for ns in ng_statuses
if not ns.is_default):
status = getattr(fields.ClusterStatus, UPDATE+state)
self.cluster.status = status
if state == FAILED:
reasons = ["%s failed" % (ns.name)
for ns in ng_statuses
if ns.status.endswith(FAILED)]
self.cluster.status_reason = ' ,'.join(reasons)
break
if self.cluster.status == fields.ClusterStatus.CREATE_COMPLETE:
# Consider the scenario where the user:
# - creates the cluster (cluster: create_complete)
# - adds a nodegroup (cluster: update_complete)
# - deletes the nodegroup
# The cluster should go to CREATE_COMPLETE only if the previous
# state was CREATE_COMPLETE or CREATE_IN_PROGRESS. In all other
# cases, just go to UPDATE_COMPLETE.
if previous_state not in (fields.ClusterStatus.CREATE_COMPLETE,
fields.ClusterStatus.CREATE_IN_PROGRESS):
self.cluster.status = fields.ClusterStatus.UPDATE_COMPLETE
if stack.stack_status in (fields.ClusterStatus.CREATE_COMPLETE,
fields.ClusterStatus.UPDATE_COMPLETE):
# Resolve all outputs if the stack is COMPLETE
stack = self.openstack_client.heat().stacks.get(
self.cluster.stack_id, resolve_outputs=True)
self._sync_cluster_and_template_status(stack)
elif stack.stack_status != self.cluster.status:
self.template_def.update_outputs(stack, self.cluster_template,
self.cluster)
self._sync_cluster_status(stack)
if stack.stack_status in (fields.ClusterStatus.CREATE_FAILED,
fields.ClusterStatus.DELETE_FAILED,
fields.ClusterStatus.UPDATE_FAILED,
fields.ClusterStatus.ROLLBACK_COMPLETE,
fields.ClusterStatus.ROLLBACK_FAILED):
self._sync_cluster_and_template_status(stack)
self._cluster_failed(stack)
self.cluster.save()
def _delete_complete(self):
LOG.info('Cluster has been deleted, stack_id: %s',
@ -339,9 +468,9 @@ class HeatPoller(object):
self.cluster.uuid)
def _sync_cluster_status(self, stack):
self.cluster.status = stack.stack_status
self.cluster.status_reason = stack.stack_status_reason
self.cluster.save()
self.nodegroup.status = stack.stack_status
self.nodegroup.status_reason = stack.stack_status_reason
self.nodegroup.save()
def get_version_info(self, stack):
stack_param = self.template_def.get_heat_param(
@ -358,34 +487,44 @@ class HeatPoller(object):
self.cluster.container_version = container_version
def _sync_cluster_and_template_status(self, stack):
self.template_def.nodegroup_output_mappings = list()
self.template_def.update_outputs(stack, self.cluster_template,
self.cluster)
self.cluster,
nodegroups=[self.nodegroup])
self.get_version_info(stack)
self._sync_cluster_status(stack)
def _cluster_failed(self, stack):
LOG.error('Cluster error, stack status: %(cluster_status)s, '
def _nodegroup_failed(self, stack):
LOG.error('Nodegroup error, stack status: %(ng_status)s, '
'stack_id: %(stack_id)s, '
'reason: %(reason)s',
{'cluster_status': stack.stack_status,
'stack_id': self.cluster.stack_id,
'reason': self.cluster.status_reason})
{'ng_status': stack.stack_status,
'stack_id': self.nodegroup.stack_id,
'reason': self.nodegroup.status_reason})
def _sync_missing_heat_stack(self):
if self.cluster.status == fields.ClusterStatus.DELETE_IN_PROGRESS:
self._delete_complete()
elif self.cluster.status == fields.ClusterStatus.CREATE_IN_PROGRESS:
if self.nodegroup.status == fields.ClusterStatus.DELETE_IN_PROGRESS:
self._sync_missing_stack(fields.ClusterStatus.DELETE_COMPLETE)
if self.nodegroup.is_default:
self._check_delete_complete()
elif self.nodegroup.status == fields.ClusterStatus.CREATE_IN_PROGRESS:
self._sync_missing_stack(fields.ClusterStatus.CREATE_FAILED)
elif self.cluster.status == fields.ClusterStatus.UPDATE_IN_PROGRESS:
elif self.nodegroup.status == fields.ClusterStatus.UPDATE_IN_PROGRESS:
self._sync_missing_stack(fields.ClusterStatus.UPDATE_FAILED)
def _check_delete_complete(self):
    """Run cluster delete bookkeeping once all default NGs are gone.

    The default nodegroups share the cluster stack, so the
    cluster-level _delete_complete must only fire after every one of
    them has reached DELETE_COMPLETE.
    """
    done = fields.ClusterStatus.DELETE_COMPLETE
    if all(ng.status == done for ng in self.default_ngs):
        self._delete_complete()
def _sync_missing_stack(self, new_status):
self.cluster.status = new_status
self.cluster.status_reason = _("Stack with id %s not found in "
"Heat.") % self.cluster.stack_id
self.cluster.save()
LOG.info("Cluster with id %(id)s has been set to "
self.nodegroup.status = new_status
self.nodegroup.status_reason = _("Stack with id %s not found in "
"Heat.") % self.cluster.stack_id
self.nodegroup.save()
LOG.info("Nodegroup with id %(id)s has been set to "
"%(status)s due to stack with id %(sid)s "
"not found in Heat.",
{'id': self.cluster.id, 'status': self.cluster.status,
'sid': self.cluster.stack_id})
{'id': self.nodegroup.uuid, 'status': self.nodegroup.status,
'sid': self.nodegroup.stack_id})

5
magnum/drivers/heat/k8s_coreos_template_def.py

@ -65,8 +65,11 @@ class CoreOSK8sTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'calico_tag',
'calico_kube_controllers_tag', 'calico_ipv4pool',
'etcd_tag', 'flannel_tag']
labels = self._get_relevant_labels(cluster, kwargs)
for label in label_list:
label_value = cluster.labels.get(label)
label_value = labels.get(label)
if label_value:
extra_params[label] = label_value

4
magnum/drivers/heat/k8s_fedora_template_def.py

@ -100,8 +100,10 @@ class K8sFedoraTemplateDefinition(k8s_template_def.K8sTemplateDefinition):
'draino_tag', 'autoscaler_tag',
'min_node_count', 'max_node_count', 'npd_enabled']
labels = self._get_relevant_labels(cluster, kwargs)
for label in label_list:
label_value = cluster.labels.get(label)
label_value = labels.get(label)
if label_value:
extra_params[label] = label_value

6
magnum/drivers/heat/k8s_template_def.py

@ -212,8 +212,10 @@ class K8sTemplateDefinition(template_def.BaseTemplateDefinition):
'kubescheduler_options',
'influx_grafana_dashboard_enabled']
labels = self._get_relevant_labels(cluster, kwargs)
for label in label_list:
extra_params[label] = cluster.labels.get(label)
extra_params[label] = labels.get(label)
ingress_controller = cluster.labels.get('ingress_controller',
'').lower()
@ -233,7 +235,7 @@ class K8sTemplateDefinition(template_def.BaseTemplateDefinition):
extra_params['registry_container'] = (
CONF.docker_registry.swift_registry_container)
kube_tag = (cluster.labels.get("kube_tag") or
kube_tag = (labels.get("kube_tag") or
cluster_template.labels.get("kube_tag"))
if kube_tag:
extra_params['kube_version'] = kube_tag

4
magnum/drivers/heat/swarm_fedora_template_def.py

@ -141,8 +141,10 @@ class SwarmFedoraTemplateDefinition(template_def.BaseTemplateDefinition):
'docker_volume_type', CONF.cinder.default_docker_volume_type)
extra_params['docker_volume_type'] = docker_volume_type
labels = self._get_relevant_labels(cluster, kwargs)
for label in label_list:
extra_params[label] = cluster.labels.get(label)
extra_params[label] = labels.get(label)
if cluster_template.registry_enabled:
extra_params['swift_region'] = CONF.docker_registry.swift_region

4
magnum/drivers/heat/swarm_mode_template_def.py

@ -127,8 +127,10 @@ class SwarmModeTemplateDefinition(template_def.BaseTemplateDefinition):
extra_params['nodes_affinity_policy'] = \
CONF.cluster.nodes_affinity_policy
labels = self._get_relevant_labels(cluster, kwargs)
for label in label_list:
extra_params[label] = cluster.labels.get(label)
extra_params[label] = labels.get(label)
# set docker_volume_type
# use the configuration default if None provided

11
magnum/drivers/heat/template_def.py

@ -158,6 +158,10 @@ class NodeGroupOutputMapping(OutputMapping):
# nodegroups are fetched from the database every
# time, so the bad thing here is that we need to
# save each change.
previous_value = getattr(ng, self.nodegroup_attr, None)
if previous_value == output_value:
# Avoid saving if it's not needed.
return
setattr(ng, self.nodegroup_attr, output_value)
ng.save()
@ -426,6 +430,13 @@ class BaseTemplateDefinition(TemplateDefinition):
nodegroup_uuid=nodegroup.uuid,
param_class=NodeGroupParameterMapping)
def _get_relevant_labels(self, cluster, kwargs):
nodegroups = kwargs.get('nodegroups', None)
labels = cluster.labels
if nodegroups is not None:
labels = nodegroups[0].labels
return labels
def update_outputs(self, stack, cluster_template, cluster,
nodegroups=None):
master_ng = cluster.default_ng_master

16
magnum/drivers/k8s_fedora_atomic_v1/driver.py

@ -95,3 +95,19 @@ class Driver(driver.KubernetesDriver):
'disable_rollback': not rollback
}
osc.heat().stacks.update(cluster.stack_id, **fields)
def get_nodegroup_extra_params(self, cluster, osc):
    """Collect params a new nodegroup stack reuses from the cluster stack.

    A worker nodegroup stack plugs into the network, the minion
    security group and the master API address created by the cluster
    (default nodegroup) stack, so fetch those from heat.

    :param cluster: the cluster that owns the new nodegroup.
    :param osc: OpenStackClients instance used to reach heat.
    :returns: dict of extra heat parameters for the nodegroup stack.
    """
    heat_client = osc.heat()
    network = heat_client.resources.get(cluster.stack_id, 'network')
    secgroup = heat_client.resources.get(cluster.stack_id,
                                         'secgroup_kube_minion')
    # next() with a default avoids an UnboundLocalError if the cluster
    # stack unexpectedly exposes no 'api_address' output.
    api_address = next(
        (output['output_value']
         for output in heat_client.stacks.get(cluster.stack_id).outputs
         if output['output_key'] == 'api_address'),
        None)
    return {
        'existing_master_private_ip': api_address,
        'existing_security_group': secgroup.attributes['id'],
        'fixed_network': network.attributes['fixed_network'],
        'fixed_subnet': network.attributes['fixed_subnet'],
    }

101
magnum/drivers/k8s_fedora_atomic_v1/templates/kubecluster.yaml

@ -1,12 +1,53 @@
heat_template_version: 2015-04-30
heat_template_version: queens
description: >
This template will boot a Kubernetes cluster with one or more
minions (as specified by the number_of_minions parameter, which
defaults to 1).
conditions:
master_only:
or:
- equals:
- get_param: role
- "master"
- equals:
- get_param: is_cluster_stack
- true
worker_only:
or:
- equals:
- get_param: role
- "worker"
- equals:
- get_param: is_cluster_stack
- true
create_cluster_resources:
equals:
- get_param: is_cluster_stack
- true
parameters:
# needs to become a list if we want to join master nodes?
existing_master_private_ip:
type: string
default: ""
is_cluster_stack:
type: boolean
default: false
role:
type: string
default: ""
existing_security_group:
type: string
default: ""
ssh_key_name:
type: string
description: name of ssh key to be provisioned on our server
@ -34,10 +75,16 @@ parameters:
master_image:
type: string
description: glance image used to boot the server
# When creating a new minion nodegroup this will not
# be provided by magnum. So make it default to ""
default: ""
minion_image:
type: string
description: glance image used to boot the server
# When creating a new master nodegroup this will not
# be provided by magnum. So make it default to ""
default: ""
master_flavor:
type: string
@ -693,6 +740,7 @@ resources:
#
network:
condition: create_cluster_resources
type: ../../common/templates/network.yaml
properties:
existing_network: {get_param: fixed_network}
@ -703,6 +751,7 @@ resources:
private_network_name: {get_param: fixed_network_name}
api_lb:
condition: create_cluster_resources
type: ../../common/templates/lb_api.yaml
properties:
fixed_subnet: {get_attr: [network, fixed_subnet]}
@ -711,6 +760,7 @@ resources:
port: {get_param: kubernetes_port}
etcd_lb:
condition: create_cluster_resources
type: ../../common/templates/lb_etcd.yaml
properties:
fixed_subnet: {get_attr: [network, fixed_subnet]}
@ -724,6 +774,7 @@ resources:
#
secgroup_kube_master:
condition: create_cluster_resources
type: OS::Neutron::SecurityGroup
properties:
rules:
@ -760,6 +811,7 @@ resources:
port_range_max: 8472
secgroup_kube_minion:
condition: create_cluster_resources
type: OS::Neutron::SecurityGroup
properties:
rules:
@ -789,6 +841,7 @@ resources:
# allow any traffic between worker nodes
secgroup_rule_tcp_kube_minion:
condition: create_cluster_resources
type: OS::Neutron::SecurityGroupRule
properties:
protocol: tcp
@ -797,6 +850,7 @@ resources:
security_group: {get_resource: secgroup_kube_minion}
remote_group: {get_resource: secgroup_kube_minion}
secgroup_rule_udp_kube_minion:
condition: create_cluster_resources
type: OS::Neutron::SecurityGroupRule
properties:
protocol: udp
@ -812,6 +866,7 @@ resources:
#
api_address_lb_switch:
condition: create_cluster_resources
type: Magnum::ApiGatewaySwitcher
properties:
pool_public_ip: {get_attr: [api_lb, floating_address]}
@ -820,6 +875,7 @@ resources:
master_private_ip: {get_attr: [kube_masters, resource.0.kube_master_ip]}
etcd_address_lb_switch:
condition: create_cluster_resources
type: Magnum::ApiGatewaySwitcher
properties:
pool_private_ip: {get_attr: [etcd_lb, address]}
@ -832,6 +888,7 @@ resources:
#
api_address_floating_switch:
condition: create_cluster_resources
type: Magnum::FloatingIPAddressSwitcher
properties:
public_ip: {get_attr: [api_address_lb_switch, public_ip]}
@ -844,11 +901,13 @@ resources:
#
master_nodes_server_group:
condition: master_only
type: OS::Nova::ServerGroup
properties:
policies: [{get_param: nodes_affinity_policy}]
worker_nodes_server_group:
condition: worker_only
type: OS::Nova::ServerGroup
properties:
policies: [{get_param: nodes_affinity_policy}]
@ -860,6 +919,7 @@ resources:
#
kube_masters:
condition: master_only
type: OS::Heat::ResourceGroup
depends_on:
- network
@ -983,6 +1043,7 @@ resources:
npd_enabled: {get_param: npd_enabled}
kube_cluster_config:
condition: create_cluster_resources
type: OS::Heat::SoftwareConfig
properties:
group: script
@ -1025,6 +1086,7 @@ resources:
- get_file: ../../common/templates/kubernetes/fragments/install-helm-modules.sh
kube_cluster_deploy:
condition: create_cluster_resources
type: OS::Heat::SoftwareDeployment
properties:
actions: ['CREATE']
@ -1042,6 +1104,7 @@ resources:
#
kube_minions:
condition: worker_only
type: OS::Heat::ResourceGroup
depends_on:
- network
@ -1061,12 +1124,28 @@ resources:
ssh_key_name: {get_param: ssh_key_name}
server_image: {get_param: minion_image}
minion_flavor: {get_param: minion_flavor}
fixed_network: {get_attr: [network, fixed_network]}
fixed_subnet: {get_attr: [network, fixed_subnet]}
fixed_network:
if:
- create_cluster_resources
- get_attr: [network, fixed_network]
- get_param: fixed_network
fixed_subnet:
if:
- create_cluster_resources
- get_attr: [network, fixed_subnet]
- get_param: fixed_subnet
network_driver: {get_param: network_driver}
flannel_network_cidr: {get_param: flannel_network_cidr}
kube_master_ip: {get_attr: [api_address_lb_switch, private_ip]}
etcd_server_ip: {get_attr: [etcd_address_lb_switch, private_ip]}
kube_master_ip:
if:
- create_cluster_resources
- get_attr: [api_address_lb_switch, private_ip]
- get_param: existing_master_private_ip
etcd_server_ip:
if:
- create_cluster_resources
- get_attr: [etcd_address_lb_switch, private_ip]
- get_param: existing_master_private_ip
external_network: {get_param: external_network}
kube_allow_priv: {get_param: kube_allow_priv}
boot_volume_size: {get_param: boot_volume_size}
@ -1092,7 +1171,11 @@ resources:
kubernetes_port: {get_param: kubernetes_port}
tls_disabled: {get_param: tls_disabled}
verify_ca: {get_param: verify_ca}
secgroup_kube_minion_id: {get_resource: secgroup_kube_minion}
secgroup_kube_minion_id:
if:
- create_cluster_resources
- get_resource: secgroup_kube_minion
- get_param: existing_security_group
http_proxy: {get_param: http_proxy}
https_proxy: {get_param: https_proxy}
no_proxy: {get_param: no_proxy}
@ -1123,6 +1206,7 @@ resources:
outputs:
api_address:
condition: create_cluster_resources
value:
str_replace:
template: api_ip_address
@ -1133,6 +1217,7 @@ outputs:
the Kubernetes API.
registry_address:
condition: create_cluster_resources
value:
str_replace:
template: localhost:port
@ -1143,22 +1228,26 @@ outputs:
images.
kube_masters_private:
condition: master_only
value: {get_attr: [kube_masters, kube_master_ip]}
description: >
This is a list of the "private" IP addresses of all the Kubernetes masters.
kube_masters:
condition: master_only
value: {get_attr: [kube_masters, kube_master_external_ip]}
description: >
This is a list of the "public" IP addresses of all the Kubernetes masters.
Use these IP addresses to log in to the Kubernetes masters via ssh.
kube_minions_private:
condition: worker_only
value: {get_attr: [kube_minions, kube_minion_ip]}
description: >
This is a list of the "private" IP addresses of all the Kubernetes minions.
kube_minions:
condition: worker_only
value: {get_attr: [kube_minions, kube_minion_external_ip]}
description: >
This is a list of the "public" IP addresses of all the Kubernetes minions.

4
magnum/drivers/mesos_ubuntu_v1/template_def.py

@ -101,8 +101,10 @@ class UbuntuMesosTemplateDefinition(template_def.BaseTemplateDefinition):
'mesos_slave_work_dir',
'mesos_slave_executor_env_variables']
labels = self._get_relevant_labels(cluster, kwargs)
for label in label_list:
extra_params[label] = cluster.labels.get(label)
extra_params[label] = labels.get(label)
return super(UbuntuMesosTemplateDefinition,
self).get_params(context, cluster_template, cluster,

10
magnum/tests/unit/conductor/handlers/test_cluster_conductor.py

@ -470,7 +470,8 @@ class TestHandler(db_base.DbTestCase):
timeout)
mock_extract_tmpl_def.assert_called_once_with(self.context,
cluster)
cluster,
nodegroups=None)
mock_get_template_contents.assert_called_once_with(
'the/template/path.yaml')
mock_process_mult.assert_called_once_with(
@ -487,7 +488,8 @@ class TestHandler(db_base.DbTestCase):
'file:///the/template/env_file_2':
'content of file:///the/template/env_file_2'
},
parameters={'heat_param_1': 'foo', 'heat_param_2': 'bar'},
parameters={'is_cluster_stack': True, 'heat_param_1': 'foo',
'heat_param_2': 'bar'},
stack_name=('%s-short_id' % cluster.name),
template='some template yaml',
timeout_mins=timeout)
@ -543,6 +545,8 @@ class TestHandler(db_base.DbTestCase):
osc = mock.MagicMock()
mock_openstack_client_class.return_value = osc
osc.heat.side_effect = exc.HTTPConflict
self.worker.create()
self.master.create()
self.assertRaises(exception.OperationInProgress,
self.handler.cluster_delete,
self.context,
@ -570,6 +574,8 @@ class TestHandler(db_base.DbTestCase):
mock_octavia.return_value = True
mock_driver.return_value = k8s_atomic_dr.Driver()
self.master.create()
self.worker.create()
self.handler.cluster_delete(self.context, self.cluster.uuid)
notifications = fake_notifier.NOTIFICATIONS

6
magnum/tests/unit/conductor/handlers/test_k8s_cluster_conductor.py

@ -1126,7 +1126,7 @@ class TestClusterConductorWithK8s(base.TestCase):
expected_args = {
'stack_name': expected_stack_name,
'parameters': {},
'parameters': {'is_cluster_stack': True},
'template': expected_template_contents,
'files': {},
'environment_files': [],
@ -1166,7 +1166,7 @@ class TestClusterConductorWithK8s(base.TestCase):
expected_args = {
'stack_name': expected_stack_name,
'parameters': {},
'parameters': {'is_cluster_stack': True},
'template': expected_template_contents,
'files': {},
'environment_files': [],
@ -1208,7 +1208,7 @@ class TestClusterConductorWithK8s(base.TestCase):
expected_args = {
'stack_name': expected_stack_name,
'parameters': {},
'parameters': {'is_cluster_stack': True},
'template': expected_template_contents,
'files': {},
'environment_files': [],

654
magnum/tests/unit/drivers/test_heat_driver.py

@ -13,6 +13,8 @@
import mock
from mock import patch
from heatclient import exc as heatexc
import magnum.conf
from magnum.drivers.heat import driver as heat_driver
from magnum.drivers.k8s_fedora_atomic_v1 import driver as k8s_atomic_dr
@ -26,25 +28,96 @@ CONF = magnum.conf.CONF
class TestHeatPoller(base.TestCase):
def setUp(self):
super(TestHeatPoller, self).setUp()
self.mock_stacks = dict()
self.def_ngs = list()
def _create_nodegroup(self, cluster, uuid, stack_id, role=None,
is_default=False, stack_status=None,
status_reason=None, stack_params=None,
stack_missing=False):
"""Create a new nodegroup
Util that creates a new non-default ng, adds it to the cluster
and creates the corresponding mock stack.
"""
role = 'worker' if role is None else role
ng = mock.MagicMock(uuid=uuid, role=role, is_default=is_default,
stack_id=stack_id)
cluster.nodegroups.append(ng)
if stack_status is None:
stack_status = cluster_status.CREATE_COMPLETE
if status_reason is None:
status_reason = 'stack created'
stack_params = dict() if stack_params is None else stack_params
stack = mock.MagicMock(stack_status=stack_status,
stack_status_reason=status_reason,
parameters=stack_params)
# In order to simulate a stack not found from osc we don't add the
# stack in the dict.
if not stack_missing:
self.mock_stacks.update({stack_id: stack})
else:
# In case the stack is missing we need
# to set the status to the ng, so that
# _sync_missing_heat_stack knows which
# was the previous state.
ng.status = stack_status
return ng
@patch('magnum.conductor.utils.retrieve_cluster_template')
@patch('oslo_config.cfg')
@patch('magnum.common.clients.OpenStackClients')
@patch('magnum.drivers.common.driver.Driver.get_driver')
def setup_poll_test(self, mock_driver, mock_openstack_client, cfg,
mock_retrieve_cluster_template):
mock_retrieve_cluster_template,
default_stack_status=None, status_reason=None,
stack_params=None, stack_missing=False):
cfg.CONF.cluster_heat.max_attempts = 10
worker_ng = mock.MagicMock(uuid='worker_ng', role='worker')
master_ng = mock.MagicMock(uuid='master_ng', role='master')
nodegroups = [worker_ng, master_ng]
cluster = mock.MagicMock(nodegroups=nodegroups,
default_ng_worker=worker_ng,
default_ng_master=master_ng)
if default_stack_status is None:
default_stack_status = cluster_status.CREATE_COMPLETE
cluster = mock.MagicMock(nodegroups=list())
def_worker = self._create_nodegroup(cluster, 'worker_ng', 'stack1',
role='worker', is_default=True,
stack_status=default_stack_status,
status_reason=status_reason,
stack_params=stack_params,
stack_missing=stack_missing)
def_master = self._create_nodegroup(cluster, 'master_ng', 'stack1',
role='master', is_default=True,
stack_status=default_stack_status,
status_reason=status_reason,
stack_params=stack_params,
stack_missing=stack_missing)
cluster.default_ng_worker = def_worker
cluster.default_ng_master = def_master
self.def_ngs = [def_worker, def_master]
def get_ng_stack(stack_id, resolve_outputs=False):
try:
return self.mock_stacks[stack_id]
except KeyError:
# In this case we intentionally didn't add the stack
# to the mock_stacks dict to simulte a not found error.
# For this reason raise heat NotFound exception.
raise heatexc.NotFound("stack not found")
cluster_template_dict = utils.get_test_cluster_template(
coe='kubernetes')
mock_heat_stack = mock.MagicMock()
mock_heat_client = mock.MagicMock()
mock_heat_client.stacks.get.return_value = mock_heat_stack
mock_heat_client.stacks.get = get_ng_stack
mock_openstack_client.heat.return_value = mock_heat_client
cluster_template = objects.ClusterTemplate(self.context,
**cluster_template_dict)
@ -54,174 +127,545 @@ class TestHeatPoller(base.TestCase):
mock.MagicMock(), cluster,
k8s_atomic_dr.Driver())
poller.get_version_info = mock.MagicMock()
return (mock_heat_stack, cluster, poller)
return (cluster, poller)
def test_poll_no_save(self):
mock_heat_stack, cluster, poller = self.setup_poll_test()
def test_poll_and_check_creating(self):
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.CREATE_IN_PROGRESS)
cluster.status = cluster_status.CREATE_IN_PROGRESS
mock_heat_stack.stack_status = cluster_status.CREATE_IN_PROGRESS
poller.poll_and_check()
self.assertEqual(0, cluster.save.call_count)
def test_poll_save(self):
mock_heat_stack, cluster, poller = self.setup_poll_test()
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.CREATE_IN_PROGRESS, ng.status)
self.assertEqual(cluster_status.CREATE_IN_PROGRESS, cluster.status)
self.assertEqual(1, cluster.save.call_count)
def test_poll_and_check_create_complete(self):
cluster, poller = self.setup_poll_test()
cluster.status = cluster_status.CREATE_IN_PROGRESS
poller.poll_and_check()
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.CREATE_COMPLETE, ng.status)
self.assertEqual('stack created', ng.status_reason)
self.assertEqual(1, ng.save.call_count)
self.assertEqual(cluster_status.CREATE_COMPLETE, cluster.status)
self.assertEqual(1, cluster.save.call_count)
def test_poll_and_check_create_failed(self):
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.CREATE_FAILED)
cluster.status = cluster_status.CREATE_IN_PROGRESS
mock_heat_stack.stack_status = cluster_status.CREATE_FAILED
mock_heat_stack.stack_status_reason = 'Create failed'
self.assertIsNone(poller.poll_and_check())
self.assertEqual(2, cluster.save.call_count)
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.CREATE_FAILED, ng.status)
# Two calls to save since the stack ouptputs are synced too.
self.assertEqual(2, ng.save.call_count)
self.assertEqual(cluster_status.CREATE_FAILED, cluster.status)
self.assertEqual('Create failed', cluster.status_reason)
self.assertEqual(1, cluster.save.call_count)
@patch('os.path.join')
def test_poll_done(self, mock_join):
mock_heat_stack, cluster, poller = self.setup_poll_test()
def test_poll_and_check_updating(self):
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.UPDATE_IN_PROGRESS)
mock_heat_stack.stack_status = cluster_status.DELETE_COMPLETE
self.assertIsNone(poller.poll_and_check())
cluster.status = cluster_status.UPDATE_IN_PROGRESS
poller.poll_and_check()
mock_heat_stack.stack_status = cluster_status.CREATE_FAILED
self.assertIsNone(poller.poll_and_check())
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.UPDATE_IN_PROGRESS, ng.status)
self.assertEqual(1, ng.save.call_count)
def test_poll_done_by_update(self):
mock_heat_stack, cluster, poller = self.setup_poll_test()
self.assertEqual(cluster_status.UPDATE_IN_PROGRESS, cluster.status)
self.assertEqual(1, cluster.save.call_count)
mock_heat_stack.stack_status = cluster_status.UPDATE_COMPLETE
mock_heat_stack.parameters = {
def test_poll_and_check_update_complete(self):
stack_params = {
'number_of_minions': 2,
'number_of_masters': 1
}
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.UPDATE_COMPLETE,
stack_params=stack_params)
cluster.status = cluster_status.UPDATE_IN_PROGRESS
self.assertIsNone(poller.poll_and_check())
self.assertEqual(1, cluster.save.call_count)
self.assertEqual(1, cluster.default_ng_worker.save.call_count)
self.assertEqual(1, cluster.default_ng_master.save.call_count)
self.assertEqual(cluster_status.UPDATE_COMPLETE, cluster.status)
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.UPDATE_COMPLETE, ng.status)
self.assertEqual(2, cluster.default_ng_worker.save.call_count)
self.assertEqual(2, cluster.default_ng_master.save.call_count)
self.assertEqual(2, cluster.default_ng_worker.node_count)
self.assertEqual(1, cluster.default_ng_master.node_count)
def test_poll_done_by_update_failed(self):
mock_heat_stack, cluster, poller = self.setup_poll_test()
self.assertEqual(cluster_status.UPDATE_COMPLETE, cluster.status)
self.assertEqual(1, cluster.save.call_count)
mock_heat_stack.stack_status = cluster_status.UPDATE_FAILED
mock_heat_stack.parameters = {
def test_poll_and_check_update_failed(self):
stack_params = {
'number_of_minions': 2,
'number_of_masters': 1
}
self.assertIsNone(poller.poll_and_check())
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.UPDATE_FAILED,
stack_params=stack_params)
cluster.status = cluster_status.UPDATE_IN_PROGRESS
poller.poll_and_check()
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.UPDATE_FAILED, ng.status)
# We have several calls to save because the stack outputs are
# stored too.
self.assertEqual(3, ng.save.call_count)
self.assertEqual(2, cluster.save.call_count)
self.assertEqual(cluster_status.UPDATE_FAILED, cluster.status)
self.assertEqual(2, cluster.default_ng_worker.node_count)
self.assertEqual(1, cluster.default_ng_master.node_count)
def test_poll_done_by_rollback_complete(self):
mock_heat_stack, cluster, poller = self.setup_poll_test()
self.assertEqual(cluster_status.UPDATE_FAILED, cluster.status)
self.assertEqual(1, cluster.save.call_count)
def test_poll_and_check_deleting(self):
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.DELETE_IN_PROGRESS)
cluster.status = cluster_status.DELETE_IN_PROGRESS
poller.poll_and_check()
for ng in cluster.nodegroups:
self.assertEqual(cluster_status.DELETE_IN_PROGRESS, ng.status)
# We have two calls to save because the stack outputs are
# stored too.
self.assertEqual(1, ng.save.call_count)
self.assertEqual(cluster_status.DELETE_IN_PROGRESS, cluster.status)
self.assertEqual(1, cluster.save.call_count)
def test_poll_and_check_deleted(self):
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.DELETE_COMPLETE)
cluster.status = cluster_status.DELETE_IN_PROGRESS
self.assertIsNone(poller.poll_and_check())
self.assertEqual(cluster_status.DELETE_COMPLETE,
cluster.default_ng_worker.status)
self.assertEqual(1, cluster.default_ng_worker.save.call_count)
self.assertEqual(0, cluster.default_ng_worker.destroy.call_count)
self.assertEqual(cluster_status.DELETE_COMPLETE,
cluster.default_ng_master.status)
self.assertEqual(1, cluster.default_ng_master.save.call_count)
self.assertEqual(0, cluster.default_ng_master.destroy.call_count)
self.assertEqual(cluster_status.DELETE_COMPLETE, cluster.status)
self.assertEqual(1, cluster.save.call_count)
self.assertEqual(0, cluster.destroy.call_count)
def test_poll_and_check_delete_failed(self):
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.DELETE_FAILED)
mock_heat_stack.stack_status = cluster_status.ROLLBACK_COMPLETE
mock_heat_stack.parameters = {
cluster.status = cluster_status.DELETE_IN_PROGRESS
poller.poll_and_check()
self.assertEqual(cluster_status.DELETE_FAILED,
cluster.default_ng_worker.status)
# We have two calls to save because the stack outputs are
# stored too.
self.assertEqual(2, cluster.default_ng_worker.save.call_count)
self.assertEqual(0, cluster.default_ng_worker.destroy.call_count)
self.assertEqual(cluster_status.DELETE_FAILED,
cluster.default_ng_master.status)
# We have two calls to save because the stack outputs are
# stored too.
self.assertEqual(2, cluster.default_ng_master.save.call_count)
self.assertEqual(0, cluster.default_ng_master.destroy.call_count)
self.assertEqual(cluster_status.DELETE_FAILED, cluster.status)
self.assertEqual(1, cluster.save.call_count)
self.assertEqual(0, cluster.destroy.call_count)
def test_poll_done_rollback_complete(self):
stack_params = {
'number_of_minions': 1,
'number_of_masters': 1
}
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.ROLLBACK_COMPLETE,
stack_params=stack_params)
self.assertIsNone(poller.poll_and_check())
self.assertEqual(2, cluster.save.call_count)
self.assertEqual(1, cluster.save.call_count)
self.assertEqual(cluster_status.ROLLBACK_COMPLETE, cluster.status)
self.assertEqual(1, cluster.default_ng_worker.node_count)
self.assertEqual(1, cluster.default_ng_master.node_count)
def test_poll_done_by_rollback_failed(self):
mock_heat_stack, cluster, poller = self.setup_poll_test()
mock_heat_stack.stack_status = cluster_status.ROLLBACK_FAILED
mock_heat_stack.parameters = {
def test_poll_done_rollback_failed(self):
stack_params = {
'number_of_minions': 1,
'number_of_masters': 1
}
cluster, poller = self.setup_poll_test(
default_stack_status=cluster_status.ROLLBACK_FAILED,
stack_params=stack_params)
self.assertIsNone(poller.poll_and_check())
self.assertEqual(2, cluster.save.call_count)
self.assertEqual(1, cluster.save.call_count)
self.assertEqual(cluster_status.ROLLBACK_FAILED, cluster.status)
self.assertEqual(1, cluster.default_ng_worker.node_count)
self.assertEqual(1, cluster.default_ng_master.node_count)
@patch('os.path.join')
def test_poll_destroy(self, mock_join):
mo