# magnum/magnum/drivers/cluster_api/driver.py

# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import enum
import re

from oslo_log import log as logging
from oslo_utils import encodeutils

from magnum.api import utils as api_utils
from magnum.common import clients
from magnum.common import exception
from magnum.common import short_id
from magnum.common.x509 import operations as x509
from magnum.conductor.handlers.common import cert_manager
from magnum import conf
from magnum.drivers.cluster_api import app_creds
from magnum.drivers.cluster_api import helm
from magnum.drivers.cluster_api import kubernetes
from magnum.drivers.common import driver
from magnum.drivers.common import k8s_monitor
from magnum.objects import fields

LOG = logging.getLogger(__name__)
CONF = conf.CONF

NODE_GROUP_ROLE_CONTROLLER = "master"


class NodeGroupState(enum.Enum):
NOT_PRESENT = 1
PENDING = 2
READY = 3
    FAILED = 4


class Driver(driver.Driver):
def __init__(self):
self._helm_client = helm.Client()
self.__k8s_client = None
@property
def _k8s_client(self):
if not self.__k8s_client:
self.__k8s_client = kubernetes.Client.load()
return self.__k8s_client
@property
def provides(self):
return [
{
"server_type": "vm",
# TODO(johngarbutt) OS list should probably come from config?
"os": "ubuntu",
"coe": "kubernetes",
},
]
def _update_control_plane_nodegroup_status(self, cluster, nodegroup):
# The status of the master nodegroup is determined by the Cluster API
# control plane object
kcp = self._k8s_client.get_kubeadm_control_plane(
self._sanitised_name(
self._get_chart_release_name(cluster), "control-plane"
),
self._namespace(cluster),
)
ng_state = NodeGroupState.NOT_PRESENT
if kcp:
ng_state = NodeGroupState.PENDING
kcp_spec = kcp.get("spec", {}) if kcp else {}
kcp_status = kcp.get("status", {}) if kcp else {}
# The control plane object is what controls the Kubernetes version
# If it is known, report it
kube_version = kcp_status.get("version", kcp_spec.get("version"))
if cluster.coe_version != kube_version:
cluster.coe_version = kube_version
cluster.save()
kcp_true_conditions = {
cond["type"]
for cond in kcp_status.get("conditions", [])
if cond["status"] == "True"
}
kcp_ready = all(
cond in kcp_true_conditions
for cond in (
"MachinesReady",
"Ready",
"EtcdClusterHealthy",
"ControlPlaneComponentsHealthy"
)
)
target_replicas = kcp_spec.get("replicas")
current_replicas = kcp_status.get("replicas")
updated_replicas = kcp_status.get("updatedReplicas")
ready_replicas = kcp_status.get("readyReplicas")
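        # All replica counts must agree with the spec before we treat
        # the control plane as ready; a mismatch means a rollout or
        # scale operation is still in flight.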
if (
kcp_ready and
target_replicas == current_replicas and
current_replicas == updated_replicas and
updated_replicas == ready_replicas
):
ng_state = NodeGroupState.READY
# TODO(mkjpryor) Work out a way to determine FAILED state
return self._update_nodegroup_status(cluster, nodegroup, ng_state)
def _update_worker_nodegroup_status(self, cluster, nodegroup):
# The status of a worker nodegroup is determined by the corresponding
# Cluster API machine deployment
md = self._k8s_client.get_machine_deployment(
self._sanitised_name(
self._get_chart_release_name(cluster), nodegroup.name
),
self._namespace(cluster),
)
ng_state = NodeGroupState.NOT_PRESENT
if md:
ng_state = NodeGroupState.PENDING
        # When a machine deployment is deleted it disappears immediately,
        # even if machines belonging to it are still being deleted.
        # In that case we keep the nodegroup in DELETE_IN_PROGRESS
        # until all of the node group's machines are gone.
if (
not md
and nodegroup.status.startswith("DELETE_")
and self._nodegroup_machines_exist(cluster, nodegroup)
):
LOG.debug(
f"Node group {nodegroup.name} "
f"for cluster {cluster.uuid} "
"machine deployment gone, but machines still found."
)
ng_state = NodeGroupState.PENDING
md_status = md.get("status", {}) if md else {}
md_phase = md_status.get("phase")
if md_phase:
if md_phase == "Running":
ng_state = NodeGroupState.READY
elif md_phase in {"Failed", "Unknown"}:
ng_state = NodeGroupState.FAILED
return self._update_nodegroup_status(cluster, nodegroup, ng_state)
def _update_nodegroup_status(self, cluster, nodegroup, ng_state):
        # For a delete operation we wait until the node group
        # is no longer present
if nodegroup.status.startswith("DELETE_"):
if ng_state == NodeGroupState.NOT_PRESENT:
if not nodegroup.is_default:
# Conductor will delete default nodegroups
# when cluster is deleted, but non default
# node groups should be deleted here.
nodegroup.destroy()
LOG.debug(
f"Node group deleted: {nodegroup.name} "
f"for cluster {cluster.uuid} "
f"which is_default: {nodegroup.is_default}"
)
# signal the node group has been deleted
return None
            LOG.debug(
                f"Node group not yet deleted: {nodegroup.name} "
                f"for cluster {cluster.uuid}"
            )
return nodegroup
is_update_operation = nodegroup.status.startswith("UPDATE_")
is_create_operation = nodegroup.status.startswith("CREATE_")
if not is_update_operation and not is_create_operation:
LOG.warning(
f"Node group: {nodegroup.name} in unexpected "
f"state: {nodegroup.status} in cluster {cluster.uuid}"
)
elif ng_state == NodeGroupState.READY:
nodegroup.status = (
fields.ClusterStatus.UPDATE_COMPLETE
if is_update_operation
else fields.ClusterStatus.CREATE_COMPLETE
)
LOG.debug(
f"Node group ready: {nodegroup.name} "
f"in cluster {cluster.uuid}"
)
nodegroup.save()
elif ng_state == NodeGroupState.FAILED:
nodegroup.status = (
fields.ClusterStatus.UPDATE_FAILED
if is_update_operation
else fields.ClusterStatus.CREATE_FAILED
)
LOG.debug(
f"Node group failed: {nodegroup.name} "
f"in cluster {cluster.uuid}"
)
nodegroup.save()
elif ng_state == NodeGroupState.NOT_PRESENT:
LOG.debug(
f"Node group not yet found: {nodegroup.name} "
f"state:{nodegroup.status} in cluster {cluster.uuid}"
)
else:
LOG.debug(
f"Node group still pending: {nodegroup.name} "
f"state:{nodegroup.status} in cluster {cluster.uuid}"
)
return nodegroup
def _nodegroup_machines_exist(self, cluster, nodegroup):
cluster_name = self._get_chart_release_name(cluster)
nodegroup_name = self._sanitised_name(nodegroup.name)
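        # These labels are presumably applied to the Machines by the
        # Helm chart; they let us find machines that outlive their
        # machine deployment during a delete.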
machines = self._k8s_client.get_all_machines_by_label(
{
"capi.stackhpc.com/cluster": cluster_name,
"capi.stackhpc.com/component": "worker",
"capi.stackhpc.com/node-group": nodegroup_name,
},
self._namespace(cluster),
)
return bool(machines)
def _update_cluster_api_address(self, cluster, capi_cluster):
# As soon as we know the API address, we should set it
# This means users can access the API even if the create is
# not complete, which could be useful for debugging failures,
# e.g. with addons
if not capi_cluster:
# skip update if cluster not yet created
return
if cluster.status not in [
fields.ClusterStatus.CREATE_IN_PROGRESS,
fields.ClusterStatus.UPDATE_IN_PROGRESS,
]:
# only update api-address when updating or creating
return
api_endpoint = capi_cluster["spec"].get("controlPlaneEndpoint")
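        # controlPlaneEndpoint is expected to look like
        # {"host": "192.0.2.10", "port": 6443}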
if api_endpoint:
api_address = (
f"https://{api_endpoint['host']}:{api_endpoint['port']}"
)
if cluster.api_address != api_address:
cluster.api_address = api_address
cluster.save()
LOG.debug(f"Found api_address for {cluster.uuid}")
def _update_status_updating(self, cluster, capi_cluster):
# If the cluster is not yet ready then the create/update
# is still in progress
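        # Conditions follow the Cluster API convention, e.g.
        # {"type": "Ready", "status": "True", ...}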
true_conditions = {
cond["type"]
for cond in capi_cluster.get("status", {}).get("conditions", [])
if cond["status"] == "True"
}
for cond in ("InfrastructureReady", "ControlPlaneReady", "Ready"):
if cond not in true_conditions:
return
is_update_operation = cluster.status.startswith("UPDATE_")
# Check the status of the addons
addons = self._k8s_client.get_addons_by_label(
{
"addons.stackhpc.com/cluster": self._sanitised_name(
self._get_chart_release_name(cluster)
),
},
self._namespace(cluster)
)
        for addon in addons:
            addon_phase = addon.get("status", {}).get("phase")
            if addon_phase in {"Failed", "Unknown"}:
                # If the addon has failed, mark the cluster as failed
                cluster.status = (
                    fields.ClusterStatus.UPDATE_FAILED
                    if is_update_operation
                    else fields.ClusterStatus.CREATE_FAILED
                )
                cluster.save()
                return
            elif addon_phase == "Deployed":
                # If the addon is deployed, move on to the next one
                continue
            else:
                # If any addon is neither deployed nor failed, wait
                # for the next invocation to check again
                LOG.debug(
                    f"addon {addon['metadata']['name']} not yet deployed "
                    f"for {cluster.uuid}"
                )
                return
# If we get this far, the cluster has completed successfully
cluster.status = (
fields.ClusterStatus.UPDATE_COMPLETE
if is_update_operation
else fields.ClusterStatus.CREATE_COMPLETE
)
cluster.save()
def _update_status_deleting(self, context, cluster):
# Once the Cluster API cluster is gone, we need to clean up
# the secrets we created
self._k8s_client.delete_all_secrets_by_label(
"magnum.openstack.org/cluster-uuid",
cluster.uuid,
self._namespace(cluster),
)
# We also need to clean up the appcred that we made
app_creds.delete_app_cred(context, cluster)
cluster.status = fields.ClusterStatus.DELETE_COMPLETE
cluster.save()
def _get_capi_cluster(self, cluster):
return self._k8s_client.get_capi_cluster(
self._sanitised_name(self._get_chart_release_name(cluster)),
self._namespace(cluster),
)
def _update_all_nodegroups_status(self, cluster):
"""Returns True if any node group still in progress."""
nodegroups = []
for nodegroup in cluster.nodegroups:
if nodegroup.role == NODE_GROUP_ROLE_CONTROLLER:
updated_nodegroup = (
self._update_control_plane_nodegroup_status(
cluster, nodegroup
)
)
else:
updated_nodegroup = self._update_worker_nodegroup_status(
cluster, nodegroup
)
if updated_nodegroup:
nodegroups.append(updated_nodegroup)
# Return True if any are still in progress
for nodegroup in nodegroups:
if nodegroup.status.endswith("_IN_PROGRESS"):
return True
return False
def update_cluster_status(self, context, cluster):
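        """Poll Cluster API state and sync the Magnum status fields.

        The conductor calls this periodically while an operation is
        in progress; it moves IN_PROGRESS states towards COMPLETE.
        """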
# NOTE(mkjpryor)
# Because Kubernetes operators are built around reconciliation loops,
# Cluster API clusters don't really go into an error state
# Hence we only currently handle transitioning from IN_PROGRESS
# states to COMPLETE
# TODO(mkjpryor) Add a timeout for create/update/delete
# Update the cluster API address if it is known
# so users can get their coe credentials
capi_cluster = self._get_capi_cluster(cluster)
self._update_cluster_api_address(cluster, capi_cluster)
# Update the nodegroups first
# to ensure API never returns an inconsistent state
nodegroups_in_progress = self._update_all_nodegroups_status(cluster)
if cluster.status in {
fields.ClusterStatus.CREATE_IN_PROGRESS,
fields.ClusterStatus.UPDATE_IN_PROGRESS,
}:
LOG.debug("Checking on an update for %s", cluster.uuid)
# If the cluster does not exist yet,
# create is still in progress
if not capi_cluster:
LOG.debug(f"capi_cluster not yet created for {cluster.uuid}")
return
if nodegroups_in_progress:
LOG.debug(f"Node groups are not all ready for {cluster.uuid}")
return
self._update_status_updating(cluster, capi_cluster)
elif cluster.status == fields.ClusterStatus.DELETE_IN_PROGRESS:
LOG.debug("Checking on a delete for %s", cluster.uuid)
# If the Cluster API cluster still exists,
# the delete is still in progress
if capi_cluster:
LOG.debug(f"capi_cluster still found for {cluster.uuid}")
return
self._update_status_deleting(context, cluster)
def get_monitor(self, context, cluster):
return k8s_monitor.K8sMonitor(context, cluster)
def _namespace(self, cluster):
# We create clusters in a project-specific namespace
# To generate the namespace, first sanitize the project id
project_id = re.sub("[^a-z0-9]", "", cluster.project_id.lower())
suffix = CONF.capi_driver.magnum_namespace_suffix
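        # For example (illustrative values): a project id of
        # "Ab01-Cd23" with suffix "magnum" gives the
        # namespace "magnum-ab01cd23"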
return f"{suffix}-{project_id}"
def _label(self, cluster, key, default):
all_labels = helm.mergeconcat(
cluster.cluster_template.labels, cluster.labels
)
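        # Cluster labels take precedence over template labels,
        # assuming mergeconcat gives later arguments priority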
if not all_labels:
return default
raw = all_labels.get(key, default)
# NOTE(johngarbutt): filtering untrusted user input
return re.sub(r"[^a-z0-9\.\-\/]+", "", raw)
def _get_chart_version(self, cluster):
version = cluster.cluster_template.labels.get(
"capi_helm_chart_version", CONF.capi_driver.helm_chart_version
)
# NOTE(johngarbutt): filtering untrusted user input
return re.sub(r"[^a-z0-9\.\-]+", "", version)
def _sanitised_name(self, name, suffix=None):
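        # e.g. ("My Cluster", "ca") -> "my-cluster-ca"; any run of
        # characters outside [a-z0-9] collapses to a single "-"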
return re.sub(
"[^a-z0-9]+",
"-",
(f"{name}-{suffix}" if suffix else name).lower(),
)
def _get_kube_version(self, image):
# The image should have a property containing the Kubernetes version
kube_version = image.get("kube_version")
if not kube_version:
raise exception.KubeVersionPropertyNotFound(image_id=image.id)
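        # The property is expected to look like "v1.28.3" (or "1.28.3");
        # strip any leading "v" to get a bare version string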
return kube_version.lstrip("v")
def _get_image_details(self, context, image_identifier):
osc = clients.OpenStackClients(context)
image = api_utils.get_openstack_resource(
osc.glance().images, image_identifier, "images"
)
return image.id, self._get_kube_version(image)
def _get_app_cred_name(self, cluster):
return self._sanitised_name(
self._get_chart_release_name(cluster), "cloud-credentials"
)
def _get_monitoring_enabled(self, cluster):
mon_label = self._label(cluster, "monitoring_enabled", "")
        # NOTE(mkjpryor) default off, like the heat driver,
        # as monitoring requires Cinder and takes a while to deploy
return mon_label == "true"
def _get_kube_dash_enabled(self, cluster):
kube_dash_label = self._label(cluster, "kube_dashboard_enabled", "")
# NOTE(mkjpryor) default on, like the heat driver
return kube_dash_label != "false"
def _update_helm_release(self, context, cluster, nodegroups=None):
if nodegroups is None:
nodegroups = cluster.nodegroups
cluster_template = cluster.cluster_template
image_id, kube_version = self._get_image_details(
context, cluster_template.image_id
)
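        # Map Magnum template fields and labels onto the values
        # consumed by the configured CAPI Helm chart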
values = {
"kubernetesVersion": kube_version,
"machineImageId": image_id,
"cloudCredentialsSecretName": self._get_app_cred_name(cluster),
# TODO(johngarbutt): need to respect requested networks
"clusterNetworking": {
"internalNetwork": {
"nodeCidr": self._label(
cluster, "fixed_subnet_cidr", "10.0.0.0/24"
),
}
},
"apiServer": {
"enableLoadBalancer": True,
"loadBalancerProvider": self._label(
cluster, "octavia_provider", "amphora"
),
},
"controlPlane": {
"machineFlavor": cluster.master_flavor_id,
"machineCount": cluster.master_count,
},
"addons": {
"monitoring": {
"enabled": self._get_monitoring_enabled(cluster)
},
"kubernetesDashboard": {
"enabled": self._get_kube_dash_enabled(cluster)
},
# TODO(mkjpryor): can't enable ingress until code exists to
# remove the load balancer
"ingress": {"enabled": False},
},
"nodeGroups": [
{
"name": self._sanitised_name(ng.name),
"machineFlavor": ng.flavor_id,
"machineCount": ng.node_count,
}
for ng in nodegroups
if ng.role != NODE_GROUP_ROLE_CONTROLLER
],
}
if cluster_template.dns_nameserver:
dns_nameservers = cluster_template.dns_nameserver.split(",")
values["clusterNetworking"]["dnsNameservers"] = dns_nameservers
if cluster.keypair:
values["machineSSHKeyName"] = cluster.keypair
chart_version = self._get_chart_version(cluster)
self._helm_client.install_or_upgrade(
self._get_chart_release_name(cluster),
CONF.capi_driver.helm_chart_name,
values,
repo=CONF.capi_driver.helm_chart_repo,
version=chart_version,
namespace=self._namespace(cluster),
)
def _generate_release_name(self, cluster):
if cluster.stack_id:
return
        # Avoid duplicate names by appending
        # a 12 character random id
        random_bit = short_id.generate_id()
        base_name = self._sanitised_name(cluster.name)
        # Helm release names are limited to 53 characters,
        # the random suffix takes up 12 (plus a hyphen),
        # and we also use this name to derive hostnames
trimmed_name = base_name[:30]
# Save the full name, so users can rename in the API
cluster.stack_id = f"{trimmed_name}-{random_bit}".lower()
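        # e.g. a cluster named "My Prod Cluster" might get a stack_id
        # like "my-prod-cluster-abc123def456" (illustrative suffix)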
# be sure to save this before we use it
cluster.save()
def _get_chart_release_name(self, cluster):
return cluster.stack_id
def _k8s_resource_labels(self, cluster):
return {
"magnum.openstack.org/project-id": cluster.project_id,
"magnum.openstack.org/user-id": cluster.user_id,
"magnum.openstack.org/cluster-uuid": cluster.uuid,
}
def _create_appcred_secret(self, context, cluster):
ca_certificate = app_creds.get_openstack_ca_certificate()
appcred_yaml = app_creds.get_app_cred_yaml(context, cluster)
name = self._get_app_cred_name(cluster)
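        # The chart consumes this secret via the
        # cloudCredentialsSecretName value set in _update_helm_release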
self._k8s_client.apply_secret(
name,
{
"metadata": {"labels": self._k8s_resource_labels(cluster)},
"stringData": {
"cacert": ca_certificate,
"clouds.yaml": appcred_yaml,
},
},
self._namespace(cluster),
)
def _decode_cert(self, cert):
return encodeutils.safe_decode(cert.get_certificate())
def _decode_key(self, cert):
key = x509.decrypt_key(
cert.get_private_key(),
cert.get_private_key_passphrase(),
)
return encodeutils.safe_decode(key)
def _ensure_certificate_secrets(self, context, cluster):
# Magnum creates CA certs for each of the Kubernetes components that
# must be trusted by the cluster
# In particular, this is required for "openstack coe cluster config"
# to work, as that doesn't communicate with the driver and instead
# relies on the correct CA being trusted by the cluster
# Cluster API looks for specific named secrets for each of the CAs,
# and generates them if they don't exist, so we create them here
# with the correct certificates in
certificates = {
"ca": cert_manager.get_cluster_ca_certificate(cluster, context),
"etcd": cert_manager.get_cluster_ca_certificate(
cluster, context, "etcd"
),
"proxy": cert_manager.get_cluster_ca_certificate(
cluster, context, "front_proxy"
),
"sa": cert_manager.get_cluster_magnum_cert(cluster, context),
}
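        # Cluster API looks for secrets named <cluster>-ca,
        # <cluster>-etcd, <cluster>-proxy and <cluster>-sa,
        # which the loop below creates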
for name, cert in certificates.items():
self._k8s_client.apply_secret(
self._sanitised_name(
self._get_chart_release_name(cluster), name
),
{
"metadata": {"labels": self._k8s_resource_labels(cluster)},
"type": "cluster.x-k8s.io/secret",
"stringData": {
"tls.crt": self._decode_cert(cert),
"tls.key": self._decode_key(cert),
},
},
self._namespace(cluster),
)
def create_cluster(self, context, cluster, cluster_create_timeout):
LOG.info("Starting to create cluster %s", cluster.uuid)
        # We generate this name on the initial create call only,
        # so duplicate cluster names cause no problems and
        # clusters can still be renamed in the API
self._generate_release_name(cluster)
# NOTE(johngarbutt) all node groups should already
# be in the CREATE_IN_PROGRESS state
self._k8s_client.ensure_namespace(self._namespace(cluster))
self._create_appcred_secret(context, cluster)
self._ensure_certificate_secrets(context, cluster)
self._update_helm_release(context, cluster)
def update_cluster(
self, context, cluster, scale_manager=None, rollback=False
):
# Cluster API refuses to update things like cluster networking,
# so it is safest not to implement this for now
# TODO(mkjpryor) Check what bits of update we can support
raise NotImplementedError(
"Updating a cluster in this way is not currently supported"
)
def delete_cluster(self, context, cluster):
LOG.info("Starting to delete cluster %s", cluster.uuid)
        # Mirror the heat driver by marking all node groups
        # as delete in progress here, as this is not done by the conductor
# We do this before calling uninstall_release because
# update_cluster_status can get called before we return
for ng in cluster.nodegroups:
ng.status = fields.ClusterStatus.DELETE_IN_PROGRESS
ng.save()
# Begin the deletion of the cluster resources by uninstalling the
# Helm release
# Note that this just marks the resources for deletion - it does not
# wait for the resources to be deleted
self._helm_client.uninstall_release(
self._get_chart_release_name(cluster),
namespace=self._namespace(cluster),
)
def resize_cluster(
self,
context,
cluster,
resize_manager,
node_count,
nodes_to_remove,
nodegroup=None,
):
if nodes_to_remove:
LOG.warning("Removing specific nodes is not currently supported")
self._update_helm_release(context, cluster)
def upgrade_cluster(
self,
context,
cluster,
cluster_template,
max_batch_size,
nodegroup,
scale_manager=None,
rollback=False,
):
# TODO(mkjpryor) check that the upgrade is viable
# e.g. not a downgrade, not an upgrade by more than one minor version
# Updating the template will likely apply for all nodegroups
# So mark them all as having an update in progress
for nodegroup in cluster.nodegroups:
nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
nodegroup.save()
# Move the cluster to the new template
cluster.cluster_template_id = cluster_template.uuid
cluster.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
cluster.save()
cluster.refresh()
self._update_helm_release(context, cluster)
def create_nodegroup(self, context, cluster, nodegroup):
nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS
nodegroup.save()
self._update_helm_release(context, cluster)
def update_nodegroup(self, context, cluster, nodegroup):
nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
nodegroup.save()
self._update_helm_release(context, cluster)
def delete_nodegroup(self, context, cluster, nodegroup):
nodegroup.status = fields.ClusterStatus.DELETE_IN_PROGRESS
nodegroup.save()
# Remove the nodegroup being deleted from the nodegroups
# for the Helm release
self._update_helm_release(
context,
cluster,
[ng for ng in cluster.nodegroups if ng.name != nodegroup.name]
)
def create_federation(self, context, federation):
raise NotImplementedError("Will not implement 'create_federation'")
def update_federation(self, context, federation):
raise NotImplementedError("Will not implement 'update_federation'")
def delete_federation(self, context, federation):
raise NotImplementedError("Will not implement 'delete_federation'")