Allow autoscaling of non-default node groups

This change allows min/max node counts for all node groups to be passed through to the Cluster API Helm values so that the autoscaler will handle them correctly. It also adds some additional validation of node counts during node group creation.

Change-Id: I3f6ea5faefdd9affa2aad89dda7d70bbc890f970
This commit is contained in:
sd109 2024-05-20 10:38:57 +01:00
parent f5881c9388
commit 2300100462
3 changed files with 243 additions and 31 deletions

View File

@ -578,30 +578,20 @@ class Driver(driver.Driver):
def _get_autoheal_enabled(self, cluster):
return self._get_label_bool(cluster, "auto_healing_enabled", True)
def _get_autoscale(self, cluster, nodegroup):
auto_scale = self._get_label_bool(
cluster, "auto_scaling_enabled", False
def _get_autoscale_enabled(self, cluster):
    """Return True if the cluster's 'auto_scaling_enabled' label is set.

    Defaults to False when the label is absent.
    """
    return self._get_label_bool(cluster, "auto_scaling_enabled", False)
def _get_autoscale_values(self, cluster, nodegroup):
auto_scale = self._get_autoscale_enabled(cluster)
min_nodes, max_nodes = self._validate_allowed_node_counts(
cluster, nodegroup
)
if auto_scale:
auto_scale_args = dict(autoscale="true")
min_nodes = max(1, nodegroup.min_node_count)
max_nodes = self._get_label_int(
cluster, "max_node_count", min_nodes
)
if min_nodes > nodegroup.node_count:
raise exception.MagnumException(
message="min_node_count must be less than or equal to "
"default-worker nodegroup node_count."
)
auto_scale_args = {}
if auto_scale and max_nodes is not None:
auto_scale_args["autoscale"] = "true"
auto_scale_args["machineCountMin"] = min_nodes
if max_nodes < min_nodes:
raise exception.MagnumException(
message="max_node_count must be greater than or "
"equal to min_node_count"
)
auto_scale_args["machineCountMax"] = max_nodes
return auto_scale_args
return auto_scale
return auto_scale_args
def _get_k8s_keystone_auth_enabled(self, cluster):
return self._get_label_bool(cluster, "keystone_auth_enabled", False)
@ -645,6 +635,58 @@ class Driver(driver.Driver):
f"Minimum {CONF.capi_helm.minimum_flavor_ram} MB required."
)
def _is_default_worker_nodegroup(self, cluster, nodegroup):
return cluster.default_ng_worker.id == nodegroup.id
def _get_node_counts(self, cluster, nodegroup):
    """Resolve the effective (min_nodes, max_nodes) for *nodegroup*.

    For the default worker group, min/max fall back to the
    'min_node_count' / 'max_node_count' cluster labels when not set
    on the group itself; other groups use only their own properties.

    :returns: tuple of (min_nodes, max_nodes); max_nodes may be None
        when not configured anywhere.
    """
    # ClusterAPI provider OpenStack (CAPO) doesn't
    # support scale to zero yet
    min_nodes = max(1, nodegroup.min_node_count)
    max_nodes = nodegroup.max_node_count
    # If min/max node counts are not defined on the default
    # worker group then fall back to equivalent cluster labels
    if self._is_default_worker_nodegroup(cluster, nodegroup):
        # Magnum seems to set min_node_count = 1 on default group
        # but we still want to be able to override that with labels
        # NOTE(review): max(1, None) would raise TypeError above, so
        # the "min_nodes is None" guard looks unreachable — confirm
        # whether min_node_count can ever be None here.
        if min_nodes is None or min_nodes == 1:
            min_nodes = self._get_label_int(cluster, "min_node_count", 1)
        if not max_nodes:
            # Default the max to the resolved min when no label is set
            max_nodes = self._get_label_int(
                cluster, "max_node_count", min_nodes
            )
    return min_nodes, max_nodes
def _validate_allowed_node_counts(self, cluster, nodegroup):
    """Validate *nodegroup*'s node count configuration for autoscaling.

    Resolves the effective min/max node counts (including label
    fallbacks for the default worker group) and checks that they are
    mutually consistent with the group's current node count.

    :returns: tuple of (min_nodes, max_nodes); max_nodes may be None.
    :raises exception.NodeGroupInvalidInput: if min < 1, min exceeds
        the current node count, or max < min.
    """
    min_nodes, max_nodes = self._get_node_counts(cluster, nodegroup)
    LOG.debug(
        f"Checking if node group {nodegroup.name} has valid "
        f"node count parameters (count, min, max) = "
        f"{(nodegroup.node_count, min_nodes, max_nodes)}"
    )
    if min_nodes is not None:
        if min_nodes < 1:
            raise exception.NodeGroupInvalidInput(
                message="Min node count must be greater than "
                "or equal to 1 for all node groups."
            )
        if min_nodes > nodegroup.node_count:
            raise exception.NodeGroupInvalidInput(
                message="Min node count must be less than "
                "or equal to current node count"
            )
        # Only comparable when both bounds are set
        if max_nodes is not None and max_nodes < min_nodes:
            raise exception.NodeGroupInvalidInput(
                message="Max node count must be greater than "
                "or equal to min node count"
            )
    return min_nodes, max_nodes
def _get_csi_cinder_availability_zone(self, cluster):
return self._label(
cluster,
@ -757,13 +799,9 @@ class Driver(driver.Driver):
machineFlavor=ng.flavor_id,
machineCount=ng.node_count,
)
# Assume first nodegroup is default-worker.
if not nodegroup_set:
auto_scale = self._get_autoscale(cluster, ng)
if auto_scale:
nodegroup_item = helm.mergeconcat(
nodegroup_item, auto_scale
)
if self._get_autoscale_enabled(cluster):
values = self._get_autoscale_values(cluster, ng)
nodegroup_item = helm.mergeconcat(nodegroup_item, values)
nodegroup_set.append(nodegroup_item)
return nodegroup_set
@ -1037,6 +1075,8 @@ class Driver(driver.Driver):
def create_nodegroup(self, context, cluster, nodegroup):
    """Create a node group by updating the cluster's Helm release.

    Validates the requested flavor and, when autoscaling is enabled
    on the cluster, the group's min/max node counts before saving.
    """
    nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    self._validate_allowed_flavor(context, nodegroup.flavor_id)
    # Min/max consistency only matters when the autoscaler will use them
    if self._get_autoscale_enabled(cluster):
        self._validate_allowed_node_counts(cluster, nodegroup)
    nodegroup.save()
    self._update_helm_release(context, cluster)
@ -1044,6 +1084,8 @@ class Driver(driver.Driver):
def update_nodegroup(self, context, cluster, nodegroup):
    """Update a node group by updating the cluster's Helm release.

    Validates the requested flavor and, when autoscaling is enabled
    on the cluster, the group's min/max node counts before saving.
    """
    nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
    self._validate_allowed_flavor(context, nodegroup.flavor_id)
    # Min/max consistency only matters when the autoscaler will use them
    if self._get_autoscale_enabled(cluster):
        self._validate_allowed_node_counts(cluster, nodegroup)
    nodegroup.save()
    self._update_helm_release(context, cluster)

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from unittest import mock
from uuid import uuid4
from magnum.common import exception
from magnum.common import neutron
@ -2702,7 +2703,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
)
self.assertRaises(
exception.MagnumException,
self.driver._get_autoscale,
self.driver._get_autoscale_values,
self.cluster_obj,
self.cluster_obj.nodegroups[0],
)
@ -2755,10 +2756,9 @@ class ClusterAPIDriverTest(base.DbTestCase):
helm_install_values["nodeGroups"][0]["autoscale"],
auto_scale_labels["auto_scaling_enabled"],
)
# min_node_count is hardcoded to max(1, ng.min_node_count)
self.assertEqual(
helm_install_values["nodeGroups"][0]["machineCountMin"],
self.cluster_obj.nodegroups[0].min_node_count,
auto_scale_labels["min_node_count"],
)
self.assertEqual(
helm_install_values["nodeGroups"][0]["machineCountMax"],
@ -2821,3 +2821,163 @@ class ClusterAPIDriverTest(base.DbTestCase):
"machineCountMax",
helm_install_values["nodeGroups"][0],
)
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_nodegroup_node_count_validation(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Cluster creation rejects a node group with max < min node count."""
    auto_scale_labels = dict(
        auto_scaling_enabled="true", min_node_count=2, max_node_count=6
    )
    self.cluster_obj.labels = auto_scale_labels
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )
    # Create cluster with extra node groups
    # (max_node_count=2 < min_node_count=3 is deliberately invalid)
    ng = obj_utils.create_test_nodegroup(
        self.context, min_node_count=3, max_node_count=2
    )
    self.cluster_obj.nodegroups.append(ng)
    self.assertRaises(
        exception.NodeGroupInvalidInput,
        self.driver.create_cluster,
        self.context,
        self.cluster_obj,
        "not-used",
    )
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_create_extra_nodegroup_auto_scale_values(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Autoscale Helm values are set per node group.

    The default worker group takes min/max from cluster labels, an
    extra group with min/max set gets its own autoscale values, and a
    group without min/max set gets no autoscale values at all.
    """
    auto_scale_labels = dict(
        auto_scaling_enabled="true", min_node_count=2, max_node_count=6
    )
    self.cluster_obj.labels = auto_scale_labels
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )
    # Create cluster with extra node groups
    non_auto_scale_nodegroup = obj_utils.create_test_nodegroup(
        self.context,
        uuid=uuid4(),
        id=123,
        name="non-autoscale-group",
        node_count=1,
    )
    self.cluster_obj.nodegroups.append(non_auto_scale_nodegroup)
    auto_scale_nodegroup = obj_utils.create_test_nodegroup(
        self.context,
        uuid=uuid4(),
        id=456,
        name="autoscale-group",
        node_count=1,
        min_node_count=0,
        max_node_count=3,
    )
    self.cluster_obj.nodegroups.append(auto_scale_nodegroup)
    self.driver.create_cluster(self.context, self.cluster_obj, 10)
    # Unpack some values for asserting against
    helm_install_values = mock_install.call_args[0][3]
    helm_node_groups = helm_install_values["nodeGroups"]
    helm_values_default_nodegroup = [
        ng
        for ng in helm_node_groups
        if ng["name"] == self.cluster_obj.default_ng_worker.name
    ]
    helm_values_autoscale_nodegroup = [
        ng
        for ng in helm_node_groups
        if ng["name"] == auto_scale_nodegroup.name
    ]
    helm_values_non_autoscale_nodegroup = [
        ng
        for ng in helm_node_groups
        if ng["name"] == non_auto_scale_nodegroup.name
    ]
    # Check that node groups were passed into Helm values correctly
    self.assertEqual(len(helm_values_default_nodegroup), 1)
    self.assertEqual(len(helm_values_autoscale_nodegroup), 1)
    self.assertEqual(len(helm_values_non_autoscale_nodegroup), 1)
    # Check default node group values
    ng = helm_values_default_nodegroup[0]
    self.assertEqual(ng["autoscale"], "true")
    self.assertEqual(
        ng["machineCountMin"], auto_scale_labels["min_node_count"]
    )
    self.assertEqual(
        ng["machineCountMax"], auto_scale_labels["max_node_count"]
    )
    # Check extra autoscaling node group values
    # NOTE: CAPO doesn't support scale to zero so
    # min node count should be max(1, ng.min_node_count)
    ng = helm_values_autoscale_nodegroup[0]
    self.assertEqual(ng["autoscale"], "true")
    self.assertEqual(
        ng["machineCountMin"], max(1, auto_scale_nodegroup.min_node_count)
    )
    self.assertEqual(
        ng["machineCountMax"], auto_scale_nodegroup.max_node_count
    )
    # Check extra non-autoscaling node group values
    ng = helm_values_non_autoscale_nodegroup[0]
    self.assertEqual(ng.get("autoscale"), None)
    self.assertEqual(ng.get("machineCountMin"), None)
    self.assertEqual(ng.get("machineCountMax"), None)

View File

@ -0,0 +1,10 @@
---
features:
- |
Adds support for autoscaling of non-default worker node groups. The min/max
node count properties on each node group are passed to the autoscaler to
determine autoscaling behaviour. If the target cluster has the
`auto_scaling_enabled` cluster label set to `true` then any node groups with
both `min_node_count` and `max_node_count` properties set will be allowed
to autoscale between these min and max node counts depending on the cluster's
current workload.