From 2300100462ae5ebecd6f315100bb48b4ac6eabde Mon Sep 17 00:00:00 2001 From: sd109 Date: Mon, 20 May 2024 10:38:57 +0100 Subject: [PATCH] Allow autoscaling of non-default node groups This change allows min/max node counts for all node groups to be passed through to the Cluster API Helm values so that the autoscaler will handle them correctly. It also adds some additional validation of node counts during node group creation. Change-Id: I3f6ea5faefdd9affa2aad89dda7d70bbc890f970 --- magnum_capi_helm/driver.py | 98 ++++++++--- magnum_capi_helm/tests/test_driver.py | 166 +++++++++++++++++- ...ng-all-worker-groups-8c7a07d0f56ac2ca.yaml | 10 ++ 3 files changed, 243 insertions(+), 31 deletions(-) create mode 100644 releasenotes/notes/autoscaling-all-worker-groups-8c7a07d0f56ac2ca.yaml diff --git a/magnum_capi_helm/driver.py b/magnum_capi_helm/driver.py index 88e7028..8fdf6ac 100644 --- a/magnum_capi_helm/driver.py +++ b/magnum_capi_helm/driver.py @@ -578,30 +578,20 @@ class Driver(driver.Driver): def _get_autoheal_enabled(self, cluster): return self._get_label_bool(cluster, "auto_healing_enabled", True) - def _get_autoscale(self, cluster, nodegroup): - auto_scale = self._get_label_bool( - cluster, "auto_scaling_enabled", False + def _get_autoscale_enabled(self, cluster): + return self._get_label_bool(cluster, "auto_scaling_enabled", False) + + def _get_autoscale_values(self, cluster, nodegroup): + auto_scale = self._get_autoscale_enabled(cluster) + min_nodes, max_nodes = self._validate_allowed_node_counts( + cluster, nodegroup ) - if auto_scale: - auto_scale_args = dict(autoscale="true") - min_nodes = max(1, nodegroup.min_node_count) - max_nodes = self._get_label_int( - cluster, "max_node_count", min_nodes - ) - if min_nodes > nodegroup.node_count: - raise exception.MagnumException( - message="min_node_count must be less than or equal to " - "default-worker nodegroup node_count." 
- ) + auto_scale_args = {} + if auto_scale and max_nodes is not None: + auto_scale_args["autoscale"] = "true" auto_scale_args["machineCountMin"] = min_nodes - if max_nodes < min_nodes: - raise exception.MagnumException( - message="max_node_count must be greater than or " - "equal to min_node_count" - ) auto_scale_args["machineCountMax"] = max_nodes - return auto_scale_args - return auto_scale + return auto_scale_args def _get_k8s_keystone_auth_enabled(self, cluster): return self._get_label_bool(cluster, "keystone_auth_enabled", False) @@ -645,6 +635,58 @@ class Driver(driver.Driver): f"Minimum {CONF.capi_helm.minimum_flavor_ram} MB required." ) + def _is_default_worker_nodegroup(self, cluster, nodegroup): + return cluster.default_ng_worker.id == nodegroup.id + + def _get_node_counts(self, cluster, nodegroup): + + # ClusterAPI provider OpenStack (CAPO) doesn't + # support scale to zero yet + min_nodes = max(1, nodegroup.min_node_count) + max_nodes = nodegroup.max_node_count + + # If min/max node counts are not defined on the default + # worker group then fall back to equivalent cluster labels + if self._is_default_worker_nodegroup(cluster, nodegroup): + # Magnum seems to set min_node_count = 1 on default group + # but we still want to be able to override that with labels + if min_nodes is None or min_nodes == 1: + min_nodes = self._get_label_int(cluster, "min_node_count", 1) + if not max_nodes: + max_nodes = self._get_label_int( + cluster, "max_node_count", min_nodes + ) + + return min_nodes, max_nodes + + def _validate_allowed_node_counts(self, cluster, nodegroup): + min_nodes, max_nodes = self._get_node_counts(cluster, nodegroup) + + LOG.debug( + f"Checking if node group {nodegroup.name} has valid " + f"node count parameters (count, min, max) = " + f"{(nodegroup.node_count, min_nodes, max_nodes)}" + ) + + if min_nodes is not None: + if min_nodes < 1: + raise exception.NodeGroupInvalidInput( + message="Min node count must be greater than " + "or equal to 1 for all 
node groups." + ) + if min_nodes > nodegroup.node_count: + raise exception.NodeGroupInvalidInput( + message="Min node count must be less than " + "or equal to current node count" + ) + if max_nodes is not None and max_nodes < min_nodes: + raise exception.NodeGroupInvalidInput( + message="Max node count must be greater than " + "or equal to min node count" + ) + + return min_nodes, max_nodes + def _get_csi_cinder_availability_zone(self, cluster): return self._label( cluster, @@ -757,13 +799,9 @@ class Driver(driver.Driver): machineFlavor=ng.flavor_id, machineCount=ng.node_count, ) - # Assume first nodegroup is default-worker. - if not nodegroup_set: - auto_scale = self._get_autoscale(cluster, ng) - if auto_scale: - nodegroup_item = helm.mergeconcat( - nodegroup_item, auto_scale - ) + if self._get_autoscale_enabled(cluster): + values = self._get_autoscale_values(cluster, ng) + nodegroup_item = helm.mergeconcat(nodegroup_item, values) nodegroup_set.append(nodegroup_item) return nodegroup_set @@ -1037,6 +1075,8 @@ class Driver(driver.Driver): def create_nodegroup(self, context, cluster, nodegroup): nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS self._validate_allowed_flavor(context, nodegroup.flavor_id) + if self._get_autoscale_enabled(cluster): + self._validate_allowed_node_counts(cluster, nodegroup) nodegroup.save() self._update_helm_release(context, cluster) @@ -1044,6 +1084,8 @@ class Driver(driver.Driver): def update_nodegroup(self, context, cluster, nodegroup): nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS self._validate_allowed_flavor(context, nodegroup.flavor_id) + if self._get_autoscale_enabled(cluster): + self._validate_allowed_node_counts(cluster, nodegroup) nodegroup.save() self._update_helm_release(context, cluster) diff --git a/magnum_capi_helm/tests/test_driver.py b/magnum_capi_helm/tests/test_driver.py index ca48c75..56cf5af 100644 --- a/magnum_capi_helm/tests/test_driver.py +++ b/magnum_capi_helm/tests/test_driver.py @@ -10,6 
+10,7 @@ # License for the specific language governing permissions and limitations # under the License. from unittest import mock +from uuid import uuid4 from magnum.common import exception from magnum.common import neutron @@ -2702,7 +2703,7 @@ class ClusterAPIDriverTest(base.DbTestCase): ) self.assertRaises( exception.MagnumException, - self.driver._get_autoscale, + self.driver._get_autoscale_values, self.cluster_obj, self.cluster_obj.nodegroups[0], ) @@ -2755,10 +2756,9 @@ class ClusterAPIDriverTest(base.DbTestCase): helm_install_values["nodeGroups"][0]["autoscale"], auto_scale_labels["auto_scaling_enabled"], ) - # min_node_count is hardcode to max(1, ng.min_node_count) self.assertEqual( helm_install_values["nodeGroups"][0]["machineCountMin"], - self.cluster_obj.nodegroups[0].min_node_count, + auto_scale_labels["min_node_count"], ) self.assertEqual( helm_install_values["nodeGroups"][0]["machineCountMax"], @@ -2821,3 +2821,163 @@ class ClusterAPIDriverTest(base.DbTestCase): "machineCountMax", helm_install_values["nodeGroups"][0], ) + + @mock.patch.object( + driver.Driver, + "_storageclass_definitions", + return_value=mock.ANY, + ) + @mock.patch.object(driver.Driver, "_validate_allowed_flavor") + @mock.patch.object(neutron, "get_network", autospec=True) + @mock.patch.object( + driver.Driver, "_ensure_certificate_secrets", autospec=True + ) + @mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True) + @mock.patch.object(kubernetes.Client, "load", autospec=True) + @mock.patch.object(driver.Driver, "_get_image_details", autospec=True) + @mock.patch.object(helm.Client, "install_or_upgrade", autospec=True) + def test_nodegroup_node_count_validation( + self, + mock_install, + mock_image, + mock_load, + mock_appcred, + mock_certs, + mock_get_net, + mock_validate_allowed_flavor, + mock_storageclasses, + ): + auto_scale_labels = dict( + auto_scaling_enabled="true", min_node_count=2, max_node_count=6 + ) + self.cluster_obj.labels = auto_scale_labels + + 
mock_image.return_value = ( + "imageid1", + "1.27.4", + "ubuntu", + ) + + # Create cluster with extra node groups + ng = obj_utils.create_test_nodegroup( + self.context, min_node_count=3, max_node_count=2 + ) + self.cluster_obj.nodegroups.append(ng) + self.assertRaises( + exception.NodeGroupInvalidInput, + self.driver.create_cluster, + self.context, + self.cluster_obj, + "not-used", + ) + + @mock.patch.object( + driver.Driver, + "_storageclass_definitions", + return_value=mock.ANY, + ) + @mock.patch.object(driver.Driver, "_validate_allowed_flavor") + @mock.patch.object(neutron, "get_network", autospec=True) + @mock.patch.object( + driver.Driver, "_ensure_certificate_secrets", autospec=True + ) + @mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True) + @mock.patch.object(kubernetes.Client, "load", autospec=True) + @mock.patch.object(driver.Driver, "_get_image_details", autospec=True) + @mock.patch.object(helm.Client, "install_or_upgrade", autospec=True) + def test_create_extra_nodegroup_auto_scale_values( + self, + mock_install, + mock_image, + mock_load, + mock_appcred, + mock_certs, + mock_get_net, + mock_validate_allowed_flavor, + mock_storageclasses, + ): + auto_scale_labels = dict( + auto_scaling_enabled="true", min_node_count=2, max_node_count=6 + ) + self.cluster_obj.labels = auto_scale_labels + + mock_image.return_value = ( + "imageid1", + "1.27.4", + "ubuntu", + ) + + # Create cluster with extra node groups + non_auto_scale_nodegroup = obj_utils.create_test_nodegroup( + self.context, + uuid=uuid4(), + id=123, + name="non-autoscale-group", + node_count=1, + ) + self.cluster_obj.nodegroups.append(non_auto_scale_nodegroup) + auto_scale_nodegroup = obj_utils.create_test_nodegroup( + self.context, + uuid=uuid4(), + id=456, + name="autoscale-group", + node_count=1, + min_node_count=0, + max_node_count=3, + ) + self.cluster_obj.nodegroups.append(auto_scale_nodegroup) + self.driver.create_cluster(self.context, self.cluster_obj, 10) + for ng in 
self.cluster_obj.nodegroups: + print(ng) + + # Unpack some values for asserting against + helm_install_values = mock_install.call_args[0][3] + helm_node_groups = helm_install_values["nodeGroups"] + helm_values_default_nodegroup = [ + ng + for ng in helm_node_groups + if ng["name"] == self.cluster_obj.default_ng_worker.name + ] + helm_values_autoscale_nodegroup = [ + ng + for ng in helm_node_groups + if ng["name"] == auto_scale_nodegroup.name + ] + helm_values_non_autoscale_nodegroup = [ + ng + for ng in helm_node_groups + if ng["name"] == non_auto_scale_nodegroup.name + ] + + # Check that node groups were passed into Helm values correctly + self.assertEqual(len(helm_values_default_nodegroup), 1) + self.assertEqual(len(helm_values_autoscale_nodegroup), 1) + self.assertEqual(len(helm_values_non_autoscale_nodegroup), 1) + + # Check default node group values + ng = helm_values_default_nodegroup[0] + self.assertEqual(ng["autoscale"], "true") + self.assertEqual( + ng["machineCountMin"], auto_scale_labels["min_node_count"] + ) + self.assertEqual( + ng["machineCountMax"], auto_scale_labels["max_node_count"] + ) + + # Check extra autoscaling node group values + # NOTE: CAPO doesn't support scale to zero so + # min node count should be max(1, ng.min_node_count) + ng = helm_values_autoscale_nodegroup[0] + self.assertEqual(ng["autoscale"], "true") + self.assertEqual( + ng["machineCountMin"], max(1, auto_scale_nodegroup.min_node_count) + ) + self.assertEqual( + ng["machineCountMax"], auto_scale_nodegroup.max_node_count + ) + + # Check extra non-autoscaling node group values + ng = helm_values_non_autoscale_nodegroup[0] + self.assertEqual(ng.get("autoscale"), None) + self.assertEqual(ng.get("machineCountMin"), None) + self.assertEqual(ng.get("machineCountMax"), None) diff --git a/releasenotes/notes/autoscaling-all-worker-groups-8c7a07d0f56ac2ca.yaml b/releasenotes/notes/autoscaling-all-worker-groups-8c7a07d0f56ac2ca.yaml new file mode 100644 index 0000000..3ef9a24 --- /dev/null 
+++ b/releasenotes/notes/autoscaling-all-worker-groups-8c7a07d0f56ac2ca.yaml @@ -0,0 +1,10 @@ +--- +features: + - | + Adds support for autoscaling of non-default worker node groups. The min/max + node count properties on each node group are passed to the autoscaler to + determine autoscaling behaviour. If the target cluster has the + `auto_scaling_enabled` cluster label set to `true` then any node groups with + both `min_node_count` and `max_node_count` properties set will be allowed + to autoscale between these min and max node counts depending on the cluster's + current workload.