Merge "Allow autoscaling of non-default node groups"

2024-05-28 10:30:02 +00:00 · 2024-05-28 10:30:02 +00:00 · 1f1edec328
commit 1f1edec328
parent c5e8e14ba8 2300100462
3 changed files with 243 additions and 31 deletions
--- a/magnum_capi_helm/driver.py
+++ b/magnum_capi_helm/driver.py
@ -584,30 +584,20 @@ class Driver(driver.Driver):
    def _get_autoheal_enabled(self, cluster):
        return self._get_label_bool(cluster, "auto_healing_enabled", True)

-    def _get_autoscale(self, cluster, nodegroup):
-        auto_scale = self._get_label_bool(
-            cluster, "auto_scaling_enabled", False
+    def _get_autoscale_enabled(self, cluster):
+        return self._get_label_bool(cluster, "auto_scaling_enabled", False)
+
+    def _get_autoscale_values(self, cluster, nodegroup):
+        auto_scale = self._get_autoscale_enabled(cluster)
+        min_nodes, max_nodes = self._validate_allowed_node_counts(
+            cluster, nodegroup
        )
-        if auto_scale:
-            auto_scale_args = dict(autoscale="true")
-            min_nodes = max(1, nodegroup.min_node_count)
-            max_nodes = self._get_label_int(
-                cluster, "max_node_count", min_nodes
-            )
-            if min_nodes > nodegroup.node_count:
-                raise exception.MagnumException(
-                    message="min_node_count must be less than or equal to "
-                    "default-worker nodegroup node_count."
-                )
+        auto_scale_args = {}
+        if auto_scale and max_nodes is not None:
+            auto_scale_args["autoscale"] = "true"
            auto_scale_args["machineCountMin"] = min_nodes
-            if max_nodes < min_nodes:
-                raise exception.MagnumException(
-                    message="max_node_count must be greater than or "
-                    "equal to min_node_count"
-                )
            auto_scale_args["machineCountMax"] = max_nodes
-            return auto_scale_args
-        return auto_scale
+        return auto_scale_args

    def _get_k8s_keystone_auth_enabled(self, cluster):
        return self._get_label_bool(cluster, "keystone_auth_enabled", False)
@ -651,6 +641,58 @@ class Driver(driver.Driver):
            f"Minimum {CONF.capi_helm.minimum_flavor_ram} MB required."
        )

+    def _is_default_worker_nodegroup(self, cluster, nodegroup):
+        return cluster.default_ng_worker.id == nodegroup.id
+
+    def _get_node_counts(self, cluster, nodegroup):
+
+        # ClusterAPI provider OpenStack (CAPO) doesn't
+        # support scale to zero yet
+        min_nodes = max(1, nodegroup.min_node_count)
+        max_nodes = nodegroup.max_node_count
+
+        # If min/max node counts are not defined on the default
+        # worker group then fall back to equivalent cluster labels
+        if self._is_default_worker_nodegroup(cluster, nodegroup):
+            # Magnum seems to set min_node_count = 1 on default group
+            # but we still want to be able to override that with labels
+            if min_nodes is None or min_nodes == 1:
+                min_nodes = self._get_label_int(cluster, "min_node_count", 1)
+            if not max_nodes:
+                max_nodes = self._get_label_int(
+                    cluster, "max_node_count", min_nodes
+                )
+
+        return min_nodes, max_nodes
+
+    def _validate_allowed_node_counts(self, cluster, nodegroup):
+        min_nodes, max_nodes = self._get_node_counts(cluster, nodegroup)
+
+        LOG.debug(
+            f"Checking if node group {nodegroup.name} has valid "
+            f"node count parameters (count, min, max) = "
+            f"{(nodegroup.node_count, min_nodes, max_nodes)}"
+        )
+
+        if min_nodes is not None:
+            if min_nodes < 1:
+                raise exception.NodeGroupInvalidInput(
+                    message="Min node count must be greater than "
+                    "or equal to 1 for all node groups."
+                )
+            if min_nodes > nodegroup.node_count:
+                raise exception.NodeGroupInvalidInput(
+                    message="Min node count must be less than "
+                    "or equal to current node count"
+                )
+            if max_nodes is not None and max_nodes < min_nodes:
+                raise exception.NodeGroupInvalidInput(
+                    message="Max node count must be greater than "
+                    "or equal to min node count"
+                )
+
+        return min_nodes, max_nodes
+
    def _get_csi_cinder_availability_zone(self, cluster):
        return self._label(
            cluster,
@ -763,13 +805,9 @@ class Driver(driver.Driver):
                    machineFlavor=ng.flavor_id,
                    machineCount=ng.node_count,
                )
-                # Assume first nodegroup is default-worker.
-                if not nodegroup_set:
-                    auto_scale = self._get_autoscale(cluster, ng)
-                    if auto_scale:
-                        nodegroup_item = helm.mergeconcat(
-                            nodegroup_item, auto_scale
-                        )
+                if self._get_autoscale_enabled(cluster):
+                    values = self._get_autoscale_values(cluster, ng)
+                    nodegroup_item = helm.mergeconcat(nodegroup_item, values)
                nodegroup_set.append(nodegroup_item)
        return nodegroup_set

@ -1048,6 +1086,8 @@ class Driver(driver.Driver):
    def create_nodegroup(self, context, cluster, nodegroup):
        nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS
        self._validate_allowed_flavor(context, nodegroup.flavor_id)
+        if self._get_autoscale_enabled(cluster):
+            self._validate_allowed_node_counts(cluster, nodegroup)
        nodegroup.save()

        self._update_helm_release(context, cluster)
@ -1055,6 +1095,8 @@ class Driver(driver.Driver):
    def update_nodegroup(self, context, cluster, nodegroup):
        nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
        self._validate_allowed_flavor(context, nodegroup.flavor_id)
+        if self._get_autoscale_enabled(cluster):
+            self._validate_allowed_node_counts(cluster, nodegroup)
        nodegroup.save()

        self._update_helm_release(context, cluster)
--- a/magnum_capi_helm/tests/test_driver.py
+++ b/magnum_capi_helm/tests/test_driver.py
@ -10,6 +10,7 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 from unittest import mock
+from uuid import uuid4

 from magnum.common import exception
 from magnum.common import neutron
@ -2702,7 +2703,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
        )
        self.assertRaises(
            exception.MagnumException,
-            self.driver._get_autoscale,
+            self.driver._get_autoscale_values,
            self.cluster_obj,
            self.cluster_obj.nodegroups[0],
        )
@ -2755,10 +2756,9 @@ class ClusterAPIDriverTest(base.DbTestCase):
            helm_install_values["nodeGroups"][0]["autoscale"],
            auto_scale_labels["auto_scaling_enabled"],
        )
-        # min_node_count is hardcode to max(1, ng.min_node_count)
        self.assertEqual(
            helm_install_values["nodeGroups"][0]["machineCountMin"],
-            self.cluster_obj.nodegroups[0].min_node_count,
+            auto_scale_labels["min_node_count"],
        )
        self.assertEqual(
            helm_install_values["nodeGroups"][0]["machineCountMax"],
@ -2821,3 +2821,163 @@ class ClusterAPIDriverTest(base.DbTestCase):
            "machineCountMax",
            helm_install_values["nodeGroups"][0],
        )
+
+    @mock.patch.object(
+        driver.Driver,
+        "_storageclass_definitions",
+        return_value=mock.ANY,
+    )
+    @mock.patch.object(driver.Driver, "_validate_allowed_flavor")
+    @mock.patch.object(neutron, "get_network", autospec=True)
+    @mock.patch.object(
+        driver.Driver, "_ensure_certificate_secrets", autospec=True
+    )
+    @mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
+    @mock.patch.object(kubernetes.Client, "load", autospec=True)
+    @mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
+    @mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
+    def test_nodegroup_node_count_validation(
+        self,
+        mock_install,
+        mock_image,
+        mock_load,
+        mock_appcred,
+        mock_certs,
+        mock_get_net,
+        mock_validate_allowed_flavor,
+        mock_storageclasses,
+    ):
+        auto_scale_labels = dict(
+            auto_scaling_enabled="true", min_node_count=2, max_node_count=6
+        )
+        self.cluster_obj.labels = auto_scale_labels
+
+        mock_image.return_value = (
+            "imageid1",
+            "1.27.4",
+            "ubuntu",
+        )
+
+        # Create cluster with extra node groups
+        ng = obj_utils.create_test_nodegroup(
+            self.context, min_node_count=3, max_node_count=2
+        )
+        self.cluster_obj.nodegroups.append(ng)
+        self.assertRaises(
+            exception.NodeGroupInvalidInput,
+            self.driver.create_cluster,
+            self.context,
+            self.cluster_obj,
+            "not-used",
+        )
+
+    @mock.patch.object(
+        driver.Driver,
+        "_storageclass_definitions",
+        return_value=mock.ANY,
+    )
+    @mock.patch.object(driver.Driver, "_validate_allowed_flavor")
+    @mock.patch.object(neutron, "get_network", autospec=True)
+    @mock.patch.object(
+        driver.Driver, "_ensure_certificate_secrets", autospec=True
+    )
+    @mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
+    @mock.patch.object(kubernetes.Client, "load", autospec=True)
+    @mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
+    @mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
+    def test_create_extra_nodegroup_auto_scale_values(
+        self,
+        mock_install,
+        mock_image,
+        mock_load,
+        mock_appcred,
+        mock_certs,
+        mock_get_net,
+        mock_validate_allowed_flavor,
+        mock_storageclasses,
+    ):
+        auto_scale_labels = dict(
+            auto_scaling_enabled="true", min_node_count=2, max_node_count=6
+        )
+        self.cluster_obj.labels = auto_scale_labels
+
+        mock_image.return_value = (
+            "imageid1",
+            "1.27.4",
+            "ubuntu",
+        )
+
+        # Create cluster with extra node groups
+        non_auto_scale_nodegroup = obj_utils.create_test_nodegroup(
+            self.context,
+            uuid=uuid4(),
+            id=123,
+            name="non-autoscale-group",
+            node_count=1,
+        )
+        self.cluster_obj.nodegroups.append(non_auto_scale_nodegroup)
+        auto_scale_nodegroup = obj_utils.create_test_nodegroup(
+            self.context,
+            uuid=uuid4(),
+            id=456,
+            name="autoscale-group",
+            node_count=1,
+            min_node_count=0,
+            max_node_count=3,
+        )
+        self.cluster_obj.nodegroups.append(auto_scale_nodegroup)
+        self.driver.create_cluster(self.context, self.cluster_obj, 10)
+        for ng in self.cluster_obj.nodegroups:
+            print(ng)
+
+        # Unpack some values for asserting against
+        helm_install_values = mock_install.call_args[0][3]
+        helm_node_groups = helm_install_values["nodeGroups"]
+        helm_values_default_nodegroup = [
+            ng
+            for ng in helm_node_groups
+            if ng["name"] == self.cluster_obj.default_ng_worker.name
+        ]
+        helm_values_autoscale_nodegroup = [
+            ng
+            for ng in helm_node_groups
+            if ng["name"] == auto_scale_nodegroup.name
+        ]
+        helm_values_non_autoscale_nodegroup = [
+            ng
+            for ng in helm_node_groups
+            if ng["name"] == non_auto_scale_nodegroup.name
+        ]
+
+        # Check that node groups were passed into Helm values correctly
+        self.assertEqual(len(helm_values_default_nodegroup), 1)
+        self.assertEqual(len(helm_values_autoscale_nodegroup), 1)
+        self.assertEqual(len(helm_values_non_autoscale_nodegroup), 1)
+
+        # Check default node group values
+        ng = helm_values_default_nodegroup[0]
+        self.assertEqual(ng["autoscale"], "true")
+        self.assertEqual(
+            ng["machineCountMin"], auto_scale_labels["min_node_count"]
+        )
+        self.assertEqual(
+            ng["machineCountMax"], auto_scale_labels["max_node_count"]
+        )
+
+        # Check extra autoscaling node group values
+        # NOTE: CAPO doesn't support scale to zero so
+        # min node count should be max(1, ng.min_node_count)
+        ng = helm_values_autoscale_nodegroup[0]
+        self.assertEqual(ng["autoscale"], "true")
+        self.assertEqual(
+            ng["machineCountMin"], max(1, auto_scale_nodegroup.min_node_count)
+        )
+        self.assertEqual(
+            ng["machineCountMax"], auto_scale_nodegroup.max_node_count
+        )
+
+        # Check extra non-autoscaling node group values
+        ng = helm_values_non_autoscale_nodegroup[0]
+        self.assertEqual(ng.get("autoscale"), None)
+        self.assertEqual(ng.get("machineCountMin"), None)
+        self.assertEqual(ng.get("machineCountMax"), None)
--- a/releasenotes/notes/autoscaling-all-worker-groups-8c7a07d0f56ac2ca.yaml
+++ b/releasenotes/notes/autoscaling-all-worker-groups-8c7a07d0f56ac2ca.yaml
@ -0,0 +1,10 @@
+---
+features:
+  - |
+    Adds support for autoscaling of non-default worker node groups. The min/max
+    node count properties on each node group are passed to the autoscaler to
+    determine autoscaling behaviour. If the target cluster has the
+    `auto_scaling_enabled` cluster label set to `true` then any node groups with
+    both `min_node_count`` and `max_node_count` properties set will be allowed
+    to autoscale between these min and max node counts depending on the cluster's
+    current workload.