Improve default min/max node count behaviour when autoscaling is enabled

This change fixes an issue where creating a cluster with 2 or more
nodes in the default worker group and with `auto_scaling_enabled=true`
but with neither `{min,max}_node_count` labels set would lead to a
situation where the default worker group would only have 1 node instead
of 2.

The reason for this seems to be that the `min_node_count` and
`max_node_count` values which are passed through to CAPI take priority
over the initial `node_count` field and the min count currently defaults
to 1.

In these cases, I think the expected user behaviour is to have
`node_count` == `min_node_count` == `max_node_count` and for autoscaling
to be disabled on the default node group.

Change-Id: I65e65bf64c97cf5ab659d6d7d158c32692a63348
This commit is contained in:
sd109 2024-07-16 16:52:01 +01:00
parent 957ce61b3c
commit 318b1269ef
3 changed files with 85 additions and 14 deletions

View File

@ -593,7 +593,7 @@ class Driver(driver.Driver):
cluster, nodegroup
)
auto_scale_args = {}
if auto_scale and max_nodes is not None:
if auto_scale and min_nodes != max_nodes:
auto_scale_args["autoscale"] = "true"
auto_scale_args["machineCountMin"] = min_nodes
auto_scale_args["machineCountMax"] = max_nodes
@ -646,22 +646,37 @@ class Driver(driver.Driver):
def _get_node_counts(self, cluster, nodegroup):
# ClusterAPI provider OpenStack (CAPO) doesn't
# support scale to zero yet
min_nodes = max(1, nodegroup.min_node_count)
max_nodes = nodegroup.max_node_count
# NOTE(scott): In CAPI MachineDeployment resources created by the
# capi-helm-charts, the `replicas` field is omitted when autoscaling
# is enabled (since we're relinquishing control over node count to
# the autoscaler) so if a user creates a nodegroup where only
# node_count is provided and min/max are not, we need to be careful
# about setting the default min/max values to equal node_count.
min_nodes = nodegroup.node_count
max_nodes = nodegroup.node_count
if nodegroup.min_node_count is not None:
min_nodes = nodegroup.min_node_count
if nodegroup.max_node_count is not None:
max_nodes = nodegroup.max_node_count
# If min/max node counts are not defined on the default
# worker group then fall back to equivalent cluster labels
if self._is_default_worker_nodegroup(cluster, nodegroup):
# Magnum seems to set min_node_count = 1 on default group
# but we still want to be able to override that with labels
if min_nodes is None or min_nodes == 1:
min_nodes = self._get_label_int(cluster, "min_node_count", 1)
if not max_nodes:
max_nodes = self._get_label_int(
cluster, "max_node_count", min_nodes
)
# NOTE(scott): Magnum seems to set min_node_count = 1
# on the default group so treat this as if it were None
if nodegroup.min_node_count == 1:
min_nodes = nodegroup.node_count
# We still want to be able to override the default node
# group values with labels for consistent behaviour with
# Magnum Heat driver.
min_nodes = self._get_label_int(
cluster, "min_node_count", min_nodes
)
max_nodes = self._get_label_int(
cluster, "max_node_count", max_nodes
)
return min_nodes, max_nodes
@ -675,6 +690,8 @@ class Driver(driver.Driver):
)
if min_nodes is not None:
# ClusterAPI Provider OpenStack (CAPO)
# doesn't support scale to zero yet.
if min_nodes < 1:
raise exception.NodeGroupInvalidInput(
message="Min node count must be greater than "

View File

@ -2922,7 +2922,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
id=456,
name="autoscale-group",
node_count=1,
min_node_count=0,
min_node_count=1,
max_node_count=3,
)
self.cluster_obj.nodegroups.append(auto_scale_nodegroup)
@ -3143,3 +3143,49 @@ class ClusterAPIDriverTest(base.DbTestCase):
],
disk_size_configuration_value,
)
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_nodegroup_node_count_min_max_equal(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Autoscaling labels without min/max counts disable group autoscale.

    Verifies that the Helm values rendered for the default worker
    nodegroup contain no autoscale/machineCountMin/machineCountMax keys
    and fall back to a fixed machineCount.
    """
    # When autoscaling is enabled but no min/max node counts are
    # provided for the default node group, we want autoscaling to
    # be disabled on the default node group.
    self.cluster_obj.labels = {"auto_scaling_enabled": "true"}
    # (image id, kube version, os) tuple expected by the driver.
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )
    self.driver.create_cluster(self.context, self.cluster_obj, 10)
    # Fourth positional arg of install_or_upgrade carries the Helm values.
    helm_install_values = mock_install.call_args[0][3]
    helm_values_default_ng = next(
        ng
        for ng in helm_install_values["nodeGroups"]
        if ng["name"] == self.cluster_obj.default_ng_worker.name
    )
    # No autoscale keys should be emitted for the default group...
    self.assertEqual(helm_values_default_ng.get("autoscale"), None)
    self.assertEqual(helm_values_default_ng.get("machineCountMin"), None)
    self.assertEqual(helm_values_default_ng.get("machineCountMax"), None)
    # ...only a fixed replica count (presumably the fixture's default
    # worker group node_count is 3 — confirm against base.DbTestCase).
    self.assertEqual(helm_values_default_ng.get("machineCount"), 3)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue where creating a cluster with 2 or more
nodes in the default worker group and with `auto_scaling_enabled=true`
but with neither `{min,max}_node_count` labels set would lead
to a situation where the default worker group would only have 1
node instead of 2.