Improve default min/max node count behaviour when autoscaling is enabled

This change fixes an issue where creating a cluster with 2 or more
nodes in the default worker group and with `auto_scaling_enabled=true`
but with neither `{min,max}_node_count` labels set would lead to a
situation where the default worker group would only have 1 node instead
of 2.

The reason for this seems to be that the `min_node_count` and
`max_node_count` values which are passed through to CAPI take priority
over the initial `node_count` field and the min count currently defaults
to 1.

In these cases, I think the expected user behaviour is to have
`node_count` == `min_node_count` == `max_node_count` and for autoscaling
to be disabled on the default node group.

Change-Id: I65e65bf64c97cf5ab659d6d7d158c32692a63348
This commit is contained in:
sd109 2024-07-16 16:52:01 +01:00
parent 957ce61b3c
commit 318b1269ef
3 changed files with 85 additions and 14 deletions

View File

@ -593,7 +593,7 @@ class Driver(driver.Driver):
cluster, nodegroup
)
auto_scale_args = {}
if auto_scale and max_nodes is not None:
if auto_scale and min_nodes != max_nodes:
auto_scale_args["autoscale"] = "true"
auto_scale_args["machineCountMin"] = min_nodes
auto_scale_args["machineCountMax"] = max_nodes
@ -646,22 +646,37 @@ class Driver(driver.Driver):
def _get_node_counts(self, cluster, nodegroup):
# ClusterAPI provider OpenStack (CAPO) doesn't
# support scale to zero yet
min_nodes = max(1, nodegroup.min_node_count)
max_nodes = nodegroup.max_node_count
# NOTE(scott): In CAPI MachineDeployment resources created by the
# capi-helm-charts, the `replicas` field is omitted when autoscaling
# is enabled (since we're relinquishing control over node count to
# the autoscaler) so if a user creates a nodegroup where only
# node_count is provided and min/max are not, we need to be careful
# about setting the default min/max values to equal node_count.
min_nodes = nodegroup.node_count
max_nodes = nodegroup.node_count
if nodegroup.min_node_count is not None:
min_nodes = nodegroup.min_node_count
if nodegroup.max_node_count is not None:
max_nodes = nodegroup.max_node_count
# If min/max node counts are not defined on the default
# worker group then fall back to equivalent cluster labels
if self._is_default_worker_nodegroup(cluster, nodegroup):
# Magnum seems to set min_node_count = 1 on default group
# but we still want to be able to override that with labels
if min_nodes is None or min_nodes == 1:
min_nodes = self._get_label_int(cluster, "min_node_count", 1)
if not max_nodes:
max_nodes = self._get_label_int(
cluster, "max_node_count", min_nodes
)
# NOTE(scott): Magnum seems to set min_node_count = 1
# on the default group so treat this as if it were None
if nodegroup.min_node_count == 1:
min_nodes = nodegroup.node_count
# We still want to be able to override the default node
# group values with labels for consistent behaviour with
# Magnum Heat driver.
min_nodes = self._get_label_int(
cluster, "min_node_count", min_nodes
)
max_nodes = self._get_label_int(
cluster, "max_node_count", max_nodes
)
return min_nodes, max_nodes
@ -675,6 +690,8 @@ class Driver(driver.Driver):
)
if min_nodes is not None:
# ClusterAPI Provider OpenStack (CAPO)
# doesn't support scale to zero yet.
if min_nodes < 1:
raise exception.NodeGroupInvalidInput(
message="Min node count must be greater than "

View File

@ -2922,7 +2922,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
id=456,
name="autoscale-group",
node_count=1,
min_node_count=0,
min_node_count=1,
max_node_count=3,
)
self.cluster_obj.nodegroups.append(auto_scale_nodegroup)
@ -3143,3 +3143,49 @@ class ClusterAPIDriverTest(base.DbTestCase):
],
disk_size_configuration_value,
)
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_nodegroup_node_count_min_max_equal(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Autoscaling labels without min/max counts disable group autoscale.

    Verifies that the Helm values rendered for the default worker
    nodegroup contain no autoscale/machineCountMin/machineCountMax keys
    and fall back to a fixed machineCount.
    """
    # When autoscaling is enabled but no min/max node counts are
    # provided for the default node group, we want autoscaling to
    # be disabled on the default node group.
    self.cluster_obj.labels = {"auto_scaling_enabled": "true"}
    # (image id, kube version, os) tuple expected by the driver.
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )
    self.driver.create_cluster(self.context, self.cluster_obj, 10)
    # Fourth positional arg of install_or_upgrade carries the Helm values.
    helm_install_values = mock_install.call_args[0][3]
    helm_values_default_ng = next(
        ng
        for ng in helm_install_values["nodeGroups"]
        if ng["name"] == self.cluster_obj.default_ng_worker.name
    )
    # No autoscale keys should be emitted for the default group...
    self.assertEqual(helm_values_default_ng.get("autoscale"), None)
    self.assertEqual(helm_values_default_ng.get("machineCountMin"), None)
    self.assertEqual(helm_values_default_ng.get("machineCountMax"), None)
    # ...only a fixed replica count (presumably the fixture's default
    # worker group node_count is 3 — confirm against base.DbTestCase).
    self.assertEqual(helm_values_default_ng.get("machineCount"), 3)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes an issue where creating a cluster with 2 or more
nodes in the default worker group and with `auto_scaling_enabled=true`
but with neither `{min,max}_node_count` labels set would lead
to a situation where the default worker group would only have 1
node instead of 2.