Allow autoscaling of non-default node groups

This change allows min/max node counts for all node groups to be passed through to the Cluster API Helm values so that the autoscaler will handle them correctly. It also adds some additional validation of node counts during node group creation.

Change-Id: I3f6ea5faefdd9affa2aad89dda7d70bbc890f970
This commit is contained in:
sd109 2024-05-20 10:38:57 +01:00
parent f5881c9388
commit 2300100462
3 changed files with 243 additions and 31 deletions

View File

@ -578,30 +578,20 @@ class Driver(driver.Driver):
def _get_autoheal_enabled(self, cluster):
return self._get_label_bool(cluster, "auto_healing_enabled", True)
def _get_autoscale(self, cluster, nodegroup):
auto_scale = self._get_label_bool(
cluster, "auto_scaling_enabled", False
def _get_autoscale_enabled(self, cluster):
    """Return True if the cluster's 'auto_scaling_enabled' label is set.

    Defaults to False when the label is absent.
    """
    return self._get_label_bool(cluster, "auto_scaling_enabled", False)
def _get_autoscale_values(self, cluster, nodegroup):
auto_scale = self._get_autoscale_enabled(cluster)
min_nodes, max_nodes = self._validate_allowed_node_counts(
cluster, nodegroup
)
if auto_scale:
auto_scale_args = dict(autoscale="true")
min_nodes = max(1, nodegroup.min_node_count)
max_nodes = self._get_label_int(
cluster, "max_node_count", min_nodes
)
if min_nodes > nodegroup.node_count:
raise exception.MagnumException(
message="min_node_count must be less than or equal to "
"default-worker nodegroup node_count."
)
auto_scale_args = {}
if auto_scale and max_nodes is not None:
auto_scale_args["autoscale"] = "true"
auto_scale_args["machineCountMin"] = min_nodes
if max_nodes < min_nodes:
raise exception.MagnumException(
message="max_node_count must be greater than or "
"equal to min_node_count"
)
auto_scale_args["machineCountMax"] = max_nodes
return auto_scale_args
return auto_scale
return auto_scale_args
def _get_k8s_keystone_auth_enabled(self, cluster):
return self._get_label_bool(cluster, "keystone_auth_enabled", False)
@ -645,6 +635,58 @@ class Driver(driver.Driver):
f"Minimum {CONF.capi_helm.minimum_flavor_ram} MB required."
)
def _is_default_worker_nodegroup(self, cluster, nodegroup):
return cluster.default_ng_worker.id == nodegroup.id
def _get_node_counts(self, cluster, nodegroup):
    """Resolve the effective (min_nodes, max_nodes) for *nodegroup*.

    For the default worker group, min/max fall back to the
    'min_node_count' / 'max_node_count' cluster labels when not set
    on the group itself; other groups use only their own properties.

    :returns: tuple of (min_nodes, max_nodes); max_nodes may be None
        when not configured anywhere.
    """
    # ClusterAPI provider OpenStack (CAPO) doesn't
    # support scale to zero yet
    min_nodes = max(1, nodegroup.min_node_count)
    max_nodes = nodegroup.max_node_count
    # If min/max node counts are not defined on the default
    # worker group then fall back to equivalent cluster labels
    if self._is_default_worker_nodegroup(cluster, nodegroup):
        # Magnum seems to set min_node_count = 1 on default group
        # but we still want to be able to override that with labels
        # NOTE(review): max(1, None) would raise TypeError above, so
        # the "min_nodes is None" guard looks unreachable — confirm
        # whether min_node_count can ever be None here.
        if min_nodes is None or min_nodes == 1:
            min_nodes = self._get_label_int(cluster, "min_node_count", 1)
        if not max_nodes:
            # Default the max to the resolved min when no label is set
            max_nodes = self._get_label_int(
                cluster, "max_node_count", min_nodes
            )
    return min_nodes, max_nodes
def _validate_allowed_node_counts(self, cluster, nodegroup):
    """Validate *nodegroup*'s node count configuration for autoscaling.

    Resolves the effective min/max node counts (including label
    fallbacks for the default worker group) and checks that they are
    mutually consistent with the group's current node count.

    :returns: tuple of (min_nodes, max_nodes); max_nodes may be None.
    :raises exception.NodeGroupInvalidInput: if min < 1, min exceeds
        the current node count, or max < min.
    """
    min_nodes, max_nodes = self._get_node_counts(cluster, nodegroup)
    LOG.debug(
        f"Checking if node group {nodegroup.name} has valid "
        f"node count parameters (count, min, max) = "
        f"{(nodegroup.node_count, min_nodes, max_nodes)}"
    )
    if min_nodes is not None:
        if min_nodes < 1:
            raise exception.NodeGroupInvalidInput(
                message="Min node count must be greater than "
                "or equal to 1 for all node groups."
            )
        if min_nodes > nodegroup.node_count:
            raise exception.NodeGroupInvalidInput(
                message="Min node count must be less than "
                "or equal to current node count"
            )
        # Only comparable when both bounds are set
        if max_nodes is not None and max_nodes < min_nodes:
            raise exception.NodeGroupInvalidInput(
                message="Max node count must be greater than "
                "or equal to min node count"
            )
    return min_nodes, max_nodes
def _get_csi_cinder_availability_zone(self, cluster):
return self._label(
cluster,
@ -757,13 +799,9 @@ class Driver(driver.Driver):
machineFlavor=ng.flavor_id,
machineCount=ng.node_count,
)
# Assume first nodegroup is default-worker.
if not nodegroup_set:
auto_scale = self._get_autoscale(cluster, ng)
if auto_scale:
nodegroup_item = helm.mergeconcat(
nodegroup_item, auto_scale
)
if self._get_autoscale_enabled(cluster):
values = self._get_autoscale_values(cluster, ng)
nodegroup_item = helm.mergeconcat(nodegroup_item, values)
nodegroup_set.append(nodegroup_item)
return nodegroup_set
@ -1037,6 +1075,8 @@ class Driver(driver.Driver):
def create_nodegroup(self, context, cluster, nodegroup):
    """Create a node group by updating the cluster's Helm release.

    Validates the requested flavor and, when autoscaling is enabled
    on the cluster, the group's min/max node counts before saving.
    """
    nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    self._validate_allowed_flavor(context, nodegroup.flavor_id)
    # Min/max consistency only matters when the autoscaler will use them
    if self._get_autoscale_enabled(cluster):
        self._validate_allowed_node_counts(cluster, nodegroup)
    nodegroup.save()
    self._update_helm_release(context, cluster)
@ -1044,6 +1084,8 @@ class Driver(driver.Driver):
def update_nodegroup(self, context, cluster, nodegroup):
    """Update a node group by updating the cluster's Helm release.

    Validates the requested flavor and, when autoscaling is enabled
    on the cluster, the group's min/max node counts before saving.
    """
    nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
    self._validate_allowed_flavor(context, nodegroup.flavor_id)
    # Min/max consistency only matters when the autoscaler will use them
    if self._get_autoscale_enabled(cluster):
        self._validate_allowed_node_counts(cluster, nodegroup)
    nodegroup.save()
    self._update_helm_release(context, cluster)

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from unittest import mock
from uuid import uuid4
from magnum.common import exception
from magnum.common import neutron
@ -2702,7 +2703,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
)
self.assertRaises(
exception.MagnumException,
self.driver._get_autoscale,
self.driver._get_autoscale_values,
self.cluster_obj,
self.cluster_obj.nodegroups[0],
)
@ -2755,10 +2756,9 @@ class ClusterAPIDriverTest(base.DbTestCase):
helm_install_values["nodeGroups"][0]["autoscale"],
auto_scale_labels["auto_scaling_enabled"],
)
# min_node_count is hardcoded to max(1, ng.min_node_count)
self.assertEqual(
helm_install_values["nodeGroups"][0]["machineCountMin"],
self.cluster_obj.nodegroups[0].min_node_count,
auto_scale_labels["min_node_count"],
)
self.assertEqual(
helm_install_values["nodeGroups"][0]["machineCountMax"],
@ -2821,3 +2821,163 @@ class ClusterAPIDriverTest(base.DbTestCase):
"machineCountMax",
helm_install_values["nodeGroups"][0],
)
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_nodegroup_node_count_validation(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Cluster creation rejects a node group with max < min node count."""
    auto_scale_labels = dict(
        auto_scaling_enabled="true", min_node_count=2, max_node_count=6
    )
    self.cluster_obj.labels = auto_scale_labels
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )
    # Create cluster with extra node groups
    # (max_node_count=2 < min_node_count=3 is deliberately invalid)
    ng = obj_utils.create_test_nodegroup(
        self.context, min_node_count=3, max_node_count=2
    )
    self.cluster_obj.nodegroups.append(ng)
    self.assertRaises(
        exception.NodeGroupInvalidInput,
        self.driver.create_cluster,
        self.context,
        self.cluster_obj,
        "not-used",
    )
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_create_extra_nodegroup_auto_scale_values(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Autoscale Helm values are set per node group.

    The default worker group takes min/max from cluster labels, an
    extra group with min/max set gets its own autoscale values, and a
    group without min/max set gets no autoscale values at all.
    """
    auto_scale_labels = dict(
        auto_scaling_enabled="true", min_node_count=2, max_node_count=6
    )
    self.cluster_obj.labels = auto_scale_labels
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )
    # Create cluster with extra node groups
    non_auto_scale_nodegroup = obj_utils.create_test_nodegroup(
        self.context,
        uuid=uuid4(),
        id=123,
        name="non-autoscale-group",
        node_count=1,
    )
    self.cluster_obj.nodegroups.append(non_auto_scale_nodegroup)
    auto_scale_nodegroup = obj_utils.create_test_nodegroup(
        self.context,
        uuid=uuid4(),
        id=456,
        name="autoscale-group",
        node_count=1,
        min_node_count=0,
        max_node_count=3,
    )
    self.cluster_obj.nodegroups.append(auto_scale_nodegroup)
    self.driver.create_cluster(self.context, self.cluster_obj, 10)
    # Unpack some values for asserting against
    helm_install_values = mock_install.call_args[0][3]
    helm_node_groups = helm_install_values["nodeGroups"]
    helm_values_default_nodegroup = [
        ng
        for ng in helm_node_groups
        if ng["name"] == self.cluster_obj.default_ng_worker.name
    ]
    helm_values_autoscale_nodegroup = [
        ng
        for ng in helm_node_groups
        if ng["name"] == auto_scale_nodegroup.name
    ]
    helm_values_non_autoscale_nodegroup = [
        ng
        for ng in helm_node_groups
        if ng["name"] == non_auto_scale_nodegroup.name
    ]
    # Check that node groups were passed into Helm values correctly
    self.assertEqual(len(helm_values_default_nodegroup), 1)
    self.assertEqual(len(helm_values_autoscale_nodegroup), 1)
    self.assertEqual(len(helm_values_non_autoscale_nodegroup), 1)
    # Check default node group values
    ng = helm_values_default_nodegroup[0]
    self.assertEqual(ng["autoscale"], "true")
    self.assertEqual(
        ng["machineCountMin"], auto_scale_labels["min_node_count"]
    )
    self.assertEqual(
        ng["machineCountMax"], auto_scale_labels["max_node_count"]
    )
    # Check extra autoscaling node group values
    # NOTE: CAPO doesn't support scale to zero so
    # min node count should be max(1, ng.min_node_count)
    ng = helm_values_autoscale_nodegroup[0]
    self.assertEqual(ng["autoscale"], "true")
    self.assertEqual(
        ng["machineCountMin"], max(1, auto_scale_nodegroup.min_node_count)
    )
    self.assertEqual(
        ng["machineCountMax"], auto_scale_nodegroup.max_node_count
    )
    # Check extra non-autoscaling node group values
    ng = helm_values_non_autoscale_nodegroup[0]
    self.assertEqual(ng.get("autoscale"), None)
    self.assertEqual(ng.get("machineCountMin"), None)
    self.assertEqual(ng.get("machineCountMax"), None)

View File

@ -0,0 +1,10 @@
---
features:
- |
Adds support for autoscaling of non-default worker node groups. The min/max
node count properties on each node group are passed to the autoscaler to
determine autoscaling behaviour. If the target cluster has the
`auto_scaling_enabled` cluster label set to `true` then any node groups with
both `min_node_count` and `max_node_count` properties set will be allowed
to autoscale between these min and max node counts depending on the cluster's
current workload.