Merge "Allow autoscaling of non-default node groups"

This commit is contained in:
Zuul 2024-05-28 10:30:02 +00:00 committed by Gerrit Code Review
commit 1f1edec328
3 changed files with 243 additions and 31 deletions

View File

@ -584,30 +584,20 @@ class Driver(driver.Driver):
def _get_autoheal_enabled(self, cluster):
return self._get_label_bool(cluster, "auto_healing_enabled", True)
def _get_autoscale(self, cluster, nodegroup):
auto_scale = self._get_label_bool(
cluster, "auto_scaling_enabled", False
def _get_autoscale_enabled(self, cluster):
return self._get_label_bool(cluster, "auto_scaling_enabled", False)
def _get_autoscale_values(self, cluster, nodegroup):
auto_scale = self._get_autoscale_enabled(cluster)
min_nodes, max_nodes = self._validate_allowed_node_counts(
cluster, nodegroup
)
if auto_scale:
auto_scale_args = dict(autoscale="true")
min_nodes = max(1, nodegroup.min_node_count)
max_nodes = self._get_label_int(
cluster, "max_node_count", min_nodes
)
if min_nodes > nodegroup.node_count:
raise exception.MagnumException(
message="min_node_count must be less than or equal to "
"default-worker nodegroup node_count."
)
auto_scale_args = {}
if auto_scale and max_nodes is not None:
auto_scale_args["autoscale"] = "true"
auto_scale_args["machineCountMin"] = min_nodes
if max_nodes < min_nodes:
raise exception.MagnumException(
message="max_node_count must be greater than or "
"equal to min_node_count"
)
auto_scale_args["machineCountMax"] = max_nodes
return auto_scale_args
return auto_scale
return auto_scale_args
def _get_k8s_keystone_auth_enabled(self, cluster):
return self._get_label_bool(cluster, "keystone_auth_enabled", False)
@ -651,6 +641,58 @@ class Driver(driver.Driver):
f"Minimum {CONF.capi_helm.minimum_flavor_ram} MB required."
)
def _is_default_worker_nodegroup(self, cluster, nodegroup):
return cluster.default_ng_worker.id == nodegroup.id
def _get_node_counts(self, cluster, nodegroup):
# ClusterAPI provider OpenStack (CAPO) doesn't
# support scale to zero yet
min_nodes = max(1, nodegroup.min_node_count)
max_nodes = nodegroup.max_node_count
# If min/max node counts are not defined on the default
# worker group then fall back to equivalent cluster labels
if self._is_default_worker_nodegroup(cluster, nodegroup):
# Magnum seems to set min_node_count = 1 on default group
# but we still want to be able to override that with labels
if min_nodes is None or min_nodes == 1:
min_nodes = self._get_label_int(cluster, "min_node_count", 1)
if not max_nodes:
max_nodes = self._get_label_int(
cluster, "max_node_count", min_nodes
)
return min_nodes, max_nodes
def _validate_allowed_node_counts(self, cluster, nodegroup):
    """Validate and return the min/max node counts of a node group.

    The counts are obtained from ``_get_node_counts`` and checked
    against each other and against ``nodegroup.node_count``.

    :param cluster: cluster owning the node group.
    :param nodegroup: node group whose counts are validated.
    :returns: tuple ``(min_nodes, max_nodes)`` as computed by
        ``_get_node_counts``.
    :raises exception.NodeGroupInvalidInput: if the minimum is below 1,
        the minimum exceeds the current node count, or the maximum is
        below the minimum.
    """
    min_nodes, max_nodes = self._get_node_counts(cluster, nodegroup)

    # Lazy logger arguments: the message is only rendered when debug
    # logging is enabled. The rendered text is unchanged.
    LOG.debug(
        "Checking if node group %s has valid "
        "node count parameters (count, min, max) = %s",
        nodegroup.name,
        (nodegroup.node_count, min_nodes, max_nodes),
    )

    if min_nodes is not None:
        if min_nodes < 1:
            raise exception.NodeGroupInvalidInput(
                message="Min node count must be greater than "
                "or equal to 1 for all node groups."
            )
        if min_nodes > nodegroup.node_count:
            raise exception.NodeGroupInvalidInput(
                message="Min node count must be less than "
                "or equal to current node count"
            )
        # Only compare max against min when a min exists; comparing
        # an int with None would raise a TypeError.
        if max_nodes is not None and max_nodes < min_nodes:
            raise exception.NodeGroupInvalidInput(
                message="Max node count must be greater than "
                "or equal to min node count"
            )
    return min_nodes, max_nodes
def _get_csi_cinder_availability_zone(self, cluster):
return self._label(
cluster,
@ -763,13 +805,9 @@ class Driver(driver.Driver):
machineFlavor=ng.flavor_id,
machineCount=ng.node_count,
)
# Assume first nodegroup is default-worker.
if not nodegroup_set:
auto_scale = self._get_autoscale(cluster, ng)
if auto_scale:
nodegroup_item = helm.mergeconcat(
nodegroup_item, auto_scale
)
if self._get_autoscale_enabled(cluster):
values = self._get_autoscale_values(cluster, ng)
nodegroup_item = helm.mergeconcat(nodegroup_item, values)
nodegroup_set.append(nodegroup_item)
return nodegroup_set
@ -1048,6 +1086,8 @@ class Driver(driver.Driver):
def create_nodegroup(self, context, cluster, nodegroup):
    """Validate a new node group, save it and update the Helm release.

    The node group status is set to ``CREATE_IN_PROGRESS`` first. The
    flavor is always validated; node counts are validated only when
    autoscaling is enabled on the cluster.
    """
    nodegroup.status = fields.ClusterStatus.CREATE_IN_PROGRESS
    self._validate_allowed_flavor(context, nodegroup.flavor_id)

    autoscaling_enabled = self._get_autoscale_enabled(cluster)
    if autoscaling_enabled:
        self._validate_allowed_node_counts(cluster, nodegroup)

    nodegroup.save()
    self._update_helm_release(context, cluster)
@ -1055,6 +1095,8 @@ class Driver(driver.Driver):
def update_nodegroup(self, context, cluster, nodegroup):
    """Validate an updated node group, save it and update Helm.

    The node group status is set to ``UPDATE_IN_PROGRESS`` first. The
    flavor is always validated; node counts are validated only when
    autoscaling is enabled on the cluster.
    """
    nodegroup.status = fields.ClusterStatus.UPDATE_IN_PROGRESS
    self._validate_allowed_flavor(context, nodegroup.flavor_id)

    autoscaling_enabled = self._get_autoscale_enabled(cluster)
    if autoscaling_enabled:
        self._validate_allowed_node_counts(cluster, nodegroup)

    nodegroup.save()
    self._update_helm_release(context, cluster)

View File

@ -10,6 +10,7 @@
# License for the specific language governing permissions and limitations
# under the License.
from unittest import mock
from uuid import uuid4
from magnum.common import exception
from magnum.common import neutron
@ -2702,7 +2703,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
)
self.assertRaises(
exception.MagnumException,
self.driver._get_autoscale,
self.driver._get_autoscale_values,
self.cluster_obj,
self.cluster_obj.nodegroups[0],
)
@ -2755,10 +2756,9 @@ class ClusterAPIDriverTest(base.DbTestCase):
helm_install_values["nodeGroups"][0]["autoscale"],
auto_scale_labels["auto_scaling_enabled"],
)
# min_node_count is hardcode to max(1, ng.min_node_count)
self.assertEqual(
helm_install_values["nodeGroups"][0]["machineCountMin"],
self.cluster_obj.nodegroups[0].min_node_count,
auto_scale_labels["min_node_count"],
)
self.assertEqual(
helm_install_values["nodeGroups"][0]["machineCountMax"],
@ -2821,3 +2821,163 @@ class ClusterAPIDriverTest(base.DbTestCase):
"machineCountMax",
helm_install_values["nodeGroups"][0],
)
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_nodegroup_node_count_validation(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """create_cluster rejects a node group with invalid node counts.

    With autoscaling enabled, an extra node group created with
    ``min_node_count=3`` and ``max_node_count=2`` must make
    ``create_cluster`` raise ``NodeGroupInvalidInput``.
    """
    self.cluster_obj.labels = {
        "auto_scaling_enabled": "true",
        "min_node_count": 2,
        "max_node_count": 6,
    }
    mock_image.return_value = ("imageid1", "1.27.4", "ubuntu")

    # Create cluster with an extra node group whose bounds are invalid
    invalid_nodegroup = obj_utils.create_test_nodegroup(
        self.context, min_node_count=3, max_node_count=2
    )
    self.cluster_obj.nodegroups.append(invalid_nodegroup)

    with self.assertRaises(exception.NodeGroupInvalidInput):
        self.driver.create_cluster(
            self.context, self.cluster_obj, "not-used"
        )
@mock.patch.object(
    driver.Driver,
    "_storageclass_definitions",
    return_value=mock.ANY,
)
@mock.patch.object(driver.Driver, "_validate_allowed_flavor")
@mock.patch.object(neutron, "get_network", autospec=True)
@mock.patch.object(
    driver.Driver, "_ensure_certificate_secrets", autospec=True
)
@mock.patch.object(driver.Driver, "_create_appcred_secret", autospec=True)
@mock.patch.object(kubernetes.Client, "load", autospec=True)
@mock.patch.object(driver.Driver, "_get_image_details", autospec=True)
@mock.patch.object(helm.Client, "install_or_upgrade", autospec=True)
def test_create_extra_nodegroup_auto_scale_values(
    self,
    mock_install,
    mock_image,
    mock_load,
    mock_appcred,
    mock_certs,
    mock_get_net,
    mock_validate_allowed_flavor,
    mock_storageclasses,
):
    """Check the autoscale Helm values emitted for each node group.

    The cluster has autoscaling enabled via labels and gets two extra
    node groups: one without min/max counts and one with both set.
    The values passed to the mocked Helm install are then inspected
    per node group.
    """
    auto_scale_labels = dict(
        auto_scaling_enabled="true", min_node_count=2, max_node_count=6
    )
    self.cluster_obj.labels = auto_scale_labels
    mock_image.return_value = (
        "imageid1",
        "1.27.4",
        "ubuntu",
    )

    # Create cluster with extra node groups
    non_auto_scale_nodegroup = obj_utils.create_test_nodegroup(
        self.context,
        uuid=uuid4(),
        id=123,
        name="non-autoscale-group",
        node_count=1,
    )
    self.cluster_obj.nodegroups.append(non_auto_scale_nodegroup)
    auto_scale_nodegroup = obj_utils.create_test_nodegroup(
        self.context,
        uuid=uuid4(),
        id=456,
        name="autoscale-group",
        node_count=1,
        min_node_count=0,
        max_node_count=3,
    )
    self.cluster_obj.nodegroups.append(auto_scale_nodegroup)

    self.driver.create_cluster(self.context, self.cluster_obj, 10)

    # Unpack some values for asserting against
    helm_install_values = mock_install.call_args[0][3]
    helm_node_groups = helm_install_values["nodeGroups"]

    def helm_values_for(name):
        # All Helm node group entries carrying the given name.
        return [ng for ng in helm_node_groups if ng["name"] == name]

    helm_values_default_nodegroup = helm_values_for(
        self.cluster_obj.default_ng_worker.name
    )
    helm_values_autoscale_nodegroup = helm_values_for(
        auto_scale_nodegroup.name
    )
    helm_values_non_autoscale_nodegroup = helm_values_for(
        non_auto_scale_nodegroup.name
    )

    # Check that node groups were passed into Helm values correctly
    self.assertEqual(len(helm_values_default_nodegroup), 1)
    self.assertEqual(len(helm_values_autoscale_nodegroup), 1)
    self.assertEqual(len(helm_values_non_autoscale_nodegroup), 1)

    # Check default node group values
    ng = helm_values_default_nodegroup[0]
    self.assertEqual(ng["autoscale"], "true")
    self.assertEqual(
        ng["machineCountMin"], auto_scale_labels["min_node_count"]
    )
    self.assertEqual(
        ng["machineCountMax"], auto_scale_labels["max_node_count"]
    )

    # Check extra autoscaling node group values
    # NOTE: CAPO doesn't support scale to zero so
    # min node count should be max(1, ng.min_node_count)
    ng = helm_values_autoscale_nodegroup[0]
    self.assertEqual(ng["autoscale"], "true")
    self.assertEqual(
        ng["machineCountMin"], max(1, auto_scale_nodegroup.min_node_count)
    )
    self.assertEqual(
        ng["machineCountMax"], auto_scale_nodegroup.max_node_count
    )

    # Check extra non-autoscaling node group values
    ng = helm_values_non_autoscale_nodegroup[0]
    self.assertIsNone(ng.get("autoscale"))
    self.assertIsNone(ng.get("machineCountMin"))
    self.assertIsNone(ng.get("machineCountMax"))

View File

@ -0,0 +1,10 @@
---
features:
- |
Adds support for autoscaling of non-default worker node groups. The min/max
node count properties on each node group are passed to the autoscaler to
determine autoscaling behaviour. If the target cluster has the
`auto_scaling_enabled` cluster label set to `true` then any node groups with
both `min_node_count` and `max_node_count` properties set will be allowed
to autoscale between these min and max node counts depending on the cluster's
current workload.