Fix cluster deletion on trust creation failure

This change fixes an issue where clusters which fail early in the creation process (e.g. if the user autheticates with a restricted app cred then the trust creation fails) would then also fail to delete correctly. The issue is caused by attempting to delete the Helm release instance even when it was not created in the first place. In such cases the user would receive an error such as 'Error: uninstall: Release name is invalid: None'.

Change-Id: I0fe188f284f94f069b5116b3be882eebf1b80706
This commit is contained in:
sd109 2024-05-13 16:17:59 +01:00
parent 9e4ac320be
commit 57a9d43d6f
4 changed files with 34 additions and 20 deletions

View File

@ -336,10 +336,12 @@ class Driver(driver.Driver):
cluster.save()
def _get_capi_cluster(self, cluster):
return self._k8s_client.get_capi_cluster(
driver_utils.chart_release_name(cluster),
driver_utils.cluster_namespace(cluster),
)
release_name = driver_utils.chart_release_name(cluster)
if release_name:
return self._k8s_client.get_capi_cluster(
release_name,
driver_utils.cluster_namespace(cluster),
)
def _update_all_nodegroups_status(self, cluster):
"""Returns True if any node group still in progress."""
@ -373,14 +375,18 @@ class Driver(driver.Driver):
# TODO(mkjpryor) Add a timeout for create/update/delete
# Update the cluster API address if it is known
# so users can get their coe credentials
capi_cluster = self._get_capi_cluster(cluster)
self._update_cluster_api_address(cluster, capi_cluster)
# Update the nodegroups first
# to ensure API never returns an inconsistent state
nodegroups_in_progress = self._update_all_nodegroups_status(cluster)
if capi_cluster:
# Update the cluster API address if it is known
# so users can get their coe credentials
self._update_cluster_api_address(cluster, capi_cluster)
# Update the nodegroups first
# to ensure API never returns an inconsistent state
nodegroups_in_progress = self._update_all_nodegroups_status(
cluster
)
if cluster.status in {
fields.ClusterStatus.CREATE_IN_PROGRESS,
@ -927,14 +933,19 @@ class Driver(driver.Driver):
ng.status = fields.ClusterStatus.DELETE_IN_PROGRESS
ng.save()
# Begin the deletion of the cluster resources by uninstalling the
# Helm release
# Note that this just marks the resources for deletion - it does not
# wait for the resources to be deleted
self._helm_client.uninstall_release(
driver_utils.chart_release_name(cluster),
namespace=driver_utils.cluster_namespace(cluster),
)
release_name = driver_utils.chart_release_name(cluster)
# Only attempt deletion of CAPI resources if they were created in
# the first place e.g. if trust creation fails during cluster create
# then no CAPI resources will have been created.
if release_name:
# Begin the deletion of the cluster resources by uninstalling the
# Helm release.
# Note that this just marks the resources for deletion,
# it does not wait for the resources to be deleted.
self._helm_client.uninstall_release(
release_name,
namespace=driver_utils.cluster_namespace(cluster),
)
def resize_cluster(
self,

View File

@ -83,6 +83,7 @@ class Client:
version: t.Optional[str] = None,
) -> t.Iterable[t.Dict[str, t.Any]]:
"""Install or upgrade specified release using chart and values."""
assert release_name is not None
command = [
"upgrade",
release_name,
@ -117,6 +118,7 @@ class Client:
namespace: str,
):
"""Uninstall the named release."""
assert release_name is not None
command = [
"uninstall",
release_name,

View File

@ -216,6 +216,7 @@ class Resource:
If the object is not found, None is returned.
"""
assert self.namespaced == bool(namespace)
assert name is not None
response = self.client.get(self.prepare_path(name, namespace))
if 200 <= response.status_code < 300:
return response.json()

View File

@ -93,7 +93,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
self.driver.update_cluster_status(self.context, self.cluster_obj)
mock_ng.assert_called_once_with(self.cluster_obj)
mock_ng.assert_not_called()
mock_update.assert_not_called()
mock_delete.assert_not_called()
@ -126,7 +126,7 @@ class ClusterAPIDriverTest(base.DbTestCase):
self.driver.update_cluster_status(self.context, self.cluster_obj)
mock_ng.assert_called_once_with(self.cluster_obj)
mock_ng.assert_not_called()
mock_update.assert_not_called()
mock_delete.assert_called_once_with(self.context, self.cluster_obj)