Fix 'secondary' and 'rehome-pending' subclouds stuck at 'online'

This commit makes 'rehome-pending' subclouds auditable while they are
still online (endpoint audits are still skipped, as the subcloud is
unmanaged). Secondary subclouds are still not audited, but their
availability status will be automatically set to 'offline' when their
deploy status is set to 'secondary'.

In the original design, 'secondary' and 'rehome-pending' subclouds are
not supposed to be audited. This creates an issue where the subclouds
get stuck with the 'online' availability status, preventing the user
from being able to delete them.

This commit also fixes an issue where it was not possible to set the
endpoint status to 'unknown' for 'secondary' subclouds.

Test Plan:
1. PASS - Run 'dcmanager subcloud unmanage --migrate' for an online
          subcloud and verify that:
          - All endpoint statuses were set to 'unknown';
          - Subcloud was still audited, but each endpoint audit was
            skipped;
          - After turning off the subcloud, its availability-status
            changed to 'offline' and the audits started being skipped.
2. PASS - Manage back the rehome-pending subcloud, verifying that:
          - It initially becomes 'managed' while still 'offline';
          - Audit starts running again, eventually setting the
            availability-status back to 'online';
          - Endpoint statuses started becoming 'in-sync' again.
3. PASS - Set the subcloud to 'rehome-pending', and then set it to
          'secondary'; verify that the subcloud becomes 'offline' and
          that audits are skipped (all endpoint statuses should be set
          to 'unknown').

Closes-Bug: 2047439

Change-Id: Ia21faf469aacee6f70e5b4fe6471b019ae057e13
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
This commit is contained in:
Gustavo Herzmann 2023-12-26 09:49:07 -03:00
parent e0a09c9860
commit 244df2ed78
3 changed files with 50 additions and 21 deletions

View File

@ -125,10 +125,13 @@ class SubcloudAuditWorkerManager(manager.Manager):
consts.DEPLOY_STATE_UPGRADE_ACTIVATED,
consts.DEPLOY_STATE_RESTORING,
consts.DEPLOY_STATE_RESTORE_PREP_FAILED,
consts.DEPLOY_STATE_RESTORE_FAILED]
consts.DEPLOY_STATE_RESTORE_FAILED,
consts.DEPLOY_STATE_REHOME_PENDING]
and not prestage.is_deploy_status_prestage(
subcloud.deploy_status)) or (
subcloud.deploy_status == consts.DEPLOY_STATE_INSTALLING and
(subcloud.deploy_status in [
consts.DEPLOY_STATE_INSTALLING,
consts.DEPLOY_STATE_REHOME_PENDING]) and
subcloud.availability_status == dccommon_consts.AVAILABILITY_OFFLINE):
LOG.debug("Skip subcloud %s audit, deploy_status: %s" %
(subcloud.name, subcloud.deploy_status))

View File

@ -2709,7 +2709,8 @@ class SubcloudManager(manager.Manager):
raise exceptions.BadRequest(resource="subcloud", msg=msg)
if (subcloud.availability_status !=
dccommon_consts.AVAILABILITY_ONLINE):
dccommon_consts.AVAILABILITY_ONLINE) and (
subcloud.deploy_status != consts.DEPLOY_STATE_REHOME_PENDING):
LOG.warning(f"Subcloud {subcloud.name} is not online")
raise exceptions.SubcloudNotOnline()
@ -2885,13 +2886,23 @@ class SubcloudManager(manager.Manager):
# set all endpoint statuses to unknown, except the dc-cert
# endpoint which continues to be audited for unmanaged
# subclouds
ignore_endpoints = [dccommon_consts.ENDPOINT_TYPE_DC_CERT]
# Do not ignore the dc-cert endpoint for secondary or rehome
# pending subclouds as cert-mon does not audit them
if subcloud.deploy_status in (
consts.DEPLOY_STATE_SECONDARY,
consts.DEPLOY_STATE_REHOME_PENDING
):
ignore_endpoints = None
self.state_rpc_client.update_subcloud_endpoint_status_sync(
context,
subcloud_name=subcloud.name,
subcloud_region=subcloud.region_name,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN,
ignore_endpoints=[dccommon_consts.ENDPOINT_TYPE_DC_CERT])
ignore_endpoints=ignore_endpoints)
elif management_state == dccommon_consts.MANAGEMENT_MANAGED:
# Subcloud is managed
# Tell cert-mon to audit endpoint certificate
@ -2899,16 +2910,15 @@ class SubcloudManager(manager.Manager):
dc_notification = dcmanager_rpc_client.DCManagerNotifications()
dc_notification.subcloud_managed(context, subcloud.region_name)
# Set all endpoint statuses to unknown, no endpoint
# will be audited for secondary or rehome-pending subclouds
if subcloud.deploy_status in (consts.DEPLOY_STATE_SECONDARY,
consts.DEPLOY_STATE_REHOME_PENDING):
self.state_rpc_client.update_subcloud_endpoint_status_sync(
# Request the state client to update the subcloud availability
# status to OFFLINE if subcloud is 'secondary'. The state
# service will set all endpoint statuses to 'unknown'.
if deploy_status == consts.DEPLOY_STATE_SECONDARY:
self.state_rpc_client.update_subcloud_availability(
context,
subcloud_name=subcloud.name,
subcloud_region=subcloud.region_name,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN)
subcloud.name,
subcloud.region_name,
dccommon_consts.AVAILABILITY_OFFLINE)
# Clear existing fault alarm of secondary subcloud
if subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY:

View File

@ -294,7 +294,8 @@ class SubcloudStateManager(manager.Manager):
# Rules for updating sync status:
#
# Skip audit any 'secondary' state subclouds
# For secondary subclouds, only update if the new sync_status is
# 'unknown'
#
# For others, always update if not in-sync.
#
@ -308,11 +309,22 @@ class SubcloudStateManager(manager.Manager):
# This means if a subcloud is going offline or unmanaged, then
# the sync status update must be done first.
#
if ((sync_status != dccommon_consts.SYNC_STATUS_IN_SYNC or
((subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE) and
(subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED
or endpoint_type == dccommon_consts.ENDPOINT_TYPE_DC_CERT))) and
subcloud.deploy_status != consts.DEPLOY_STATE_SECONDARY):
is_in_sync = sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC
is_online = subcloud.availability_status == \
dccommon_consts.AVAILABILITY_ONLINE
is_managed = subcloud.management_state == \
dccommon_consts.MANAGEMENT_MANAGED
is_endpoint_type_dc_cert = endpoint_type == \
dccommon_consts.ENDPOINT_TYPE_DC_CERT
is_secondary = subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY
is_sync_unknown = sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
is_secondary_and_sync_unknown = is_secondary and is_sync_unknown
if (
(not is_in_sync
or (is_online and (is_managed or is_endpoint_type_dc_cert)))
and not is_secondary
) or is_secondary_and_sync_unknown:
# update a single subcloud
try:
self._do_update_subcloud_endpoint_status(context,
@ -379,7 +391,8 @@ class SubcloudStateManager(manager.Manager):
'subcloud: %s' % subcloud_name)
def _raise_or_clear_subcloud_status_alarm(self, subcloud_name,
availability_status):
availability_status,
deploy_status=None):
entity_instance_id = "subcloud=%s" % subcloud_name
fault = self.fm_api.get_fault(
fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
@ -394,8 +407,11 @@ class SubcloudStateManager(manager.Manager):
LOG.exception("Failed to clear offline alarm for subcloud: %s",
subcloud_name)
# Raise the alarm if the subcloud became offline and it's not a
# secondary subcloud
elif not fault and \
(availability_status == dccommon_consts.AVAILABILITY_OFFLINE):
(availability_status == dccommon_consts.AVAILABILITY_OFFLINE and
deploy_status != consts.DEPLOY_STATE_SECONDARY):
try:
fault = fm_api.Fault(
alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,