diff --git a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py index b02ee364e..bb98d7508 100644 --- a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py +++ b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py @@ -10,6 +10,7 @@ from requests_toolbelt import MultipartEncoder from dccommon import consts from dccommon.drivers import base from dccommon import exceptions +from dcmanager.db.sqlalchemy import models LOG = log.getLogger(__name__) @@ -27,6 +28,7 @@ class DcmanagerClient(base.DriverBase): timeout=DCMANAGER_CLIENT_REST_DEFAULT_TIMEOUT, endpoint_type=consts.KS_ENDPOINT_PUBLIC, endpoint=None, + peer: models.SystemPeer = None, ): if endpoint is None: endpoint = session.get_endpoint( @@ -35,6 +37,7 @@ class DcmanagerClient(base.DriverBase): self.endpoint = endpoint self.timeout = timeout self.session = session + self.peer = peer def get_system_peer(self, system_peer_uuid): """Get system peer.""" diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index aa2cc994b..eca9ff868 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -1076,8 +1076,11 @@ class SubcloudsController(object): systemcontroller_gateway_address.split(",")[0] != subcloud.systemcontroller_gateway_ip ): + # Pass bootstrap_values_dict for patch operations where these + # values aren't in payload, unlike subcloud add where they are. + # Function needs management/admin_subnet for validation psd_common.validate_systemcontroller_gateway_address( - systemcontroller_gateway_address, payload + systemcontroller_gateway_address, bootstrap_values_dict ) management_state = payload.get("management-state") diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index da798e3a7..3e4ab0b88 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -541,27 +541,27 @@ def system_peer_create( ) -def system_peer_get(context, peer_id): +def system_peer_get(context, peer_id) -> models.SystemPeer: """Retrieve a system_peer or raise if it does not exist.""" return IMPL.system_peer_get(context, peer_id) -def system_peer_get_by_uuid(context, uuid): +def system_peer_get_by_uuid(context, uuid) -> models.SystemPeer: """Retrieve a system_peer by uuid or raise if it does not exist.""" return IMPL.system_peer_get_by_uuid(context, uuid) -def system_peer_get_by_name(context, uuid): +def system_peer_get_by_name(context, uuid) -> models.SystemPeer: """Retrieve a system_peer by name or raise if it does not exist.""" return IMPL.system_peer_get_by_name(context, uuid) -def system_peer_get_all(context): +def system_peer_get_all(context) -> list[models.SystemPeer]: """Retrieve all system peers.""" return IMPL.system_peer_get_all(context) -def peer_group_get_for_system_peer(context, peer_id): +def peer_group_get_for_system_peer(context, peer_id) -> list[models.SubcloudPeerGroup]: """Get subcloud peer groups associated with a system peer.""" return IMPL.peer_group_get_for_system_peer(context, peer_id) @@ -656,22 +656,24 @@ def subcloud_peer_group_destroy(context, group_id): return IMPL.subcloud_peer_group_destroy(context, group_id) -def subcloud_peer_group_get(context, group_id): +def subcloud_peer_group_get(context, group_id) -> models.SubcloudPeerGroup: """Retrieve a subcloud_peer_group or raise if it does not exist.""" return 
IMPL.subcloud_peer_group_get(context, group_id) -def subcloud_peer_group_get_by_name(context, name): +def subcloud_peer_group_get_by_name(context, name) -> models.SubcloudPeerGroup: """Retrieve a subcloud_peer_group by name or raise if it does not exist.""" return IMPL.subcloud_peer_group_get_by_name(context, name) -def subcloud_peer_group_get_by_leader_id(context, system_leader_id): +def subcloud_peer_group_get_by_leader_id( + context, system_leader_id +) -> list[models.SubcloudPeerGroup]: """Retrieve subcloud peer groups by system_leader_id.""" return IMPL.subcloud_peer_group_get_by_leader_id(context, system_leader_id) -def subcloud_get_for_peer_group(context, group_id): +def subcloud_get_for_peer_group(context, group_id) -> list[models.Subcloud]: """Retrieve all subclouds belonging to a subcloud_peer_group or raise if it does not exist. @@ -679,7 +681,7 @@ def subcloud_get_for_peer_group(context, group_id): return IMPL.subcloud_get_for_peer_group(context, group_id) -def subcloud_peer_group_get_all(context): +def subcloud_peer_group_get_all(context) -> list[models.SubcloudPeerGroup]: """Retrieve all subcloud peer groups.""" return IMPL.subcloud_peer_group_get_all(context) @@ -765,31 +767,35 @@ def peer_group_association_destroy(context, id): return IMPL.peer_group_association_destroy(context, id) -def peer_group_association_get(context, id): +def peer_group_association_get(context, id) -> models.PeerGroupAssociation: """Retrieve a peer_group_association or raise if it does not exist.""" return IMPL.peer_group_association_get(context, id) -def peer_group_association_get_all(context): +def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]: """Retrieve all peer_group_associations.""" return IMPL.peer_group_association_get_all(context) def peer_group_association_get_by_peer_group_and_system_peer_id( context, peer_group_id, system_peer_id -): +) -> list[models.PeerGroupAssociation]: """Get peer group associations by peer_group_id and system_peer_id.""" return IMPL.peer_group_association_get_by_peer_group_and_system_peer_id( context, peer_group_id, system_peer_id ) -def peer_group_association_get_by_peer_group_id(context, peer_group_id): +def peer_group_association_get_by_peer_group_id( + context, peer_group_id +) -> list[models.PeerGroupAssociation]: """Get the peer_group_association list by peer_group_id""" return IMPL.peer_group_association_get_by_peer_group_id(context, peer_group_id) -def peer_group_association_get_by_system_peer_id(context, system_peer_id): +def peer_group_association_get_by_system_peer_id( + context, system_peer_id +) -> list[models.PeerGroupAssociation]: """Get the peer_group_association list by system_peer_id""" return IMPL.peer_group_association_get_by_system_peer_id(context, system_peer_id) diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 39f40a35d..46dfe1a3e 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1647,7 +1647,7 @@ def peer_group_association_destroy(context, association_id): @require_context -def peer_group_association_get(context, association_id): +def peer_group_association_get(context, association_id) -> models.PeerGroupAssociation: try: result = ( model_query(context, models.PeerGroupAssociation) @@ -1666,7 +1666,7 @@ def peer_group_association_get(context, association_id): @require_context -def peer_group_association_get_all(context): +def peer_group_association_get_all(context) -> 
list[models.PeerGroupAssociation]: result = ( model_query(context, models.PeerGroupAssociation) .filter_by(deleted=0) @@ -1682,7 +1682,7 @@ def peer_group_association_get_all(context): @require_context def peer_group_association_get_by_peer_group_and_system_peer_id( context, peer_group_id, system_peer_id -): +) -> models.PeerGroupAssociation: try: result = ( model_query(context, models.PeerGroupAssociation) @@ -1705,7 +1705,9 @@ def peer_group_association_get_by_peer_group_and_system_peer_id( @require_context -def peer_group_association_get_by_peer_group_id(context, peer_group_id): +def peer_group_association_get_by_peer_group_id( + context, peer_group_id +) -> models.PeerGroupAssociation: result = ( model_query(context, models.PeerGroupAssociation) .filter_by(deleted=0) @@ -1718,7 +1720,9 @@ def peer_group_association_get_by_peer_group_id(context, peer_group_id): @require_context -def peer_group_association_get_by_system_peer_id(context, system_peer_id): +def peer_group_association_get_by_system_peer_id( + context, system_peer_id +) -> models.PeerGroupAssociation: result = ( model_query(context, models.PeerGroupAssociation) .filter_by(deleted=0) diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py index c312218f8..fb67ae8a6 100644 --- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py +++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py @@ -3,8 +3,10 @@ # # SPDX-License-Identifier: Apache-2.0 # - +from __future__ import annotations +import json import threading +from typing import TYPE_CHECKING from fm_api import constants as fm_const from fm_api import fm_api @@ -12,14 +14,20 @@ from oslo_config import cfg from oslo_log import log as logging from dccommon import consts as dccommon_consts +from dccommon.drivers.openstack.dcmanager_v1 import DcmanagerClient from dcmanager.common import consts from dcmanager.common import context from dcmanager.common.i18n import _ from dcmanager.common import manager from dcmanager.common import utils from dcmanager.db import api as db_api +from dcmanager.db.sqlalchemy import models from dcmanager.manager.system_peer_manager import SystemPeerManager +# Use TYPE_CHECKING to avoid circular import +if TYPE_CHECKING: + from dcmanager.manager.subcloud_manager import SubcloudManager + CONF = cfg.CONF LOG = logging.getLogger(__name__) @@ -27,7 +35,9 @@ LOG = logging.getLogger(__name__) class PeerGroupAuditManager(manager.Manager): """Manages audit related tasks.""" - def __init__(self, subcloud_manager, peer_group_id, *args, **kwargs): + def __init__( + self, subcloud_manager: SubcloudManager, peer_group_id: int, *args, **kwargs + ): LOG.debug(_("PeerGroupAuditManager initialization...")) super().__init__(service_name="peer_group_audit_manager", *args, **kwargs) self.context = context.get_admin_context() @@ -52,8 +62,8 @@ class PeerGroupAuditManager(manager.Manager): @staticmethod def _get_association_sync_status_from_peer_site( - dc_client, system_peer, peer_group_id - ): + dc_client: DcmanagerClient, system_peer: models.SystemPeer, peer_group_id: int + ) -> str: try: # Get peer site system peer dc_peer_system_peer = dc_client.get_system_peer( @@ -81,8 +91,11 @@ class PeerGroupAuditManager(manager.Manager): ) def _get_local_subclouds_to_update_and_delete( - self, local_peer_group, remote_subclouds, remote_sync_status - ): + self, + local_peer_group: models.SubcloudPeerGroup, + remote_subclouds: list[dict], + remote_sync_status: str, + ) -> 
tuple[list[models.Subcloud], list[models.Subcloud], bool]: local_subclouds_to_update = list() local_subclouds_to_delete = list() any_rehome_failed = False @@ -112,8 +125,13 @@ class PeerGroupAuditManager(manager.Manager): # indicating any bootstrap values/address updates to # the subcloud on the remote site. if remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: + LOG.info( + "Peer association is out-of-sync, syncing rehome " + f"data of subcloud '{local_subcloud.name}' from " + "peer to current site" + ) self._sync_rehome_data( - local_subcloud.id, remote_subcloud.get("rehome_data") + local_subcloud, remote_subcloud.get("rehome_data") ) elif remote_subcloud.get("deploy-status") in ( consts.DEPLOY_STATE_REHOME_FAILED, @@ -134,7 +152,7 @@ class PeerGroupAuditManager(manager.Manager): return local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed - def _set_local_subcloud_to_secondary(self, subcloud): + def _set_local_subcloud_to_secondary(self, subcloud: models.Subcloud) -> None: try: LOG.info("Set local subcloud %s to secondary" % subcloud.name) # There will be an exception when unmanage @@ -155,18 +173,42 @@ class PeerGroupAuditManager(manager.Manager): ) raise e - def _sync_rehome_data(self, subcloud_id, rehome_data): - db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data) + def _sync_rehome_data(self, subcloud: models.Subcloud, rehome_data: str) -> None: + try: + remote_rehome_data = json.loads(rehome_data) + local_rehome_data = json.loads(subcloud.rehome_data) - def audit(self, system_peer, remote_peer_group, local_peer_group): + # The systemcontroller_gateway_address can't be synced from the + # peer to the local site as it's specific to each site + remote_rehome_data["saved_payload"]["systemcontroller_gateway_address"] = ( + local_rehome_data["saved_payload"]["systemcontroller_gateway_address"] + ) + new_rehome_data = json.dumps(remote_rehome_data) + + db_api.subcloud_update( + self.context, subcloud.id, rehome_data=new_rehome_data + ) + except Exception as e: + LOG.error( + "Unable to sync rehome data of subcloud " + f"'{subcloud.name}' from peer to current site: {str(e)}" + ) + raise + + def audit( + self, + system_peer: models.SystemPeer, + remote_peer_group: dict, + local_peer_group: models.SubcloudPeerGroup, + ) -> None: if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING: LOG.info("Local peer group in migrating state, quit audit") return LOG.info( - "Auditing remote subcloud peer group:[%s] migration_status:[%s] " - "group_priority[%s], local subcloud peer group:[%s] " - "migration_status:[%s] group_priority[%s]" + "Auditing remote subcloud peer group: [%s], migration_status: [%s], " + "group_priority: [%s], local subcloud peer group: [%s], " + "migration_status: [%s], group_priority: [%s]" % ( remote_peer_group.get("peer_group_name"), remote_peer_group.get("migration_status"), @@ -277,8 +319,8 @@ class PeerGroupAuditManager(manager.Manager): ) LOG.exception( f"Failed to delete local subcloud [{subcloud.name}] that does " - "not exist under the same subcloud_peer_group on peer site, " - f"err: {e}" + "not exist under the same subcloud_peer_group on peer site " + f"{system_peer.peer_name}, err: {e}" ) raise e @@ -313,7 +355,12 @@ class PeerGroupAuditManager(manager.Manager): # If remote peer group migration_status is 'None' self.require_audit_flag = False - def _clear_or_raise_alarm(self, system_peer, local_peer_group, remote_peer_group): + def _clear_or_raise_alarm( + self, + system_peer: models.SystemPeer, + 
local_peer_group: models.SubcloudPeerGroup, + remote_peer_group: dict, + ) -> None: # If local subcloud peer group's group_priority is # lower than remote subcloud peer group's group_priority, # an alarm will be raised. @@ -325,7 +372,7 @@ class PeerGroupAuditManager(manager.Manager): if local_peer_group.group_priority < remote_peer_group.get("group_priority"): LOG.warning( f"Alarm: local subcloud peer group [{local_peer_group.peer_group_name}]" - f" is managed by remote system [{system_peer.peer_name}]" + f" is managed by remote system peer [{system_peer.peer_name}]" ) try: fault = fm_api.Fault( @@ -336,7 +383,7 @@ class PeerGroupAuditManager(manager.Manager): severity=fm_const.FM_ALARM_SEVERITY_MAJOR, reason_text=( "Subcloud peer group (peer_group_name=%s) is managed by " - "remote system (peer_uuid=%s) with a lower priority." + "remote system peer (peer_uuid=%s) with a lower priority." % (local_peer_group.peer_group_name, system_peer.peer_uuid) ), alarm_type=fm_const.FM_ALARM_TYPE_0, @@ -375,12 +422,12 @@ class PeerGroupAuditManager(manager.Manager): try: self.audit(system_peer, remote_peer_group, local_peer_group) except Exception as e: - LOG.exception("audit error occured: %s" % e) + LOG.exception("audit error occurred: %s" % e) def stop(self): if self.thread: self.thread.join() - LOG.info(f"stopped peer group {self.peer_group_id} audit thread") + LOG.info(f"Stopped peer group {self.peer_group_id} audit thread") else: LOG.info(f"No peer group {self.peer_group_id} audit thread to stop") @@ -402,12 +449,14 @@ class PeerGroupAuditManager(manager.Manager): ): LOG.info( f"Audit peer group [{local_peer_group.peer_group_name}] " - f"with remote system {system_peer.peer_name}" + f"with remote system peer {system_peer.peer_name}" ) self.start(system_peer, remote_peer_group, local_peer_group) @staticmethod - def send_audit_peer_group(system_peers, peer_group): + def send_audit_peer_group( + system_peers: list[models.SystemPeer], peer_group: models.SubcloudPeerGroup + ): if not system_peers: return local_system = utils.get_local_system() diff --git a/distributedcloud/dcmanager/manager/peer_monitor_manager.py b/distributedcloud/dcmanager/manager/peer_monitor_manager.py index 9f698f94e..437fb6056 100644 --- a/distributedcloud/dcmanager/manager/peer_monitor_manager.py +++ b/distributedcloud/dcmanager/manager/peer_monitor_manager.py @@ -9,6 +9,7 @@ import threading from fm_api import constants as fm_const from fm_api import fm_api +from keystoneauth1 import exceptions as ks_exceptions from oslo_config import cfg from oslo_log import log as logging @@ -16,15 +17,58 @@ from dcmanager.common import consts from dcmanager.common import context from dcmanager.common import manager from dcmanager.db import api as db_api +from dcmanager.db.sqlalchemy import models from dcmanager.manager import peer_group_audit_manager as pgam +from dcmanager.manager.subcloud_manager import SubcloudManager from dcmanager.manager.system_peer_manager import SystemPeerManager CONF = cfg.CONF LOG = logging.getLogger(__name__) +# Upon detecting that the secondary site is REACHABLE again, the PGA sync_status +# will be set for BOTH sites by the primary site monitor thread as follows: +SECONDARY_SITE_REACHABLE_TRANSITIONS = { + # The UNKNOWN and FAILED are set when the secondary site becomes unreachable + consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, + # The SYNCING and IN_SYNC status can happen when first creating 
the peer and the
+    # association. The association creation will set the status to SYNCING/IN_SYNC,
+    # and the monitor thread might detect the peer availability after that, so we
+    # don't modify the status
+    consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
+    consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_SYNCING,
+    # A similar situation can happen with OUT_OF_SYNC, if the association/peer group
+    # is updated before the monitor thread runs its first check, so we don't modify
+    # the status
+    consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: (
+        consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
+    ),
+}
+
+# Upon detecting that the secondary site is UNREACHABLE, the PGA sync_status
+# will be set for the PRIMARY SITE ONLY by the primary site monitor thread as follows:
+SECONDARY_SITE_UNREACHABLE_TRANSITIONS = {
+    # After the secondary site becomes unreachable, the only valid sync statuses
+    # are UNKNOWN or FAILED
+    consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
+    consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_FAILED,
+    consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: consts.ASSOCIATION_SYNC_STATUS_FAILED,
+    consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
+    consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_FAILED,
+}
+
 class PeerMonitor(object):
-    def __init__(self, peer, context, subcloud_manager):
+    """Monitors and manages the connection status of a single DC system peer
+
+    The monitor runs in a separate thread and periodically checks the peer's
+    availability using heartbeat mechanisms. It handles both failure and recovery
+    scenarios, updating system states and managing fault alarms accordingly.
+    """
+
+    def __init__(
+        self, peer: models.SystemPeer, context, subcloud_manager: SubcloudManager
+    ):
         self.peer = peer
         self.thread = None
         self.exit_flag = threading.Event()
@@ -32,21 +76,22 @@ class PeerMonitor(object):
         self.context = context
         self.subcloud_manager = subcloud_manager
         self.peer_group_id_set = set()
+        self.failure_count = 0
         # key: peer_group_id
         # value: PeerGroupAuditManager object
-        self.peer_group_audit_obj_map = dict()
+        self.peer_group_audit_obj_map: dict[int, pgam.PeerGroupAuditManager] = dict()

     def _clear_failure(self):
         alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED
-        entity_instance_id = "peer=%s" % self.peer.peer_uuid
+        entity_instance_id = f"peer={self.peer.peer_uuid}"
         try:
             fault = self.fm_api.get_fault(alarm_id, entity_instance_id)
             if fault:
                 self.fm_api.clear_fault(alarm_id, entity_instance_id)
         except Exception as e:
             LOG.exception(
-                "Problem clearing fault for peer %s, alarm_id=%s error: %s"
-                % (self.peer.peer_uuid, alarm_id, e)
+                f"Problem clearing fault for peer {self.peer.peer_name}, "
+                f"alarm_id={alarm_id} error: {str(e)}"
             )

     def _raise_failure(self):
@@ -114,124 +159,165 @@
             if not dc_peer_subcloud_peer_group_list:
                 LOG.warning(
-                    "Resource subcloud peer group of dc:%s not found"
-                    % self.peer.manager_endpoint
+                    f"No subcloud peer groups found for DC peer: {self.peer.peer_name} "
+                    f"(endpoint: {self.peer.manager_endpoint})"
                 )
+        except (ks_exceptions.ConnectFailure, ks_exceptions.ConnectTimeout) as e:
+            # Use warning level for common connection failures to reduce log spam
+            LOG.warning(f"Failed to access DC peer {self.peer.peer_name}: {str(e)}")
         except Exception:
-            LOG.exception("Failed to access the dc: %s" % self.peer.peer_name)
+            LOG.exception(
+                f"Unexpected error while 
accessing DC peer {self.peer.peer_name}" + ) + return failed, dc_peer_subcloud_peer_group_list - def _update_sync_status_secondary_site_becomes_unreachable(self): + def _update_sync_status_secondary_site_becomes_unreachable(self) -> None: + """Update sync status on local site when secondary site becomes unreachable.""" # Get associations by system peer associations = SystemPeerManager.get_local_associations(self.context, self.peer) for association in associations: # If the association is not primary, skip it. if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: - LOG.debug( - "Skip update the Association sync_status as it is not primary." + LOG.info( + f"Skip updating local association (id={association.id}) " + "sync_status as it's not primary" ) continue - # If the secondary site is down, set the association sync status - # "in-sync" -> "unknown" - # "unknown" -> "unknown" - # "out-of-sync" -> "failed" - # "syncing" -> "failed" - # "failed" -> "failed" - sync_status = consts.ASSOCIATION_SYNC_STATUS_UNKNOWN - message = f"Peer site ({self.peer.peer_name}) is unreachable." - if association.sync_status not in [ - consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, - consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, - ]: - sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + + try: + new_sync_status = SECONDARY_SITE_UNREACHABLE_TRANSITIONS[ + association.sync_status + ] + except KeyError: + # This should never happen + LOG.error( + f"Unexpected sync status on association id={association.id}: " + f"{association.sync_status}. Updating the sync status to " + f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}" + ) + new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + + LOG.info( + f"Updating local association (id={association.id}) sync_status " + f"from {association.sync_status} to {new_sync_status} due " + "to secondary site becoming unreachable" + ) + db_api.peer_group_association_update( self.context, association.id, - sync_status=sync_status, - sync_message=message, + sync_status=new_sync_status, + sync_message=f"Peer site ({self.peer.peer_name}) is unreachable.", ) - def _update_sync_status_secondary_site_becomes_reachable(self): + def _update_sync_status_secondary_site_becomes_reachable(self) -> None: + """Update sync status on both sites when secondary site becomes reachable.""" # Get associations by system peer associations = SystemPeerManager.get_local_associations(self.context, self.peer) for association in associations: # If the association is not primary, skip it. if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: - LOG.debug( - "Skip update Peer Site Association sync_status as " - "current site Association is not primary." + LOG.info( + "Skip updating association sync_status on both sites as " + "current site association is not primary." ) continue - # Upon detecting that the secondary site is reachable again, - # the PGA sync_status will be set for both sites by the primary - # site monitor thread as follows: - # "unknown" -> "in-sync" - # "failed" -> "out-of-sync" - sync_status = consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC - if association.sync_status == consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: - sync_status = consts.ASSOCIATION_SYNC_STATUS_IN_SYNC + + try: + new_sync_status = SECONDARY_SITE_REACHABLE_TRANSITIONS[ + association.sync_status + ] + except KeyError: + # This should never happen + LOG.error( + f"Unexpected sync status on association id={association.id}: " + f"{association.sync_status}. 
Updating the sync status to " + f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}" + ) + new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + dc_local_pg = db_api.subcloud_peer_group_get( self.context, association.peer_group_id ) + + LOG.info( + f"Updating local association (id={association.id}) sync_status " + f"from {association.sync_status} to {new_sync_status} due " + "to secondary site becoming reachable" + ) + SystemPeerManager.update_sync_status( self.context, self.peer, - sync_status, + new_sync_status, dc_local_pg, association=association, ) - def _do_monitor_peer(self): - failure_count = 0 - LOG.info("Start monitoring thread for peer %s" % self.peer.peer_name) - UNAVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE - AVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE - # Do the actual peer monitor. + def _update_peer_state(self, state: str) -> models.SystemPeer: + """Update peer availability state in database.""" + return db_api.system_peer_update( + self.context, self.peer.id, availability_state=state + ) + + def _handle_peer_failure(self) -> None: + """Handle peer failure by raising alarms and updating states.""" + self.failure_count += 1 + if self.failure_count >= self.peer.heartbeat_failure_threshold: + LOG.warning( + f"DC peer '{self.peer.peer_name}' heartbeat failed, raising alarm" + ) + self._raise_failure() + self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE) + self._update_sync_status_secondary_site_becomes_unreachable() + self.failure_count = 0 + self._set_require_audit_flag_to_associated_peer_groups() + + def _handle_peer_recovery(self, remote_pg_list: list) -> None: + """Handle peer recovery by clearing alarms and restoring states.""" + self.failure_count = 0 + self._audit_local_peer_groups(remote_pg_list) + if ( + self.peer.availability_state + != consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE + ): + LOG.info(f"DC peer '{self.peer.peer_name}' is back online, clearing alarm") + self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE) + self._update_sync_status_secondary_site_becomes_reachable() + self._clear_failure() + + def _check_peer_status(self) -> None: + """Perform a single peer status check and handle the result.""" + self.peer = db_api.system_peer_get(self.context, self.peer.id) + failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list() + + if failed: + self._handle_peer_failure() + else: + self._handle_peer_recovery(remote_pg_list) + + def _do_monitor_peer(self) -> None: + """Monitor peer connectivity and handle state changes.""" + LOG.info(f"Start monitoring thread for peer '{self.peer.peer_name}'") + + # Run first check immediately + try: + self._check_peer_status() + except Exception as e: + LOG.exception(f"Initial check failed for peer '{self.peer.peer_name}': {e}") + + # Continue with periodic checks while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval): try: - # Get system peer from DB - self.peer = db_api.system_peer_get(self.context, self.peer.id) - failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list() - if failed: - failure_count += 1 - if failure_count >= self.peer.heartbeat_failure_threshold: - # heartbeat_failure_threshold reached. 
- LOG.warning( - "DC %s heartbeat failed, Raising alarm" - % self.peer.peer_name - ) - self._raise_failure() - db_api.system_peer_update( - self.context, - self.peer.id, - availability_state=UNAVAILABLE_STATE, - ) - # pylint: disable=line-too-long - self._update_sync_status_secondary_site_becomes_unreachable() - failure_count = 0 - self._set_require_audit_flag_to_associated_peer_groups() - else: - failure_count = 0 - self._audit_local_peer_groups(remote_pg_list) - if self.peer.availability_state != AVAILABLE_STATE: - db_api.system_peer_update( - self.context, - self.peer.id, - availability_state=AVAILABLE_STATE, - ) - # pylint: disable=line-too-long - self._update_sync_status_secondary_site_becomes_reachable() - LOG.info("DC %s back online, clear alarm" % self.peer.peer_name) - self._clear_failure() + self._check_peer_status() except Exception as e: LOG.exception( - "Got exception monitoring peer %s error: %s" - % (self.peer.peer_name, e) + f"Unexpected error monitoring peer '{self.peer.peer_name}': {e}" ) - LOG.info( - "Caught graceful exit signal for peer monitor %s" % self.peer.peer_name - ) + + LOG.info(f"Gracefully exiting monitor thread for peer '{self.peer.peer_name}'") def _audit_local_peer_groups(self, remote_pg_list): # Generate a dict index by remote peer group name @@ -266,7 +352,9 @@ class PeerMonitor(object): for pgam_obj in self.peer_group_audit_obj_map.values(): pgam_obj.require_audit_flag = True - def audit_specific_local_peer_group(self, peer_group, remote_peer_group): + def audit_specific_local_peer_group( + self, peer_group: models.SubcloudPeerGroup, remote_peer_group + ): msg = None if peer_group.id in self.peer_group_audit_obj_map: pgam_obj = self.peer_group_audit_obj_map[peer_group.id] @@ -324,17 +412,16 @@ class PeerMonitor(object): class PeerMonitorManager(manager.Manager): """Manages tasks related to peer monitor.""" - def __init__(self, subcloud_manager): + def __init__(self, subcloud_manager: SubcloudManager): LOG.debug("PeerMonitorManager initialization...") super(PeerMonitorManager, self).__init__(service_name="peer_monitor_manager") - self.peer_monitor = dict() self.context = context.get_admin_context() self.subcloud_manager = subcloud_manager # key: system_peer_id # value: PeerMonitor object - self.peer_monitor_thread_map = dict() + self.peer_monitor_thread_map: dict[int, PeerMonitor] = dict() def _remove_peer_monitor_task(self, system_peer_id): peer_mon_obj = self.peer_monitor_thread_map[system_peer_id] diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index 09dc481ce..822fd6c33 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -3674,6 +3674,11 @@ class SubcloudManager(manager.Manager): and systemcontroller_gateway_ip.split(",")[0] != subcloud.systemcontroller_gateway_ip ): + LOG.info( + f"The systemcontroller_gateway_ip for subcloud {subcloud.name} " + f"was updated from {subcloud.systemcontroller_gateway_ip} to " + f"{systemcontroller_gateway_ip.split(',')[0]}. Replacing routes..." 
+ ) m_ks_client = OpenStackDriver( region_name=dccommon_consts.DEFAULT_REGION_NAME, region_clients=None, diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py index 0270d52de..eb212f6e5 100644 --- a/distributedcloud/dcmanager/manager/system_peer_manager.py +++ b/distributedcloud/dcmanager/manager/system_peer_manager.py @@ -9,6 +9,7 @@ from contextlib import nullcontext import functools import json import tempfile +from typing import Optional from eventlet import greenpool from oslo_log import log as logging @@ -54,7 +55,9 @@ class SystemPeerManager(manager.Manager): ) @staticmethod - def get_local_associations(ctx, peer, local_pg=None): + def get_local_associations( + ctx, peer: models.SystemPeer, local_pg: models.SubcloudPeerGroup = None + ) -> list[models.PeerGroupAssociation]: if local_pg is None: # Get associations by system peer id return db_api.peer_group_association_get_by_system_peer_id(ctx, peer.id) @@ -93,8 +96,12 @@ class SystemPeerManager(manager.Manager): """ def _update_association_on_peer_site( - peer, sync_status, local_pg, remote_pg, message - ): + peer: models.SystemPeer, + sync_status: str, + local_pg: models.SubcloudPeerGroup, + remote_pg: dict, + message: str, + ) -> tuple[str, str]: try: # Get peer site dcmanager client dc_client = SystemPeerManager.get_peer_dc_client(peer) @@ -118,21 +125,21 @@ class SystemPeerManager(manager.Manager): # Update peer site association sync_status only if the # sync_status is different from the current sync_status - if dc_peer_association.get("sync_status") != sync_status: + if dc_peer_association.get("sync-status") != sync_status: # Update peer site association sync_status dc_peer_association_id = dc_peer_association.get("id") dc_client.update_peer_group_association_sync_status( dc_peer_association_id, sync_status ) LOG.info( - f"Updated Peer site {dc_peer_system_peer.get('id')} " - f"Peer Group Association {dc_peer_association_id} " - f"sync_status to {sync_status}." + f"Updated peer site '{dc_peer_system_peer.get('peer-name')}' " + f"peer group association {dc_peer_association_id} " + f"sync_status to '{sync_status}'" ) except Exception as e: message = ( - f"Failed to Update Peer Site ({peer.peer_uuid}) " - f"Association sync_status to {sync_status}." + f"Failed to update peer site '{peer.peer_name}' " + f"association sync_status to '{sync_status}'" ) LOG.exception(f"{message} Error: {e}") sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED @@ -146,9 +153,9 @@ class SystemPeerManager(manager.Manager): for association in associations: if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: - LOG.debug( - f"Skip update Peer Site association sync_status to {sync_status} " - "as current site Association is not primary." + LOG.info( + "Skipping updating peer and local site association " + f"sync_status to '{sync_status}' as local site is not primary" ) continue @@ -165,13 +172,16 @@ class SystemPeerManager(manager.Manager): association.sync_status == sync_status and sync_status != consts.ASSOCIATION_SYNC_STATUS_FAILED ): - LOG.debug( - "Skip update current site association sync_status to " - f"{sync_status} as current site Association is already " - "in the same status." 
+ LOG.info( + f"Local association {association.id} sync_status " + f"already set to '{sync_status}', skipping update" ) continue # Update primary site association sync_status + LOG.info( + f"Updating local association {association.id} sync_status " + f"from '{association.sync_status}' to '{sync_status}'" + ) db_api.peer_group_association_update( ctx, association.id, sync_status=sync_status, sync_message=message ) @@ -222,10 +232,11 @@ class SystemPeerManager(manager.Manager): dccommon_consts.SYSTEM_CONTROLLER_NAME, p_ks_client.session, endpoint=dc_endpoint, + peer=peer, ) @staticmethod - def get_peer_subcloud(dc_client, subcloud_name): + def get_peer_subcloud(dc_client: DcmanagerClient, subcloud_name: str) -> dict: """Get subcloud on peer site if exist. :param dc_client: the dcmanager client object @@ -235,7 +246,10 @@ class SystemPeerManager(manager.Manager): peer_subcloud = dc_client.get_subcloud(subcloud_name) return peer_subcloud except dccommon_exceptions.SubcloudNotFound: - LOG.warn(f"Subcloud {subcloud_name} does not exist on peer site.") + LOG.warn( + f"Subcloud '{subcloud_name}' does not exist on " + f"peer site '{dc_client.peer.peer_name}'" + ) @staticmethod def get_subcloud_deploy_status(subcloud): @@ -258,7 +272,9 @@ class SystemPeerManager(manager.Manager): return True @staticmethod - def delete_peer_secondary_subcloud(dc_client, subcloud_ref): + def delete_peer_secondary_subcloud( + dc_client: DcmanagerClient, subcloud_ref: str + ) -> None: """Delete secondary subcloud on peer site. :param dc_client: the dcmanager client object @@ -284,14 +300,17 @@ class SystemPeerManager(manager.Manager): return dc_client.delete_subcloud(subcloud_ref) - LOG.info(f"Deleted Subcloud {subcloud_ref} on peer site.") + LOG.info( + f"Deleted subcloud '{subcloud_ref}' on peer site " + f"'{dc_client.peer.peer_name}'" + ) @staticmethod def _run_parallel_group_operation(op_type, op_function, thread_pool, subclouds): """Run parallel group operation on subclouds.""" failed_subclouds = [] processed = 0 - error_msg = {} # Dictinary to store error message for each subcloud + error_msg = {} # Dictionary to store error message for each subcloud for subcloud, success in thread_pool.imap(op_function, subclouds): processed += 1 @@ -365,9 +384,9 @@ class SystemPeerManager(manager.Manager): region_name, files, data, is_region_name=True ) LOG.info( - f"Updated Subcloud {dc_peer_subcloud.get('name')} " + f"Updated subcloud '{dc_peer_subcloud.get('name')}' " f"(region_name: {dc_peer_subcloud.get('region-name')}) " - "on peer site." + f"on peer site '{dc_client.peer.peer_name}'" ) else: # Create subcloud on peer site if not exist @@ -375,13 +394,15 @@ class SystemPeerManager(manager.Manager): files, data ) LOG.info( - f"Created Subcloud {dc_peer_subcloud.get('name')} " + f"Created subcloud '{dc_peer_subcloud.get('name')}' " f"(region_name: {dc_peer_subcloud.get('region-name')}) " - "on peer site." + f"on peer site '{dc_client.peer.peer_name}'" ) - LOG.debug( - f"Updating subcloud {subcloud_name} (region_name: {region_name}) " - f"with subcloud peer group id {dc_peer_pg_id} on peer site." + LOG.info( + f"Updating subcloud '{subcloud_name}' (region_name: " + f"{dc_peer_subcloud.get('region-name')}) with subcloud " + f"peer group id {dc_peer_pg_id} on peer site " + f"'{dc_client.peer.peer_name}'" ) # Update subcloud associated peer group on peer site. 
# The peer_group update will check the header and should @@ -413,12 +434,15 @@ class SystemPeerManager(manager.Manager): except Exception as e: subcloud.msg = str(e) # Store error message for subcloud LOG.error( - f"Failed to add/update Subcloud {subcloud_name} " - f"(region_name: {region_name}) on peer site: {str(e)}" + f"Failed to add/update subcloud '{subcloud_name}' " + f"(region_name: {region_name}) on peer site " + f"'{dc_client.peer.peer_name}': {str(e)}" ) return subcloud, False - def _delete_subcloud(self, dc_client, subcloud): + def _delete_subcloud( + self, dc_client: DcmanagerClient, subcloud: models.Subcloud + ) -> tuple[models.Subcloud, bool]: """Delete subcloud on peer site in parallel.""" try: subcloud_name = subcloud.get("name") @@ -427,11 +451,12 @@ class SystemPeerManager(manager.Manager): except Exception as e: subcloud.msg = str(e) LOG.exception( - f"Failed to delete Subcloud {subcloud_name} on peer site: {str(e)}" + f"Failed to delete subcloud '{subcloud_name}' on " + f"peer site '{dc_client.peer.peer_name}': {str(e)}" ) return subcloud, False - def _is_valid_for_subcloud_sync(self, subcloud): + def _is_valid_for_subcloud_sync(self, subcloud: models.Subcloud) -> str: """Verify subcloud data for sync.""" subcloud_name = subcloud.get("name") region_name = subcloud.get("region_name") @@ -485,10 +510,12 @@ class SystemPeerManager(manager.Manager): return VERIFY_SUBCLOUD_SYNC_VALID - def _validate_subclouds_for_sync(self, subclouds, dc_client): + def _validate_subclouds_for_sync( + self, subclouds: list[models.Subcloud], dc_client: DcmanagerClient + ) -> tuple[list[models.Subcloud], dict[str, str]]: """Validate subclouds for sync.""" valid_subclouds = [] - error_msg = {} # Dictinary to store error message for each subcloud + error_msg = {} # Dictionary to store error message for each subcloud for subcloud in subclouds: subcloud_name = subcloud.get("name") @@ -510,8 +537,8 @@ class SystemPeerManager(manager.Manager): peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name) if not peer_subcloud: LOG.info( - f"Subcloud {subcloud_name} (region_name: {region_name}) does " - "not exist on peer site." + f"Subcloud '{subcloud_name}' (region_name: {region_name}) does " + f"not exist on peer site '{dc_client.peer.peer_name}'" ) valid_subclouds.append(subcloud) continue @@ -541,7 +568,9 @@ class SystemPeerManager(manager.Manager): return valid_subclouds, error_msg - def _sync_subclouds(self, context, peer, dc_local_pg_id, dc_peer_pg_id): + def _sync_subclouds( + self, context, peer: models.SystemPeer, dc_local_pg_id: int, dc_peer_pg_id: int + ) -> dict[str, str]: """Sync subclouds of local peer group to peer site. 
:param context: request context object @@ -572,7 +601,10 @@ class SystemPeerManager(manager.Manager): ) error_msg.update(sync_error_msg) - LOG.info("Subcloud peer sync operation finished") + LOG.info( + f"Completed subcloud peer sync with peer '{peer.peer_name}': processed " + f"{len(subclouds_to_sync)} subclouds, {len(failed_subclouds)} failed" + ) dc_local_region_names = set() for subcloud in subclouds: @@ -591,7 +623,10 @@ class SystemPeerManager(manager.Manager): dc_peer_subcloud_diff_names = dc_peer_region_names - dc_local_region_names for subcloud_to_delete in dc_peer_subcloud_diff_names: try: - LOG.debug(f"Deleting Subcloud name {subcloud_to_delete} on peer site.") + LOG.info( + f"Deleting subcloud '{subcloud_to_delete}' on " + f"peer site '{dc_client.peer.peer_name}'" + ) self.delete_peer_secondary_subcloud(dc_client, subcloud_to_delete) except Exception as e: msg = f"Subcloud delete failed: {str(e)}" @@ -664,7 +699,7 @@ class SystemPeerManager(manager.Manager): context, peer_group_id ) if not associations: - LOG.debug("No association found for peer group %s" % peer_group_id) + LOG.info(f"No association found for peer group '{peer_group_id}'") else: for association in associations: if sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: @@ -722,7 +757,7 @@ class SystemPeerManager(manager.Manager): sync_status=new_sync_status, sync_message=new_sync_message, ) - LOG.debug( + LOG.info( f"Updated Local Peer Group Association {association.id} " f"sync_status to {new_sync_status}." ) @@ -786,8 +821,11 @@ class SystemPeerManager(manager.Manager): return success_peer_ids, failed_peer_ids def _get_non_primary_association( - self, dc_client, dc_peer_system_peer_id, dc_peer_pg_id - ): + self, + dc_client: DcmanagerClient, + dc_peer_system_peer_id: int, + dc_peer_pg_id: int, + ) -> Optional[dict]: """Get non-primary Association from peer site.""" try: return dc_client.get_peer_group_association_with_peer_id_and_pg_id( @@ -795,18 +833,23 @@ class SystemPeerManager(manager.Manager): ) except dccommon_exceptions.PeerGroupAssociationNotFound: LOG.error( - "Peer Group association does not exist on peer site. " - f"Peer Group ID: {dc_peer_pg_id}, Peer System Peer ID: " - f"{dc_peer_system_peer_id}" + "Peer group association does not exist on peer site " + f"'{dc_client.peer.peer_name}'. Peer group ID: {dc_peer_pg_id}, " + f"peer system peer ID: {dc_peer_system_peer_id}" ) return None - def _get_peer_site_pg_by_name(self, dc_client, peer_group_name): + def _get_peer_site_pg_by_name( + self, dc_client: DcmanagerClient, peer_group_name: str + ) -> Optional[dict]: """Get remote Peer Group from peer site by name.""" try: return dc_client.get_subcloud_peer_group(peer_group_name) except dccommon_exceptions.SubcloudPeerGroupNotFound: - LOG.error(f"Peer Group {peer_group_name} does not exist on peer site.") + LOG.error( + f"Peer group '{peer_group_name}' does not exist on " + f"peer site '{dc_client.peer.peer_name}'" + ) return None def _get_peer_site_system_peer(self, dc_client, peer_uuid=None): @@ -838,11 +881,6 @@ class SystemPeerManager(manager.Manager): :param association_id: id of association to sync :param sync_subclouds: Enabled to sync subclouds to peer site """ - LOG.info( - f"Synchronize the association {association_id} of the " - "Subcloud Peer Group with the System Peer pointing to the " - "peer site." 
- ) association = db_api.peer_group_association_get(context, association_id) peer = db_api.system_peer_get(context, association.system_peer_id) @@ -850,14 +888,19 @@ class SystemPeerManager(manager.Manager): peer_group_name = dc_local_pg.peer_group_name dc_peer_association_id = None + LOG.info( + f"Syncing peer group association {association_id} that links subcloud " + f"peer group '{peer_group_name}' with system peer '{peer.peer_name}'" + ) + try: # Check if the system_uuid of the peer site matches with the # peer_uuid system = self.get_peer_sysinv_client(peer).get_system() if system.uuid != peer.peer_uuid: LOG.error( - f"Peer site system uuid {system.uuid} does not match " - f"with the peer_uuid {peer.peer_uuid}" + f"Peer site '{peer.peer_name}' system uuid {system.uuid} does " + f"not match the configured peer_uuid: {peer.peer_uuid}" ) raise exceptions.PeerGroupAssociationTargetNotMatch(uuid=system.uuid) @@ -873,7 +916,8 @@ class SystemPeerManager(manager.Manager): if dc_peer_system_peer is None: failed_message = ( - f"System Peer {local_system_uuid} does not exist on peer site." + f"A system peer with UUID of {local_system_uuid} " + f"does not exist on peer site '{peer.peer_name}'" ) return db_api.peer_group_association_db_model_to_dict( self._update_sync_status_to_failed( @@ -896,7 +940,7 @@ class SystemPeerManager(manager.Manager): dc_peer_pg = dc_client.add_subcloud_peer_group(**peer_group_kwargs) LOG.info( f"Created Subcloud Peer Group {peer_group_name} on " - f"peer site. ID is {dc_peer_pg.get('id')}." + f"peer site '{peer.peer_name}'. ID is {dc_peer_pg.get('id')}." ) dc_peer_pg_id = dc_peer_pg.get("id") dc_peer_pg_priority = dc_peer_pg.get("group_priority") @@ -921,7 +965,7 @@ class SystemPeerManager(manager.Manager): ) LOG.info( "Created 'non-primary' Peer Group Association " - f"{dc_peer_association.get('id')} on peer site." + f"{dc_peer_association.get('id')} on peer site '{peer.peer_name}'" ) dc_peer_association_id = dc_peer_association.get("id") @@ -943,7 +987,7 @@ class SystemPeerManager(manager.Manager): ) LOG.info( f"Updated Subcloud Peer Group {peer_group_name} on " - f"peer site, ID is {dc_peer_pg.get('id')}." + f"peer site '{peer.peer_name}', ID is {dc_peer_pg.get('id')}" ) association_update = { @@ -998,7 +1042,6 @@ class SystemPeerManager(manager.Manager): :param context: request context object. :param association_id: id of association to delete """ - LOG.info(f"Deleting association peer group {association_id}.") # Retrieve the peer group association details from the database association = db_api.peer_group_association_get(context, association_id) @@ -1006,6 +1049,11 @@ class SystemPeerManager(manager.Manager): dc_local_pg = db_api.subcloud_peer_group_get(context, association.peer_group_id) peer_group_name = dc_local_pg.peer_group_name + LOG.info( + f"Deleting peer group association ({association_id=}, " + f"peer={peer.peer_name}, peer_group_id={dc_local_pg.id})" + ) + try: # Check if the system_uuid of the peer site matches with the # peer_uuid @@ -1035,7 +1083,7 @@ class SystemPeerManager(manager.Manager): # be deleted LOG.warning( f"Subcloud Peer Group {peer_group_name} does " - "not exist on peer site." + f"not exist on peer site '{peer.peer_name}'" ) return self._delete_primary_association(context, association_id) @@ -1045,7 +1093,7 @@ class SystemPeerManager(manager.Manager): if dc_peer_pg_priority == 0: LOG.error( f"Failed to delete peer_group_association. Peer Group: " - f"{peer_group_name} has priority 0 on peer site." 
+ f"{peer_group_name} has priority 0 on peer site '{peer.peer_name}'" ) raise exceptions.SubcloudPeerGroupHasWrongPriority( priority=dc_peer_pg_priority @@ -1064,11 +1112,13 @@ class SystemPeerManager(manager.Manager): if delete_error_msg: LOG.error( "Failed to delete subcloud(s) from the Subcloud Peer Group " - f"{peer_group_name} on peer site: {json.dumps(delete_error_msg)}" + f"'{peer_group_name}' on peer site '{peer.peer_name}': " + f"{json.dumps(delete_error_msg)}" ) sync_message = ( f"Deletion of {list(delete_error_msg.keys())} from the " - f"Subcloud Peer Group: {peer_group_name} on the peer site failed." + f"Subcloud Peer Group: {peer_group_name} on the peer site " + f"'{peer.peer_name}' failed." ) self._update_sync_status_to_failed( context, association_id, sync_message @@ -1080,12 +1130,14 @@ class SystemPeerManager(manager.Manager): try: dc_client.delete_subcloud_peer_group(peer_group_name) LOG.info( - f"Deleted Subcloud Peer Group {peer_group_name} on peer site." + f"Deleted Subcloud Peer Group {peer_group_name} on " + f"peer site '{peer.peer_name}'" ) except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: LOG.error( f"Subcloud Peer Group {peer_group_name} delete failed " - "as it is associated with System Peer on peer site." + f"as it is associated with System Peer on peer site " + f"'{peer.peer_name}'" ) return self._delete_primary_association(context, association_id) dc_peer_system_peer_id = dc_peer_system_peer.get("id") @@ -1100,18 +1152,21 @@ class SystemPeerManager(manager.Manager): dc_client.delete_peer_group_association(dc_peer_association_id) elif dc_peer_association is None: LOG.warning( - f"PeerGroupAssociation does not exist on peer site. " - f"Peer Group ID: {dc_peer_pg_id}, peer site System " - f"Peer ID: {dc_peer_system_peer_id}" + f"PeerGroupAssociation does not exist on peer site " + f"'{peer.peer_name}'. Peer Group ID: {dc_peer_pg_id}, " + f"peer site System Peer ID: {dc_peer_system_peer_id}" ) try: dc_client.delete_subcloud_peer_group(peer_group_name) - LOG.info(f"Deleted Subcloud Peer Group {peer_group_name} on peer site.") + LOG.info( + f"Deleted Subcloud Peer Group {peer_group_name} on " + f"peer site '{peer.peer_name}'" + ) except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: failed_message = ( f"Subcloud Peer Group {peer_group_name} delete failed as it " - "is associated with system peer on peer site." 
+ f"is associated with system peer on peer site '{peer.peer_name}'" ) self._update_sync_status_to_failed( context, association_id, failed_message diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py index 7ee2f2c36..a2afa986c 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py @@ -107,26 +107,57 @@ class TestPeerMonitor(base.DCManagerTestCase): ) def test_update_sync_status_and_association_is_non_primary(self): + # Delete the primary association created during setUp + db_api.peer_group_association_destroy(self.ctx, self.association.id) + + # Create a non-primary association association = self.system_peer_manager.create_peer_group_association_static( self.ctx, system_peer_id=self.peer.id, peer_group_id=self.peer_group2.id, association_type=consts.ASSOCIATION_TYPE_NON_PRIMARY, + sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, ) - self.mock_get_peer_dc_client().get_subcloud_peer_group.return_value = { - "id": FAKE_SITE1_PEER_GROUP_ID - } - # Test the case where the association is non-primary - self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() - self.mock_get_peer_dc_client().get_subcloud_peer_group.assert_called_once_with( - self.peer_group1.peer_group_name + # Mock update_sync_status + mock_patch = mock.patch.object( + peer_monitor_manager.SystemPeerManager, "update_sync_status" ) + mock_update_sync_status = mock_patch.start() + self.addCleanup(mock_patch.stop) + + self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() + + # Assert that the association sync status was not updated + mock_update_sync_status.assert_not_called() association_new = db_api.peer_group_association_get(self.ctx, association.id) + self.assertEqual( + consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, association_new.sync_status + ) + + def test_update_sync_status_and_association_is_in_sync(self): + # Mock update_sync_status + mock_patch = mock.patch.object( + peer_monitor_manager.SystemPeerManager, "update_sync_status" + ) + mock_update_sync_status = mock_patch.start() + self.addCleanup(mock_patch.stop) + + self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() + + # Assert that the association sync status was not updated as it's + # already in-sync + association_new = db_api.peer_group_association_get( + self.ctx, self.association.id + ) self.assertEqual( consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, association_new.sync_status ) + # But the update_sync_status still has to be triggered so it handles any + # sync message update + mock_update_sync_status.assert_called_once() + def test_update_sync_status_secondary_site_becomes_reachable(self): self.mock_get_local_system.return_value = test_system_peer_manager.FakeSystem( FAKE_SITE0_SYSTEM_UUID @@ -286,7 +317,7 @@ class TestPeerMonitor(base.DCManagerTestCase): mock_event.side_effect = [False, False, True] self.peer_monitor._do_monitor_peer() self.mock_log.exception.assert_called_with( - f"Problem clearing fault for peer {self.peer.peer_uuid}, alarm_id=" + f"Problem clearing fault for peer {self.peer.peer_name}, alarm_id=" f"{fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED} error: boom" ) @@ -319,7 +350,7 @@ class TestPeerMonitor(base.DCManagerTestCase): mock_event.side_effect = [False, False, True] self.peer_monitor._do_monitor_peer() self.mock_log.exception.assert_called_with( - "Got exception 
monitoring peer PeerSite1 error: boom" + "Unexpected error monitoring peer 'PeerSite1': boom" ) def test_heartbeat_check_via_get_peer_group_list_pg_not_found(self): @@ -327,8 +358,8 @@ class TestPeerMonitor(base.DCManagerTestCase): ret = self.peer_monitor._heartbeat_check_via_get_peer_group_list() self.mock_get_peer_dc_client.assert_called() self.mock_log.warning.assert_called_once_with( - "Resource subcloud peer group of dc:" - "http://128.128.128.128:5000/v3 not found" + "No subcloud peer groups found for DC peer: PeerSite1 " + "(endpoint: http://128.128.128.128:5000/v3)" ) self.assertEqual((False, []), ret) diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py index c4924a5ce..02b953cb7 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py @@ -143,6 +143,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_patch = mock.patch.object(system_peer_manager, "DcmanagerClient") self.mock_dc_client = mock_patch.start() + self.mock_dc_client().peer.peer_name = "SystemPeer1" self.addCleanup(mock_patch.stop) def _mock_system_peer_manager_peersitedriver(self): @@ -474,7 +475,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): "dcmanager.manager.system_peer_manager.SystemPeerManager.get_peer_dc_client" ) def test_update_sync_status_exception(self, mock_client): - mock_client.return_value = Exception("boom") + mock_client.side_effect = Exception("boom") self.spm.update_sync_status( self.ctx, self.peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC ) @@ -673,8 +674,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.ctx, self.peer, self.peer_group.id, FAKE_SITE1_PEER_GROUP_ID ) self.mock_log.error.assert_called_once_with( - f"Failed to add/update Subcloud {subcloud1.name} " - f"(region_name: {subcloud1.region_name}) on peer site: boom" + f"Failed to add/update subcloud '{subcloud1.name}' " + f"(region_name: {subcloud1.region_name}) on peer site 'SystemPeer1': boom" ) def test_sync_subclouds_delete_subcloud_exception(self): @@ -743,8 +744,11 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.mock_dc_client().get_subcloud.side_effect = [peer_subcloud4] self.peer_group_association.return_value = {"id": FAKE_SITE1_ASSOCIATION_ID} self.spm.delete_peer_group_association(self.ctx, self.association.id) - Calls = [ - mock.call("Deleting association peer group 1."), + calls = [ + mock.call( + "Deleting peer group association (association_id=1, " + "peer=SystemPeer1, peer_group_id=1)" + ), mock.call( f"Ignoring delete Peer Site Subcloud {subcloud.name} as " "is not in secondary or rehome failed state." @@ -755,10 +759,10 @@ class TestSystemPeerManager(base.DCManagerTestCase): ), mock.call( f"Deleted Subcloud Peer Group {self.peer_group.peer_group_name} " - "on peer site." 
+ "on peer site 'SystemPeer1'" ), ] - self.mock_log.info.assert_has_calls(Calls) + self.mock_log.info.assert_has_calls(calls) def test_delete_peer_group_association_uuid_does_not_match(self): peer = self.create_system_peer_static( @@ -806,7 +810,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): associations = db_api.peer_group_association_get_all(self.ctx) self.assertEqual(1, len(associations)) self.mock_log.exception.assert_called_once_with( - "Failed to delete Subcloud subcloud1 on peer site: boom" + "Failed to delete subcloud 'subcloud1' on peer site 'SystemPeer1': boom" ) def test_delete_peer_group_association_failed(self): @@ -820,8 +824,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.association.id, ) self.mock_log.error.assert_called_once_with( - f"Subcloud Peer Group {self.peer_group.peer_group_name} " - "delete failed as it is associated with system peer on peer site." + f"Subcloud Peer Group {self.peer_group.peer_group_name} delete " + "failed as it is associated with system peer on peer site 'SystemPeer1'" ) def test_delete_peer_group_association_subcloud_pg_notfound(self): @@ -831,7 +835,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.spm.delete_peer_group_association(self.ctx, self.association.id) self.mock_log.warning.assert_called_once_with( f"Subcloud Peer Group {self.peer_group.peer_group_name} " - "does not exist on peer site." + "does not exist on peer site 'SystemPeer1'" ) associations = db_api.peer_group_association_get_all(self.ctx) self.assertEqual(0, len(associations))
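
The reachable/unreachable transition tables added to peer_monitor_manager.py are consumed through the same lookup-with-fallback pattern in both `_update_sync_status_*` helpers. Below is a minimal, self-contained sketch of that pattern; the hard-coded status strings and the `apply_transition` helper are illustrative stand-ins, not part of the dcmanager API.

```python
# Minimal sketch of the transition-table lookup used by the peer monitor.
# The status values mirror dcmanager.common.consts but are hard-coded here
# so the example stands alone; apply_transition() is a hypothetical helper.
UNKNOWN = "unknown"
FAILED = "failed"
IN_SYNC = "in-sync"
OUT_OF_SYNC = "out-of-sync"
SYNCING = "syncing"

# Secondary site became unreachable: only UNKNOWN or FAILED remain valid locally.
UNREACHABLE_TRANSITIONS = {
    UNKNOWN: UNKNOWN,
    FAILED: FAILED,
    OUT_OF_SYNC: FAILED,
    IN_SYNC: UNKNOWN,
    SYNCING: FAILED,
}


def apply_transition(current_status: str, transitions: dict) -> str:
    """Return the new sync_status, falling back to FAILED for unexpected input.

    The fallback mirrors the KeyError handling in the patch: an unexpected
    status should never occur, but if it does the association is marked
    failed instead of being left untouched.
    """
    try:
        return transitions[current_status]
    except KeyError:
        return FAILED


if __name__ == "__main__":
    # An in-sync association degrades to unknown once the peer is unreachable.
    assert apply_transition(IN_SYNC, UNREACHABLE_TRANSITIONS) == UNKNOWN
    # An unexpected value falls back to failed.
    assert apply_transition("bogus", UNREACHABLE_TRANSITIONS) == FAILED
```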