From 902a4891ada77e3ccc435cf369cdc1bc3d91518a Mon Sep 17 00:00:00 2001 From: Gustavo Herzmann Date: Tue, 26 Nov 2024 18:41:18 -0300 Subject: [PATCH] Fix DC geo-redundancy systemcontroller_gateway_address syncing Previously, when the secondary site association sync status was set to 'out-of-sync', when the primary site becomes online again, it would update the primary site rehome-data with the secondary site data. This causes an issue because the systemcontroller_gateway_address present on the rehome-data is exclusive to each site and can't be shared across sites, causing an invalid route being created on the primary site. This commit prevents that by only syncing all the other rehome-data attributes, except the systemcontroller_gateway_address. This commit also prevents the association sync status from being set as 'out-of-sync' if the local association is already marked as 'in-sync' whenever the primary site detects that the secondary site is reachable. It also causes the peer monitor thread to immediately audit the peer site after the association is created, instead of waiting for the heartbeat interval. Additionally, this commit fixes an incorrect dictionary key usage from "sync_status" to "sync-status" that caused the peer site association sync status to be updated unnecessarily. It also fixes an issue introduced by [1] where the subcloud update would fail when there was an update in the systemcontroller_gateway_address attribute because the validation function was expecting some attributes that were not available as part of the request payload. This commit also improves some log messages and adds type annotations to some geo-redundancy related methods. Test Plan: 01. PASS - Run an end-to-end geo-redundancy test, migrating a subcloud from the primary site to the secondary and back. 02. PASS - Re-run the GR test, but cause a failure during the migration to the secondary site. 
Update the bootstrap values on the secondary site and verify that the sync-status is out-of-sync. Run the migration to secondary site again, then migrate back to primary site and verify that the rehome-data synchronization does not synchronize the systemcontroller_gateway_address attribute. 03. PASS: Do the following steps: - Create a system peer with an incorrect systemcontroller gateway address that's inside the management subnet, but outside the reserved IP range and then create an association. Verify that the secondary subcloud and a route was created using the incorrect IP. - Update the system peer with the correct systemcontroller gateway address on the primary site. Verify that the PGA sync status is set to 'out-of-sync' on both sites. - Sync the PGA and verify that the secondary subcloud systemcontroller gateway address was updated and that the old route was deleted and a new one using the new address was created. - Migrate the SPG to the non-primary site and verify that it completes successfully and that the subcloud becomes online and managed. 04. PASS - After creating a peer group and the association, verify that the peer monitor thread is started and that the first heartbeat check is executed without waiting for the heartbeat interval. 
[1]: https://review.opendev.org/c/starlingx/distcloud/+/922255 Closes-Bug: 2089715 Change-Id: I857f30e2d691dfb18196f123ba5a2a52fd8ddb64 Signed-off-by: Gustavo Herzmann --- .../drivers/openstack/dcmanager_v1.py | 3 + .../dcmanager/api/controllers/v1/subclouds.py | 5 +- distributedcloud/dcmanager/db/api.py | 36 ++- .../dcmanager/db/sqlalchemy/api.py | 14 +- .../manager/peer_group_audit_manager.py | 95 +++++-- .../dcmanager/manager/peer_monitor_manager.py | 263 ++++++++++++------ .../dcmanager/manager/subcloud_manager.py | 5 + .../dcmanager/manager/system_peer_manager.py | 199 ++++++++----- .../unit/manager/test_peer_monitor_manager.py | 53 +++- .../unit/manager/test_system_peer_manager.py | 26 +- 10 files changed, 473 insertions(+), 226 deletions(-) diff --git a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py index b02ee364e..bb98d7508 100644 --- a/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py +++ b/distributedcloud/dccommon/drivers/openstack/dcmanager_v1.py @@ -10,6 +10,7 @@ from requests_toolbelt import MultipartEncoder from dccommon import consts from dccommon.drivers import base from dccommon import exceptions +from dcmanager.db.sqlalchemy import models LOG = log.getLogger(__name__) @@ -27,6 +28,7 @@ class DcmanagerClient(base.DriverBase): timeout=DCMANAGER_CLIENT_REST_DEFAULT_TIMEOUT, endpoint_type=consts.KS_ENDPOINT_PUBLIC, endpoint=None, + peer: models.SystemPeer = None, ): if endpoint is None: endpoint = session.get_endpoint( @@ -35,6 +37,7 @@ class DcmanagerClient(base.DriverBase): self.endpoint = endpoint self.timeout = timeout self.session = session + self.peer = peer def get_system_peer(self, system_peer_uuid): """Get system peer.""" diff --git a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py index dcd08da0c..2fa5220fc 100644 --- a/distributedcloud/dcmanager/api/controllers/v1/subclouds.py +++ 
b/distributedcloud/dcmanager/api/controllers/v1/subclouds.py @@ -1076,8 +1076,11 @@ class SubcloudsController(object): systemcontroller_gateway_address.split(",")[0] != subcloud.systemcontroller_gateway_ip ): + # Pass bootstrap_values_dict for patch operations where these + # values aren't in payload, unlike subcloud add where they are. + # Function needs management/admin_subnet for validation psd_common.validate_systemcontroller_gateway_address( - systemcontroller_gateway_address, payload + systemcontroller_gateway_address, bootstrap_values_dict ) management_state = payload.get("management-state") diff --git a/distributedcloud/dcmanager/db/api.py b/distributedcloud/dcmanager/db/api.py index da798e3a7..3e4ab0b88 100644 --- a/distributedcloud/dcmanager/db/api.py +++ b/distributedcloud/dcmanager/db/api.py @@ -541,27 +541,27 @@ def system_peer_create( ) -def system_peer_get(context, peer_id): +def system_peer_get(context, peer_id) -> models.SystemPeer: """Retrieve a system_peer or raise if it does not exist.""" return IMPL.system_peer_get(context, peer_id) -def system_peer_get_by_uuid(context, uuid): +def system_peer_get_by_uuid(context, uuid) -> models.SystemPeer: """Retrieve a system_peer by uuid or raise if it does not exist.""" return IMPL.system_peer_get_by_uuid(context, uuid) -def system_peer_get_by_name(context, uuid): +def system_peer_get_by_name(context, uuid) -> models.SystemPeer: """Retrieve a system_peer by name or raise if it does not exist.""" return IMPL.system_peer_get_by_name(context, uuid) -def system_peer_get_all(context): +def system_peer_get_all(context) -> list[models.SystemPeer]: """Retrieve all system peers.""" return IMPL.system_peer_get_all(context) -def peer_group_get_for_system_peer(context, peer_id): +def peer_group_get_for_system_peer(context, peer_id) -> list[models.SubcloudPeerGroup]: """Get subcloud peer groups associated with a system peer.""" return IMPL.peer_group_get_for_system_peer(context, peer_id) @@ -656,22 +656,24 @@ def 
subcloud_peer_group_destroy(context, group_id): return IMPL.subcloud_peer_group_destroy(context, group_id) -def subcloud_peer_group_get(context, group_id): +def subcloud_peer_group_get(context, group_id) -> models.SubcloudPeerGroup: """Retrieve a subcloud_peer_group or raise if it does not exist.""" return IMPL.subcloud_peer_group_get(context, group_id) -def subcloud_peer_group_get_by_name(context, name): +def subcloud_peer_group_get_by_name(context, name) -> models.SubcloudPeerGroup: """Retrieve a subcloud_peer_group by name or raise if it does not exist.""" return IMPL.subcloud_peer_group_get_by_name(context, name) -def subcloud_peer_group_get_by_leader_id(context, system_leader_id): +def subcloud_peer_group_get_by_leader_id( + context, system_leader_id +) -> list[models.SubcloudPeerGroup]: """Retrieve subcloud peer groups by system_leader_id.""" return IMPL.subcloud_peer_group_get_by_leader_id(context, system_leader_id) -def subcloud_get_for_peer_group(context, group_id): +def subcloud_get_for_peer_group(context, group_id) -> list[models.Subcloud]: """Retrieve all subclouds belonging to a subcloud_peer_group or raise if it does not exist. 
@@ -679,7 +681,7 @@ def subcloud_get_for_peer_group(context, group_id): return IMPL.subcloud_get_for_peer_group(context, group_id) -def subcloud_peer_group_get_all(context): +def subcloud_peer_group_get_all(context) -> list[models.SubcloudPeerGroup]: """Retrieve all subcloud peer groups.""" return IMPL.subcloud_peer_group_get_all(context) @@ -765,31 +767,35 @@ def peer_group_association_destroy(context, id): return IMPL.peer_group_association_destroy(context, id) -def peer_group_association_get(context, id): +def peer_group_association_get(context, id) -> models.PeerGroupAssociation: """Retrieve a peer_group_association or raise if it does not exist.""" return IMPL.peer_group_association_get(context, id) -def peer_group_association_get_all(context): +def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]: """Retrieve all peer_group_associations.""" return IMPL.peer_group_association_get_all(context) def peer_group_association_get_by_peer_group_and_system_peer_id( context, peer_group_id, system_peer_id -): +) -> list[models.PeerGroupAssociation]: """Get peer group associations by peer_group_id and system_peer_id.""" return IMPL.peer_group_association_get_by_peer_group_and_system_peer_id( context, peer_group_id, system_peer_id ) -def peer_group_association_get_by_peer_group_id(context, peer_group_id): +def peer_group_association_get_by_peer_group_id( + context, peer_group_id +) -> list[models.PeerGroupAssociation]: """Get the peer_group_association list by peer_group_id""" return IMPL.peer_group_association_get_by_peer_group_id(context, peer_group_id) -def peer_group_association_get_by_system_peer_id(context, system_peer_id): +def peer_group_association_get_by_system_peer_id( + context, system_peer_id +) -> list[models.PeerGroupAssociation]: """Get the peer_group_association list by system_peer_id""" return IMPL.peer_group_association_get_by_system_peer_id(context, system_peer_id) diff --git a/distributedcloud/dcmanager/db/sqlalchemy/api.py 
b/distributedcloud/dcmanager/db/sqlalchemy/api.py index 39f40a35d..46dfe1a3e 100644 --- a/distributedcloud/dcmanager/db/sqlalchemy/api.py +++ b/distributedcloud/dcmanager/db/sqlalchemy/api.py @@ -1647,7 +1647,7 @@ def peer_group_association_destroy(context, association_id): @require_context -def peer_group_association_get(context, association_id): +def peer_group_association_get(context, association_id) -> models.PeerGroupAssociation: try: result = ( model_query(context, models.PeerGroupAssociation) @@ -1666,7 +1666,7 @@ def peer_group_association_get(context, association_id): @require_context -def peer_group_association_get_all(context): +def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]: result = ( model_query(context, models.PeerGroupAssociation) .filter_by(deleted=0) @@ -1682,7 +1682,7 @@ def peer_group_association_get_all(context): @require_context def peer_group_association_get_by_peer_group_and_system_peer_id( context, peer_group_id, system_peer_id -): +) -> models.PeerGroupAssociation: try: result = ( model_query(context, models.PeerGroupAssociation) @@ -1705,7 +1705,9 @@ def peer_group_association_get_by_peer_group_and_system_peer_id( @require_context -def peer_group_association_get_by_peer_group_id(context, peer_group_id): +def peer_group_association_get_by_peer_group_id( + context, peer_group_id +) -> models.PeerGroupAssociation: result = ( model_query(context, models.PeerGroupAssociation) .filter_by(deleted=0) @@ -1718,7 +1720,9 @@ def peer_group_association_get_by_peer_group_id(context, peer_group_id): @require_context -def peer_group_association_get_by_system_peer_id(context, system_peer_id): +def peer_group_association_get_by_system_peer_id( + context, system_peer_id +) -> models.PeerGroupAssociation: result = ( model_query(context, models.PeerGroupAssociation) .filter_by(deleted=0) diff --git a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py 
b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py index c312218f8..fb67ae8a6 100644 --- a/distributedcloud/dcmanager/manager/peer_group_audit_manager.py +++ b/distributedcloud/dcmanager/manager/peer_group_audit_manager.py @@ -3,8 +3,10 @@ # # SPDX-License-Identifier: Apache-2.0 # - +from __future__ import annotations +import json import threading +from typing import TYPE_CHECKING from fm_api import constants as fm_const from fm_api import fm_api @@ -12,14 +14,20 @@ from oslo_config import cfg from oslo_log import log as logging from dccommon import consts as dccommon_consts +from dccommon.drivers.openstack.dcmanager_v1 import DcmanagerClient from dcmanager.common import consts from dcmanager.common import context from dcmanager.common.i18n import _ from dcmanager.common import manager from dcmanager.common import utils from dcmanager.db import api as db_api +from dcmanager.db.sqlalchemy import models from dcmanager.manager.system_peer_manager import SystemPeerManager +# Use TYPE_CHECKING to avoid circular import +if TYPE_CHECKING: + from dcmanager.manager.subcloud_manager import SubcloudManager + CONF = cfg.CONF LOG = logging.getLogger(__name__) @@ -27,7 +35,9 @@ LOG = logging.getLogger(__name__) class PeerGroupAuditManager(manager.Manager): """Manages audit related tasks.""" - def __init__(self, subcloud_manager, peer_group_id, *args, **kwargs): + def __init__( + self, subcloud_manager: SubcloudManager, peer_group_id: int, *args, **kwargs + ): LOG.debug(_("PeerGroupAuditManager initialization...")) super().__init__(service_name="peer_group_audit_manager", *args, **kwargs) self.context = context.get_admin_context() @@ -52,8 +62,8 @@ class PeerGroupAuditManager(manager.Manager): @staticmethod def _get_association_sync_status_from_peer_site( - dc_client, system_peer, peer_group_id - ): + dc_client: DcmanagerClient, system_peer: models.SystemPeer, peer_group_id: int + ) -> str: try: # Get peer site system peer dc_peer_system_peer = 
dc_client.get_system_peer( @@ -81,8 +91,11 @@ class PeerGroupAuditManager(manager.Manager): ) def _get_local_subclouds_to_update_and_delete( - self, local_peer_group, remote_subclouds, remote_sync_status - ): + self, + local_peer_group: models.SubcloudPeerGroup, + remote_subclouds: list[dict], + remote_sync_status: str, + ) -> tuple[list[models.Subcloud], list[models.Subcloud], bool]: local_subclouds_to_update = list() local_subclouds_to_delete = list() any_rehome_failed = False @@ -112,8 +125,13 @@ class PeerGroupAuditManager(manager.Manager): # indicating any bootstrap values/address updates to # the subcloud on the remote site. if remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: + LOG.info( + "Peer association is out-of-sync, syncing rehome " + f"data of subcloud '{local_subcloud.name}' from " + "peer to current site" + ) self._sync_rehome_data( - local_subcloud.id, remote_subcloud.get("rehome_data") + local_subcloud, remote_subcloud.get("rehome_data") ) elif remote_subcloud.get("deploy-status") in ( consts.DEPLOY_STATE_REHOME_FAILED, @@ -134,7 +152,7 @@ class PeerGroupAuditManager(manager.Manager): return local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed - def _set_local_subcloud_to_secondary(self, subcloud): + def _set_local_subcloud_to_secondary(self, subcloud: models.Subcloud) -> None: try: LOG.info("Set local subcloud %s to secondary" % subcloud.name) # There will be an exception when unmanage @@ -155,18 +173,42 @@ class PeerGroupAuditManager(manager.Manager): ) raise e - def _sync_rehome_data(self, subcloud_id, rehome_data): - db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data) + def _sync_rehome_data(self, subcloud: models.Subcloud, rehome_data: str) -> None: + try: + remote_rehome_data = json.loads(rehome_data) + local_rehome_data = json.loads(subcloud.rehome_data) - def audit(self, system_peer, remote_peer_group, local_peer_group): + # The systemcontroller_gateway_address can't be synced from 
the + # peer to the local site as it's specific to each site + remote_rehome_data["saved_payload"]["systemcontroller_gateway_address"] = ( + local_rehome_data["saved_payload"]["systemcontroller_gateway_address"] + ) + new_rehome_data = json.dumps(remote_rehome_data) + + db_api.subcloud_update( + self.context, subcloud.id, rehome_data=new_rehome_data + ) + except Exception as e: + LOG.error( + "Unable to sync rehome data of subcloud " + f"'{subcloud.name}' from peer to current site: {str(e)}" + ) + raise + + def audit( + self, + system_peer: models.SystemPeer, + remote_peer_group: dict, + local_peer_group: models.SubcloudPeerGroup, + ) -> None: if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING: LOG.info("Local peer group in migrating state, quit audit") return LOG.info( - "Auditing remote subcloud peer group:[%s] migration_status:[%s] " - "group_priority[%s], local subcloud peer group:[%s] " - "migration_status:[%s] group_priority[%s]" + "Auditing remote subcloud peer group: [%s], migration_status: [%s], " + "group_priority: [%s], local subcloud peer group: [%s], " + "migration_status: [%s], group_priority: [%s]" % ( remote_peer_group.get("peer_group_name"), remote_peer_group.get("migration_status"), @@ -277,8 +319,8 @@ class PeerGroupAuditManager(manager.Manager): ) LOG.exception( f"Failed to delete local subcloud [{subcloud.name}] that does " - "not exist under the same subcloud_peer_group on peer site, " - f"err: {e}" + "not exist under the same subcloud_peer_group on peer site " + f"{system_peer.peer_name}, err: {e}" ) raise e @@ -313,7 +355,12 @@ class PeerGroupAuditManager(manager.Manager): # If remote peer group migration_status is 'None' self.require_audit_flag = False - def _clear_or_raise_alarm(self, system_peer, local_peer_group, remote_peer_group): + def _clear_or_raise_alarm( + self, + system_peer: models.SystemPeer, + local_peer_group: models.SubcloudPeerGroup, + remote_peer_group: dict, + ) -> None: # If local subcloud peer group's 
group_priority is # lower than remote subcloud peer group's group_priority, # an alarm will be raised. @@ -325,7 +372,7 @@ class PeerGroupAuditManager(manager.Manager): if local_peer_group.group_priority < remote_peer_group.get("group_priority"): LOG.warning( f"Alarm: local subcloud peer group [{local_peer_group.peer_group_name}]" - f" is managed by remote system [{system_peer.peer_name}]" + f" is managed by remote system peer [{system_peer.peer_name}]" ) try: fault = fm_api.Fault( @@ -336,7 +383,7 @@ class PeerGroupAuditManager(manager.Manager): severity=fm_const.FM_ALARM_SEVERITY_MAJOR, reason_text=( "Subcloud peer group (peer_group_name=%s) is managed by " - "remote system (peer_uuid=%s) with a lower priority." + "remote system peer (peer_uuid=%s) with a lower priority." % (local_peer_group.peer_group_name, system_peer.peer_uuid) ), alarm_type=fm_const.FM_ALARM_TYPE_0, @@ -375,12 +422,12 @@ class PeerGroupAuditManager(manager.Manager): try: self.audit(system_peer, remote_peer_group, local_peer_group) except Exception as e: - LOG.exception("audit error occured: %s" % e) + LOG.exception("audit error occurred: %s" % e) def stop(self): if self.thread: self.thread.join() - LOG.info(f"stopped peer group {self.peer_group_id} audit thread") + LOG.info(f"Stopped peer group {self.peer_group_id} audit thread") else: LOG.info(f"No peer group {self.peer_group_id} audit thread to stop") @@ -402,12 +449,14 @@ class PeerGroupAuditManager(manager.Manager): ): LOG.info( f"Audit peer group [{local_peer_group.peer_group_name}] " - f"with remote system {system_peer.peer_name}" + f"with remote system peer {system_peer.peer_name}" ) self.start(system_peer, remote_peer_group, local_peer_group) @staticmethod - def send_audit_peer_group(system_peers, peer_group): + def send_audit_peer_group( + system_peers: list[models.SystemPeer], peer_group: models.SubcloudPeerGroup + ): if not system_peers: return local_system = utils.get_local_system() diff --git 
a/distributedcloud/dcmanager/manager/peer_monitor_manager.py b/distributedcloud/dcmanager/manager/peer_monitor_manager.py index 9f698f94e..437fb6056 100644 --- a/distributedcloud/dcmanager/manager/peer_monitor_manager.py +++ b/distributedcloud/dcmanager/manager/peer_monitor_manager.py @@ -9,6 +9,7 @@ import threading from fm_api import constants as fm_const from fm_api import fm_api +from keystoneauth1 import exceptions as ks_exceptions from oslo_config import cfg from oslo_log import log as logging @@ -16,15 +17,58 @@ from dcmanager.common import consts from dcmanager.common import context from dcmanager.common import manager from dcmanager.db import api as db_api +from dcmanager.db.sqlalchemy import models from dcmanager.manager import peer_group_audit_manager as pgam +from dcmanager.manager.subcloud_manager import SubcloudManager from dcmanager.manager.system_peer_manager import SystemPeerManager CONF = cfg.CONF LOG = logging.getLogger(__name__) +# Upon detecting that the secondary site is REACHABLE again, the PGA sync_status +# will be set for BOTH sites by the primary site monitor thread as follows: +SECONDARY_SITE_REACHABLE_TRANSITIONS = { + # The UNKNOWN and FAILED are set when the secondary site becomes unreachable + consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC, + # The SYNCING and IN_SYNC status can happen when first creating the peer and the + # association. 
The association creation will set the status to SYNCING/IN_SYNC, + and the monitor thread might detect the peer availability after that, so we + don't modify the status + consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, + consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_SYNCING, + # A similar situation can happen with OUT_OF_SYNC, if the association/peer group + # is updated before the monitor thread runs its first check, so we don't modify + # the status + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: ( + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC + ), +} + +# Upon detecting that the secondary site is UNREACHABLE, the PGA sync_status +# will be set for the PRIMARY SITE ONLY by the primary site monitor thread as follows: +SECONDARY_SITE_UNREACHABLE_TRANSITIONS = { + # After the secondary site becomes unreachable, the only valid sync statuses + # are UNKNOWN or FAILED + consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, + consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_FAILED, + consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: consts.ASSOCIATION_SYNC_STATUS_FAILED, + consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, + consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_FAILED, +} + class PeerMonitor(object): - def __init__(self, peer, context, subcloud_manager): + """Monitors and manages the connection status of a single DC system peer + + The monitor runs in a separate thread and periodically checks the peer's + availability using heartbeat mechanisms. It handles both failure and recovery + scenarios, updating system states and managing fault alarms accordingly. 
+ """ + + def __init__( + self, peer: models.SystemPeer, context, subcloud_manager: SubcloudManager + ): self.peer = peer self.thread = None self.exit_flag = threading.Event() @@ -32,21 +76,22 @@ class PeerMonitor(object): self.context = context self.subcloud_manager = subcloud_manager self.peer_group_id_set = set() + self.failure_count = 0 # key: peer_group_id # value: PeerGroupAuditManager object - self.peer_group_audit_obj_map = dict() + self.peer_group_audit_obj_map: dict[int, pgam.PeerGroupAuditManager] = dict() def _clear_failure(self): alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED - entity_instance_id = "peer=%s" % self.peer.peer_uuid + entity_instance_id = f"peer={self.peer.peer_uuid}" try: fault = self.fm_api.get_fault(alarm_id, entity_instance_id) if fault: self.fm_api.clear_fault(alarm_id, entity_instance_id) except Exception as e: LOG.exception( - "Problem clearing fault for peer %s, alarm_id=%s error: %s" - % (self.peer.peer_uuid, alarm_id, e) + f"Problem clearing fault for peer {self.peer.peer_name}, " + f"alarm_id={alarm_id} error: {str(e)}" ) def _raise_failure(self): @@ -114,124 +159,165 @@ class PeerMonitor(object): if not dc_peer_subcloud_peer_group_list: LOG.warning( - "Resource subcloud peer group of dc:%s not found" - % self.peer.manager_endpoint + f"No subcloud peer groups found for DC peer: {self.peer.peer_name} " + f"(endpoint: {self.peer.manager_endpoint})" ) + except (ks_exceptions.ConnectFailure, ks_exceptions.ConnectTimeout) as e: + # Use warning level for common connection failures to reduce log spam + LOG.warning(f"Failed to access DC peer {self.peer.peer_name}: {str(e)}") except Exception: - LOG.exception("Failed to access the dc: %s" % self.peer.peer_name) + LOG.exception( + f"Unexpected error while accessing DC peer {self.peer.peer_name}" + ) + return failed, dc_peer_subcloud_peer_group_list - def _update_sync_status_secondary_site_becomes_unreachable(self): + def 
_update_sync_status_secondary_site_becomes_unreachable(self) -> None: + """Update sync status on local site when secondary site becomes unreachable.""" # Get associations by system peer associations = SystemPeerManager.get_local_associations(self.context, self.peer) for association in associations: # If the association is not primary, skip it. if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: - LOG.debug( - "Skip update the Association sync_status as it is not primary." + LOG.info( + f"Skip updating local association (id={association.id}) " + "sync_status as it's not primary" ) continue - # If the secondary site is down, set the association sync status - # "in-sync" -> "unknown" - # "unknown" -> "unknown" - # "out-of-sync" -> "failed" - # "syncing" -> "failed" - # "failed" -> "failed" - sync_status = consts.ASSOCIATION_SYNC_STATUS_UNKNOWN - message = f"Peer site ({self.peer.peer_name}) is unreachable." - if association.sync_status not in [ - consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, - consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, - ]: - sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + + try: + new_sync_status = SECONDARY_SITE_UNREACHABLE_TRANSITIONS[ + association.sync_status + ] + except KeyError: + # This should never happen + LOG.error( + f"Unexpected sync status on association id={association.id}: " + f"{association.sync_status}. 
Updating the sync status to " + f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}" + ) + new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + + LOG.info( + f"Updating local association (id={association.id}) sync_status " + f"from {association.sync_status} to {new_sync_status} due " + "to secondary site becoming unreachable" + ) + db_api.peer_group_association_update( self.context, association.id, - sync_status=sync_status, - sync_message=message, + sync_status=new_sync_status, + sync_message=f"Peer site ({self.peer.peer_name}) is unreachable.", ) - def _update_sync_status_secondary_site_becomes_reachable(self): + def _update_sync_status_secondary_site_becomes_reachable(self) -> None: + """Update sync status on both sites when secondary site becomes reachable.""" # Get associations by system peer associations = SystemPeerManager.get_local_associations(self.context, self.peer) for association in associations: # If the association is not primary, skip it. if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: - LOG.debug( - "Skip update Peer Site Association sync_status as " - "current site Association is not primary." + LOG.info( + "Skip updating association sync_status on both sites as " + "current site association is not primary." ) continue - # Upon detecting that the secondary site is reachable again, - # the PGA sync_status will be set for both sites by the primary - # site monitor thread as follows: - # "unknown" -> "in-sync" - # "failed" -> "out-of-sync" - sync_status = consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC - if association.sync_status == consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: - sync_status = consts.ASSOCIATION_SYNC_STATUS_IN_SYNC + + try: + new_sync_status = SECONDARY_SITE_REACHABLE_TRANSITIONS[ + association.sync_status + ] + except KeyError: + # This should never happen + LOG.error( + f"Unexpected sync status on association id={association.id}: " + f"{association.sync_status}. 
Updating the sync status to " + f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}" + ) + new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED + dc_local_pg = db_api.subcloud_peer_group_get( self.context, association.peer_group_id ) + + LOG.info( + f"Updating local association (id={association.id}) sync_status " + f"from {association.sync_status} to {new_sync_status} due " + "to secondary site becoming reachable" + ) + SystemPeerManager.update_sync_status( self.context, self.peer, - sync_status, + new_sync_status, dc_local_pg, association=association, ) - def _do_monitor_peer(self): - failure_count = 0 - LOG.info("Start monitoring thread for peer %s" % self.peer.peer_name) - UNAVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE - AVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE - # Do the actual peer monitor. + def _update_peer_state(self, state: str) -> models.SystemPeer: + """Update peer availability state in database.""" + return db_api.system_peer_update( + self.context, self.peer.id, availability_state=state + ) + + def _handle_peer_failure(self) -> None: + """Handle peer failure by raising alarms and updating states.""" + self.failure_count += 1 + if self.failure_count >= self.peer.heartbeat_failure_threshold: + LOG.warning( + f"DC peer '{self.peer.peer_name}' heartbeat failed, raising alarm" + ) + self._raise_failure() + self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE) + self._update_sync_status_secondary_site_becomes_unreachable() + self.failure_count = 0 + self._set_require_audit_flag_to_associated_peer_groups() + + def _handle_peer_recovery(self, remote_pg_list: list) -> None: + """Handle peer recovery by clearing alarms and restoring states.""" + self.failure_count = 0 + self._audit_local_peer_groups(remote_pg_list) + if ( + self.peer.availability_state + != consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE + ): + LOG.info(f"DC peer '{self.peer.peer_name}' is back online, clearing alarm") + 
self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE) + self._update_sync_status_secondary_site_becomes_reachable() + self._clear_failure() + + def _check_peer_status(self) -> None: + """Perform a single peer status check and handle the result.""" + self.peer = db_api.system_peer_get(self.context, self.peer.id) + failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list() + + if failed: + self._handle_peer_failure() + else: + self._handle_peer_recovery(remote_pg_list) + + def _do_monitor_peer(self) -> None: + """Monitor peer connectivity and handle state changes.""" + LOG.info(f"Start monitoring thread for peer '{self.peer.peer_name}'") + + # Run first check immediately + try: + self._check_peer_status() + except Exception as e: + LOG.exception(f"Initial check failed for peer '{self.peer.peer_name}': {e}") + + # Continue with periodic checks while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval): try: - # Get system peer from DB - self.peer = db_api.system_peer_get(self.context, self.peer.id) - failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list() - if failed: - failure_count += 1 - if failure_count >= self.peer.heartbeat_failure_threshold: - # heartbeat_failure_threshold reached. 
- LOG.warning( - "DC %s heartbeat failed, Raising alarm" - % self.peer.peer_name - ) - self._raise_failure() - db_api.system_peer_update( - self.context, - self.peer.id, - availability_state=UNAVAILABLE_STATE, - ) - # pylint: disable=line-too-long - self._update_sync_status_secondary_site_becomes_unreachable() - failure_count = 0 - self._set_require_audit_flag_to_associated_peer_groups() - else: - failure_count = 0 - self._audit_local_peer_groups(remote_pg_list) - if self.peer.availability_state != AVAILABLE_STATE: - db_api.system_peer_update( - self.context, - self.peer.id, - availability_state=AVAILABLE_STATE, - ) - # pylint: disable=line-too-long - self._update_sync_status_secondary_site_becomes_reachable() - LOG.info("DC %s back online, clear alarm" % self.peer.peer_name) - self._clear_failure() + self._check_peer_status() except Exception as e: LOG.exception( - "Got exception monitoring peer %s error: %s" - % (self.peer.peer_name, e) + f"Unexpected error monitoring peer '{self.peer.peer_name}': {e}" ) - LOG.info( - "Caught graceful exit signal for peer monitor %s" % self.peer.peer_name - ) + + LOG.info(f"Gracefully exiting monitor thread for peer '{self.peer.peer_name}'") def _audit_local_peer_groups(self, remote_pg_list): # Generate a dict index by remote peer group name @@ -266,7 +352,9 @@ class PeerMonitor(object): for pgam_obj in self.peer_group_audit_obj_map.values(): pgam_obj.require_audit_flag = True - def audit_specific_local_peer_group(self, peer_group, remote_peer_group): + def audit_specific_local_peer_group( + self, peer_group: models.SubcloudPeerGroup, remote_peer_group + ): msg = None if peer_group.id in self.peer_group_audit_obj_map: pgam_obj = self.peer_group_audit_obj_map[peer_group.id] @@ -324,17 +412,16 @@ class PeerMonitor(object): class PeerMonitorManager(manager.Manager): """Manages tasks related to peer monitor.""" - def __init__(self, subcloud_manager): + def __init__(self, subcloud_manager: SubcloudManager): 
LOG.debug("PeerMonitorManager initialization...") super(PeerMonitorManager, self).__init__(service_name="peer_monitor_manager") - self.peer_monitor = dict() self.context = context.get_admin_context() self.subcloud_manager = subcloud_manager # key: system_peer_id # value: PeerMonitor object - self.peer_monitor_thread_map = dict() + self.peer_monitor_thread_map: dict[int, PeerMonitor] = dict() def _remove_peer_monitor_task(self, system_peer_id): peer_mon_obj = self.peer_monitor_thread_map[system_peer_id] diff --git a/distributedcloud/dcmanager/manager/subcloud_manager.py b/distributedcloud/dcmanager/manager/subcloud_manager.py index 9e0903cf6..4433df606 100644 --- a/distributedcloud/dcmanager/manager/subcloud_manager.py +++ b/distributedcloud/dcmanager/manager/subcloud_manager.py @@ -3668,6 +3668,11 @@ class SubcloudManager(manager.Manager): and systemcontroller_gateway_ip.split(",")[0] != subcloud.systemcontroller_gateway_ip ): + LOG.info( + f"The systemcontroller_gateway_ip for subcloud {subcloud.name} " + f"was updated from {subcloud.systemcontroller_gateway_ip} to " + f"{systemcontroller_gateway_ip.split(',')[0]}. Replacing routes..." 
+ ) m_ks_client = OpenStackDriver( region_name=dccommon_consts.DEFAULT_REGION_NAME, region_clients=None, diff --git a/distributedcloud/dcmanager/manager/system_peer_manager.py b/distributedcloud/dcmanager/manager/system_peer_manager.py index 0270d52de..eb212f6e5 100644 --- a/distributedcloud/dcmanager/manager/system_peer_manager.py +++ b/distributedcloud/dcmanager/manager/system_peer_manager.py @@ -9,6 +9,7 @@ from contextlib import nullcontext import functools import json import tempfile +from typing import Optional from eventlet import greenpool from oslo_log import log as logging @@ -54,7 +55,9 @@ class SystemPeerManager(manager.Manager): ) @staticmethod - def get_local_associations(ctx, peer, local_pg=None): + def get_local_associations( + ctx, peer: models.SystemPeer, local_pg: models.SubcloudPeerGroup = None + ) -> list[models.PeerGroupAssociation]: if local_pg is None: # Get associations by system peer id return db_api.peer_group_association_get_by_system_peer_id(ctx, peer.id) @@ -93,8 +96,12 @@ class SystemPeerManager(manager.Manager): """ def _update_association_on_peer_site( - peer, sync_status, local_pg, remote_pg, message - ): + peer: models.SystemPeer, + sync_status: str, + local_pg: models.SubcloudPeerGroup, + remote_pg: dict, + message: str, + ) -> tuple[str, str]: try: # Get peer site dcmanager client dc_client = SystemPeerManager.get_peer_dc_client(peer) @@ -118,21 +125,21 @@ class SystemPeerManager(manager.Manager): # Update peer site association sync_status only if the # sync_status is different from the current sync_status - if dc_peer_association.get("sync_status") != sync_status: + if dc_peer_association.get("sync-status") != sync_status: # Update peer site association sync_status dc_peer_association_id = dc_peer_association.get("id") dc_client.update_peer_group_association_sync_status( dc_peer_association_id, sync_status ) LOG.info( - f"Updated Peer site {dc_peer_system_peer.get('id')} " - f"Peer Group Association {dc_peer_association_id} " - 
f"sync_status to {sync_status}." + f"Updated peer site '{dc_peer_system_peer.get('peer-name')}' " + f"peer group association {dc_peer_association_id} " + f"sync_status to '{sync_status}'" ) except Exception as e: message = ( - f"Failed to Update Peer Site ({peer.peer_uuid}) " - f"Association sync_status to {sync_status}." + f"Failed to update peer site '{peer.peer_name}' " + f"association sync_status to '{sync_status}'" ) LOG.exception(f"{message} Error: {e}") sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED @@ -146,9 +153,9 @@ class SystemPeerManager(manager.Manager): for association in associations: if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: - LOG.debug( - f"Skip update Peer Site association sync_status to {sync_status} " - "as current site Association is not primary." + LOG.info( + "Skipping updating peer and local site association " + f"sync_status to '{sync_status}' as local site is not primary" ) continue @@ -165,13 +172,16 @@ class SystemPeerManager(manager.Manager): association.sync_status == sync_status and sync_status != consts.ASSOCIATION_SYNC_STATUS_FAILED ): - LOG.debug( - "Skip update current site association sync_status to " - f"{sync_status} as current site Association is already " - "in the same status." 
+ LOG.info( + f"Local association {association.id} sync_status " + f"already set to '{sync_status}', skipping update" ) continue # Update primary site association sync_status + LOG.info( + f"Updating local association {association.id} sync_status " + f"from '{association.sync_status}' to '{sync_status}'" + ) db_api.peer_group_association_update( ctx, association.id, sync_status=sync_status, sync_message=message ) @@ -222,10 +232,11 @@ class SystemPeerManager(manager.Manager): dccommon_consts.SYSTEM_CONTROLLER_NAME, p_ks_client.session, endpoint=dc_endpoint, + peer=peer, ) @staticmethod - def get_peer_subcloud(dc_client, subcloud_name): + def get_peer_subcloud(dc_client: DcmanagerClient, subcloud_name: str) -> dict: """Get subcloud on peer site if exist. :param dc_client: the dcmanager client object @@ -235,7 +246,10 @@ class SystemPeerManager(manager.Manager): peer_subcloud = dc_client.get_subcloud(subcloud_name) return peer_subcloud except dccommon_exceptions.SubcloudNotFound: - LOG.warn(f"Subcloud {subcloud_name} does not exist on peer site.") + LOG.warn( + f"Subcloud '{subcloud_name}' does not exist on " + f"peer site '{dc_client.peer.peer_name}'" + ) @staticmethod def get_subcloud_deploy_status(subcloud): @@ -258,7 +272,9 @@ class SystemPeerManager(manager.Manager): return True @staticmethod - def delete_peer_secondary_subcloud(dc_client, subcloud_ref): + def delete_peer_secondary_subcloud( + dc_client: DcmanagerClient, subcloud_ref: str + ) -> None: """Delete secondary subcloud on peer site. 
:param dc_client: the dcmanager client object @@ -284,14 +300,17 @@ class SystemPeerManager(manager.Manager): return dc_client.delete_subcloud(subcloud_ref) - LOG.info(f"Deleted Subcloud {subcloud_ref} on peer site.") + LOG.info( + f"Deleted subcloud '{subcloud_ref}' on peer site " + f"'{dc_client.peer.peer_name}'" + ) @staticmethod def _run_parallel_group_operation(op_type, op_function, thread_pool, subclouds): """Run parallel group operation on subclouds.""" failed_subclouds = [] processed = 0 - error_msg = {} # Dictinary to store error message for each subcloud + error_msg = {} # Dictionary to store error message for each subcloud for subcloud, success in thread_pool.imap(op_function, subclouds): processed += 1 @@ -365,9 +384,9 @@ class SystemPeerManager(manager.Manager): region_name, files, data, is_region_name=True ) LOG.info( - f"Updated Subcloud {dc_peer_subcloud.get('name')} " + f"Updated subcloud '{dc_peer_subcloud.get('name')}' " f"(region_name: {dc_peer_subcloud.get('region-name')}) " - "on peer site." + f"on peer site '{dc_client.peer.peer_name}'" ) else: # Create subcloud on peer site if not exist @@ -375,13 +394,15 @@ class SystemPeerManager(manager.Manager): files, data ) LOG.info( - f"Created Subcloud {dc_peer_subcloud.get('name')} " + f"Created subcloud '{dc_peer_subcloud.get('name')}' " f"(region_name: {dc_peer_subcloud.get('region-name')}) " - "on peer site." + f"on peer site '{dc_client.peer.peer_name}'" ) - LOG.debug( - f"Updating subcloud {subcloud_name} (region_name: {region_name}) " - f"with subcloud peer group id {dc_peer_pg_id} on peer site." + LOG.info( + f"Updating subcloud '{subcloud_name}' (region_name: " + f"{dc_peer_subcloud.get('region-name')}) with subcloud " + f"peer group id {dc_peer_pg_id} on peer site " + f"'{dc_client.peer.peer_name}'" ) # Update subcloud associated peer group on peer site. 
# The peer_group update will check the header and should @@ -413,12 +434,15 @@ class SystemPeerManager(manager.Manager): except Exception as e: subcloud.msg = str(e) # Store error message for subcloud LOG.error( - f"Failed to add/update Subcloud {subcloud_name} " - f"(region_name: {region_name}) on peer site: {str(e)}" + f"Failed to add/update subcloud '{subcloud_name}' " + f"(region_name: {region_name}) on peer site " + f"'{dc_client.peer.peer_name}': {str(e)}" ) return subcloud, False - def _delete_subcloud(self, dc_client, subcloud): + def _delete_subcloud( + self, dc_client: DcmanagerClient, subcloud: models.Subcloud + ) -> tuple[models.Subcloud, bool]: """Delete subcloud on peer site in parallel.""" try: subcloud_name = subcloud.get("name") @@ -427,11 +451,12 @@ class SystemPeerManager(manager.Manager): except Exception as e: subcloud.msg = str(e) LOG.exception( - f"Failed to delete Subcloud {subcloud_name} on peer site: {str(e)}" + f"Failed to delete subcloud '{subcloud_name}' on " + f"peer site '{dc_client.peer.peer_name}': {str(e)}" ) return subcloud, False - def _is_valid_for_subcloud_sync(self, subcloud): + def _is_valid_for_subcloud_sync(self, subcloud: models.Subcloud) -> str: """Verify subcloud data for sync.""" subcloud_name = subcloud.get("name") region_name = subcloud.get("region_name") @@ -485,10 +510,12 @@ class SystemPeerManager(manager.Manager): return VERIFY_SUBCLOUD_SYNC_VALID - def _validate_subclouds_for_sync(self, subclouds, dc_client): + def _validate_subclouds_for_sync( + self, subclouds: list[models.Subcloud], dc_client: DcmanagerClient + ) -> tuple[list[models.Subcloud], dict[str, str]]: """Validate subclouds for sync.""" valid_subclouds = [] - error_msg = {} # Dictinary to store error message for each subcloud + error_msg = {} # Dictionary to store error message for each subcloud for subcloud in subclouds: subcloud_name = subcloud.get("name") @@ -510,8 +537,8 @@ class SystemPeerManager(manager.Manager): peer_subcloud = 
self.get_peer_subcloud(dc_client, subcloud_name) if not peer_subcloud: LOG.info( - f"Subcloud {subcloud_name} (region_name: {region_name}) does " - "not exist on peer site." + f"Subcloud '{subcloud_name}' (region_name: {region_name}) does " + f"not exist on peer site '{dc_client.peer.peer_name}'" ) valid_subclouds.append(subcloud) continue @@ -541,7 +568,9 @@ class SystemPeerManager(manager.Manager): return valid_subclouds, error_msg - def _sync_subclouds(self, context, peer, dc_local_pg_id, dc_peer_pg_id): + def _sync_subclouds( + self, context, peer: models.SystemPeer, dc_local_pg_id: int, dc_peer_pg_id: int + ) -> dict[str, str]: """Sync subclouds of local peer group to peer site. :param context: request context object @@ -572,7 +601,10 @@ class SystemPeerManager(manager.Manager): ) error_msg.update(sync_error_msg) - LOG.info("Subcloud peer sync operation finished") + LOG.info( + f"Completed subcloud peer sync with peer '{peer.peer_name}': processed " + f"{len(subclouds_to_sync)} subclouds, {len(failed_subclouds)} failed" + ) dc_local_region_names = set() for subcloud in subclouds: @@ -591,7 +623,10 @@ class SystemPeerManager(manager.Manager): dc_peer_subcloud_diff_names = dc_peer_region_names - dc_local_region_names for subcloud_to_delete in dc_peer_subcloud_diff_names: try: - LOG.debug(f"Deleting Subcloud name {subcloud_to_delete} on peer site.") + LOG.info( + f"Deleting subcloud '{subcloud_to_delete}' on " + f"peer site '{dc_client.peer.peer_name}'" + ) self.delete_peer_secondary_subcloud(dc_client, subcloud_to_delete) except Exception as e: msg = f"Subcloud delete failed: {str(e)}" @@ -664,7 +699,7 @@ class SystemPeerManager(manager.Manager): context, peer_group_id ) if not associations: - LOG.debug("No association found for peer group %s" % peer_group_id) + LOG.info(f"No association found for peer group '{peer_group_id}'") else: for association in associations: if sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: @@ -722,7 +757,7 @@ class 
SystemPeerManager(manager.Manager): sync_status=new_sync_status, sync_message=new_sync_message, ) - LOG.debug( + LOG.info( f"Updated Local Peer Group Association {association.id} " f"sync_status to {new_sync_status}." ) @@ -786,8 +821,11 @@ class SystemPeerManager(manager.Manager): return success_peer_ids, failed_peer_ids def _get_non_primary_association( - self, dc_client, dc_peer_system_peer_id, dc_peer_pg_id - ): + self, + dc_client: DcmanagerClient, + dc_peer_system_peer_id: int, + dc_peer_pg_id: int, + ) -> Optional[dict]: """Get non-primary Association from peer site.""" try: return dc_client.get_peer_group_association_with_peer_id_and_pg_id( @@ -795,18 +833,23 @@ class SystemPeerManager(manager.Manager): ) except dccommon_exceptions.PeerGroupAssociationNotFound: LOG.error( - "Peer Group association does not exist on peer site. " - f"Peer Group ID: {dc_peer_pg_id}, Peer System Peer ID: " - f"{dc_peer_system_peer_id}" + "Peer group association does not exist on peer site " + f"'{dc_client.peer.peer_name}'. 
Peer group ID: {dc_peer_pg_id}, " + f"peer system peer ID: {dc_peer_system_peer_id}" ) return None - def _get_peer_site_pg_by_name(self, dc_client, peer_group_name): + def _get_peer_site_pg_by_name( + self, dc_client: DcmanagerClient, peer_group_name: str + ) -> Optional[dict]: """Get remote Peer Group from peer site by name.""" try: return dc_client.get_subcloud_peer_group(peer_group_name) except dccommon_exceptions.SubcloudPeerGroupNotFound: - LOG.error(f"Peer Group {peer_group_name} does not exist on peer site.") + LOG.error( + f"Peer group '{peer_group_name}' does not exist on " + f"peer site '{dc_client.peer.peer_name}'" + ) return None def _get_peer_site_system_peer(self, dc_client, peer_uuid=None): @@ -838,11 +881,6 @@ class SystemPeerManager(manager.Manager): :param association_id: id of association to sync :param sync_subclouds: Enabled to sync subclouds to peer site """ - LOG.info( - f"Synchronize the association {association_id} of the " - "Subcloud Peer Group with the System Peer pointing to the " - "peer site." 
- ) association = db_api.peer_group_association_get(context, association_id) peer = db_api.system_peer_get(context, association.system_peer_id) @@ -850,14 +888,19 @@ class SystemPeerManager(manager.Manager): peer_group_name = dc_local_pg.peer_group_name dc_peer_association_id = None + LOG.info( + f"Syncing peer group association {association_id} that links subcloud " + f"peer group '{peer_group_name}' with system peer '{peer.peer_name}'" + ) + try: # Check if the system_uuid of the peer site matches with the # peer_uuid system = self.get_peer_sysinv_client(peer).get_system() if system.uuid != peer.peer_uuid: LOG.error( - f"Peer site system uuid {system.uuid} does not match " - f"with the peer_uuid {peer.peer_uuid}" + f"Peer site '{peer.peer_name}' system uuid {system.uuid} does " + f"not match the configured peer_uuid: {peer.peer_uuid}" ) raise exceptions.PeerGroupAssociationTargetNotMatch(uuid=system.uuid) @@ -873,7 +916,8 @@ class SystemPeerManager(manager.Manager): if dc_peer_system_peer is None: failed_message = ( - f"System Peer {local_system_uuid} does not exist on peer site." + f"A system peer with UUID of {local_system_uuid} " + f"does not exist on peer site '{peer.peer_name}'" ) return db_api.peer_group_association_db_model_to_dict( self._update_sync_status_to_failed( @@ -896,7 +940,7 @@ class SystemPeerManager(manager.Manager): dc_peer_pg = dc_client.add_subcloud_peer_group(**peer_group_kwargs) LOG.info( f"Created Subcloud Peer Group {peer_group_name} on " - f"peer site. ID is {dc_peer_pg.get('id')}." + f"peer site '{peer.peer_name}'. ID is {dc_peer_pg.get('id')}." ) dc_peer_pg_id = dc_peer_pg.get("id") dc_peer_pg_priority = dc_peer_pg.get("group_priority") @@ -921,7 +965,7 @@ class SystemPeerManager(manager.Manager): ) LOG.info( "Created 'non-primary' Peer Group Association " - f"{dc_peer_association.get('id')} on peer site." 
+ f"{dc_peer_association.get('id')} on peer site '{peer.peer_name}'" ) dc_peer_association_id = dc_peer_association.get("id") @@ -943,7 +987,7 @@ class SystemPeerManager(manager.Manager): ) LOG.info( f"Updated Subcloud Peer Group {peer_group_name} on " - f"peer site, ID is {dc_peer_pg.get('id')}." + f"peer site '{peer.peer_name}', ID is {dc_peer_pg.get('id')}" ) association_update = { @@ -998,7 +1042,6 @@ class SystemPeerManager(manager.Manager): :param context: request context object. :param association_id: id of association to delete """ - LOG.info(f"Deleting association peer group {association_id}.") # Retrieve the peer group association details from the database association = db_api.peer_group_association_get(context, association_id) @@ -1006,6 +1049,11 @@ class SystemPeerManager(manager.Manager): dc_local_pg = db_api.subcloud_peer_group_get(context, association.peer_group_id) peer_group_name = dc_local_pg.peer_group_name + LOG.info( + f"Deleting peer group association ({association_id=}, " + f"peer={peer.peer_name}, peer_group_id={dc_local_pg.id})" + ) + try: # Check if the system_uuid of the peer site matches with the # peer_uuid @@ -1035,7 +1083,7 @@ class SystemPeerManager(manager.Manager): # be deleted LOG.warning( f"Subcloud Peer Group {peer_group_name} does " - "not exist on peer site." + f"not exist on peer site '{peer.peer_name}'" ) return self._delete_primary_association(context, association_id) @@ -1045,7 +1093,7 @@ class SystemPeerManager(manager.Manager): if dc_peer_pg_priority == 0: LOG.error( f"Failed to delete peer_group_association. Peer Group: " - f"{peer_group_name} has priority 0 on peer site." 
+ f"{peer_group_name} has priority 0 on peer site '{peer.peer_name}'" ) raise exceptions.SubcloudPeerGroupHasWrongPriority( priority=dc_peer_pg_priority @@ -1064,11 +1112,13 @@ class SystemPeerManager(manager.Manager): if delete_error_msg: LOG.error( "Failed to delete subcloud(s) from the Subcloud Peer Group " - f"{peer_group_name} on peer site: {json.dumps(delete_error_msg)}" + f"'{peer_group_name}' on peer site '{peer.peer_name}': " + f"{json.dumps(delete_error_msg)}" ) sync_message = ( f"Deletion of {list(delete_error_msg.keys())} from the " - f"Subcloud Peer Group: {peer_group_name} on the peer site failed." + f"Subcloud Peer Group: {peer_group_name} on the peer site " + f"'{peer.peer_name}' failed." ) self._update_sync_status_to_failed( context, association_id, sync_message @@ -1080,12 +1130,14 @@ class SystemPeerManager(manager.Manager): try: dc_client.delete_subcloud_peer_group(peer_group_name) LOG.info( - f"Deleted Subcloud Peer Group {peer_group_name} on peer site." + f"Deleted Subcloud Peer Group {peer_group_name} on " + f"peer site '{peer.peer_name}'" ) except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: LOG.error( f"Subcloud Peer Group {peer_group_name} delete failed " - "as it is associated with System Peer on peer site." + f"as it is associated with System Peer on peer site " + f"'{peer.peer_name}'" ) return self._delete_primary_association(context, association_id) dc_peer_system_peer_id = dc_peer_system_peer.get("id") @@ -1100,18 +1152,21 @@ class SystemPeerManager(manager.Manager): dc_client.delete_peer_group_association(dc_peer_association_id) elif dc_peer_association is None: LOG.warning( - f"PeerGroupAssociation does not exist on peer site. " - f"Peer Group ID: {dc_peer_pg_id}, peer site System " - f"Peer ID: {dc_peer_system_peer_id}" + f"PeerGroupAssociation does not exist on peer site " + f"'{peer.peer_name}'. 
Peer Group ID: {dc_peer_pg_id}, " + f"peer site System Peer ID: {dc_peer_system_peer_id}" ) try: dc_client.delete_subcloud_peer_group(peer_group_name) - LOG.info(f"Deleted Subcloud Peer Group {peer_group_name} on peer site.") + LOG.info( + f"Deleted Subcloud Peer Group {peer_group_name} on " + f"peer site '{peer.peer_name}'" + ) except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: failed_message = ( f"Subcloud Peer Group {peer_group_name} delete failed as it " - "is associated with system peer on peer site." + f"is associated with system peer on peer site '{peer.peer_name}'" ) self._update_sync_status_to_failed( context, association_id, failed_message diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py index 7ee2f2c36..a2afa986c 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_peer_monitor_manager.py @@ -107,26 +107,57 @@ class TestPeerMonitor(base.DCManagerTestCase): ) def test_update_sync_status_and_association_is_non_primary(self): + # Delete the primary association created during setUp + db_api.peer_group_association_destroy(self.ctx, self.association.id) + + # Create a non-primary association association = self.system_peer_manager.create_peer_group_association_static( self.ctx, system_peer_id=self.peer.id, peer_group_id=self.peer_group2.id, association_type=consts.ASSOCIATION_TYPE_NON_PRIMARY, + sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, ) - self.mock_get_peer_dc_client().get_subcloud_peer_group.return_value = { - "id": FAKE_SITE1_PEER_GROUP_ID - } - # Test the case where the association is non-primary - self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() - self.mock_get_peer_dc_client().get_subcloud_peer_group.assert_called_once_with( - self.peer_group1.peer_group_name + # Mock update_sync_status + mock_patch 
= mock.patch.object( + peer_monitor_manager.SystemPeerManager, "update_sync_status" ) + mock_update_sync_status = mock_patch.start() + self.addCleanup(mock_patch.stop) + + self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() + + # Assert that the association sync status was not updated + mock_update_sync_status.assert_not_called() association_new = db_api.peer_group_association_get(self.ctx, association.id) + self.assertEqual( + consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, association_new.sync_status + ) + + def test_update_sync_status_and_association_is_in_sync(self): + # Mock update_sync_status + mock_patch = mock.patch.object( + peer_monitor_manager.SystemPeerManager, "update_sync_status" + ) + mock_update_sync_status = mock_patch.start() + self.addCleanup(mock_patch.stop) + + self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() + + # Assert that the association sync status was not updated as it's + # already in-sync + association_new = db_api.peer_group_association_get( + self.ctx, self.association.id + ) self.assertEqual( consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, association_new.sync_status ) + # But the update_sync_status still has to be triggered so it handles any + # sync message update + mock_update_sync_status.assert_called_once() + def test_update_sync_status_secondary_site_becomes_reachable(self): self.mock_get_local_system.return_value = test_system_peer_manager.FakeSystem( FAKE_SITE0_SYSTEM_UUID @@ -286,7 +317,7 @@ class TestPeerMonitor(base.DCManagerTestCase): mock_event.side_effect = [False, False, True] self.peer_monitor._do_monitor_peer() self.mock_log.exception.assert_called_with( - f"Problem clearing fault for peer {self.peer.peer_uuid}, alarm_id=" + f"Problem clearing fault for peer {self.peer.peer_name}, alarm_id=" f"{fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED} error: boom" ) @@ -319,7 +350,7 @@ class TestPeerMonitor(base.DCManagerTestCase): mock_event.side_effect = [False, False, True] 
self.peer_monitor._do_monitor_peer() self.mock_log.exception.assert_called_with( - "Got exception monitoring peer PeerSite1 error: boom" + "Unexpected error monitoring peer 'PeerSite1': boom" ) def test_heartbeat_check_via_get_peer_group_list_pg_not_found(self): @@ -327,8 +358,8 @@ class TestPeerMonitor(base.DCManagerTestCase): ret = self.peer_monitor._heartbeat_check_via_get_peer_group_list() self.mock_get_peer_dc_client.assert_called() self.mock_log.warning.assert_called_once_with( - "Resource subcloud peer group of dc:" - "http://128.128.128.128:5000/v3 not found" + "No subcloud peer groups found for DC peer: PeerSite1 " + "(endpoint: http://128.128.128.128:5000/v3)" ) self.assertEqual((False, []), ret) diff --git a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py index c4924a5ce..02b953cb7 100644 --- a/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py +++ b/distributedcloud/dcmanager/tests/unit/manager/test_system_peer_manager.py @@ -143,6 +143,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): mock_patch = mock.patch.object(system_peer_manager, "DcmanagerClient") self.mock_dc_client = mock_patch.start() + self.mock_dc_client().peer.peer_name = "SystemPeer1" self.addCleanup(mock_patch.stop) def _mock_system_peer_manager_peersitedriver(self): @@ -474,7 +475,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): "dcmanager.manager.system_peer_manager.SystemPeerManager.get_peer_dc_client" ) def test_update_sync_status_exception(self, mock_client): - mock_client.return_value = Exception("boom") + mock_client.side_effect = Exception("boom") self.spm.update_sync_status( self.ctx, self.peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC ) @@ -673,8 +674,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.ctx, self.peer, self.peer_group.id, FAKE_SITE1_PEER_GROUP_ID ) self.mock_log.error.assert_called_once_with( - f"Failed to 
add/update Subcloud {subcloud1.name} " - f"(region_name: {subcloud1.region_name}) on peer site: boom" + f"Failed to add/update subcloud '{subcloud1.name}' " + f"(region_name: {subcloud1.region_name}) on peer site 'SystemPeer1': boom" ) def test_sync_subclouds_delete_subcloud_exception(self): @@ -743,8 +744,11 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.mock_dc_client().get_subcloud.side_effect = [peer_subcloud4] self.peer_group_association.return_value = {"id": FAKE_SITE1_ASSOCIATION_ID} self.spm.delete_peer_group_association(self.ctx, self.association.id) - Calls = [ - mock.call("Deleting association peer group 1."), + calls = [ + mock.call( + "Deleting peer group association (association_id=1, " + "peer=SystemPeer1, peer_group_id=1)" + ), mock.call( f"Ignoring delete Peer Site Subcloud {subcloud.name} as " "is not in secondary or rehome failed state." @@ -755,10 +759,10 @@ class TestSystemPeerManager(base.DCManagerTestCase): ), mock.call( f"Deleted Subcloud Peer Group {self.peer_group.peer_group_name} " - "on peer site." + "on peer site 'SystemPeer1'" ), ] - self.mock_log.info.assert_has_calls(Calls) + self.mock_log.info.assert_has_calls(calls) def test_delete_peer_group_association_uuid_does_not_match(self): peer = self.create_system_peer_static( @@ -806,7 +810,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): associations = db_api.peer_group_association_get_all(self.ctx) self.assertEqual(1, len(associations)) self.mock_log.exception.assert_called_once_with( - "Failed to delete Subcloud subcloud1 on peer site: boom" + "Failed to delete subcloud 'subcloud1' on peer site 'SystemPeer1': boom" ) def test_delete_peer_group_association_failed(self): @@ -820,8 +824,8 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.association.id, ) self.mock_log.error.assert_called_once_with( - f"Subcloud Peer Group {self.peer_group.peer_group_name} " - "delete failed as it is associated with system peer on peer site." 
+ f"Subcloud Peer Group {self.peer_group.peer_group_name} delete " + "failed as it is associated with system peer on peer site 'SystemPeer1'" ) def test_delete_peer_group_association_subcloud_pg_notfound(self): @@ -831,7 +835,7 @@ class TestSystemPeerManager(base.DCManagerTestCase): self.spm.delete_peer_group_association(self.ctx, self.association.id) self.mock_log.warning.assert_called_once_with( f"Subcloud Peer Group {self.peer_group.peer_group_name} " - "does not exist on peer site." + "does not exist on peer site 'SystemPeer1'" ) associations = db_api.peer_group_association_get_all(self.ctx) self.assertEqual(0, len(associations))