Merge "Fix DC geo-redundancy systemcontroller_gateway_address syncing"

This commit is contained in:
Zuul 2024-11-28 21:41:59 +00:00 committed by Gerrit Code Review
commit 59924988cd
10 changed files with 473 additions and 226 deletions

View File

@ -10,6 +10,7 @@ from requests_toolbelt import MultipartEncoder
from dccommon import consts from dccommon import consts
from dccommon.drivers import base from dccommon.drivers import base
from dccommon import exceptions from dccommon import exceptions
from dcmanager.db.sqlalchemy import models
LOG = log.getLogger(__name__) LOG = log.getLogger(__name__)
@ -27,6 +28,7 @@ class DcmanagerClient(base.DriverBase):
timeout=DCMANAGER_CLIENT_REST_DEFAULT_TIMEOUT, timeout=DCMANAGER_CLIENT_REST_DEFAULT_TIMEOUT,
endpoint_type=consts.KS_ENDPOINT_PUBLIC, endpoint_type=consts.KS_ENDPOINT_PUBLIC,
endpoint=None, endpoint=None,
peer: models.SystemPeer = None,
): ):
if endpoint is None: if endpoint is None:
endpoint = session.get_endpoint( endpoint = session.get_endpoint(
@ -35,6 +37,7 @@ class DcmanagerClient(base.DriverBase):
self.endpoint = endpoint self.endpoint = endpoint
self.timeout = timeout self.timeout = timeout
self.session = session self.session = session
self.peer = peer
def get_system_peer(self, system_peer_uuid): def get_system_peer(self, system_peer_uuid):
"""Get system peer.""" """Get system peer."""

View File

@ -1076,8 +1076,11 @@ class SubcloudsController(object):
systemcontroller_gateway_address.split(",")[0] systemcontroller_gateway_address.split(",")[0]
!= subcloud.systemcontroller_gateway_ip != subcloud.systemcontroller_gateway_ip
): ):
# Pass bootstrap_values_dict for patch operations where these
# values aren't in payload, unlike subcloud add where they are.
# Function needs management/admin_subnet for validation
psd_common.validate_systemcontroller_gateway_address( psd_common.validate_systemcontroller_gateway_address(
systemcontroller_gateway_address, payload systemcontroller_gateway_address, bootstrap_values_dict
) )
management_state = payload.get("management-state") management_state = payload.get("management-state")

View File

@ -541,27 +541,27 @@ def system_peer_create(
) )
def system_peer_get(context, peer_id): def system_peer_get(context, peer_id) -> models.SystemPeer:
"""Retrieve a system_peer or raise if it does not exist.""" """Retrieve a system_peer or raise if it does not exist."""
return IMPL.system_peer_get(context, peer_id) return IMPL.system_peer_get(context, peer_id)
def system_peer_get_by_uuid(context, uuid): def system_peer_get_by_uuid(context, uuid) -> models.SystemPeer:
"""Retrieve a system_peer by uuid or raise if it does not exist.""" """Retrieve a system_peer by uuid or raise if it does not exist."""
return IMPL.system_peer_get_by_uuid(context, uuid) return IMPL.system_peer_get_by_uuid(context, uuid)
def system_peer_get_by_name(context, uuid): def system_peer_get_by_name(context, uuid) -> models.SystemPeer:
"""Retrieve a system_peer by name or raise if it does not exist.""" """Retrieve a system_peer by name or raise if it does not exist."""
return IMPL.system_peer_get_by_name(context, uuid) return IMPL.system_peer_get_by_name(context, uuid)
def system_peer_get_all(context): def system_peer_get_all(context) -> list[models.SystemPeer]:
"""Retrieve all system peers.""" """Retrieve all system peers."""
return IMPL.system_peer_get_all(context) return IMPL.system_peer_get_all(context)
def peer_group_get_for_system_peer(context, peer_id): def peer_group_get_for_system_peer(context, peer_id) -> list[models.SubcloudPeerGroup]:
"""Get subcloud peer groups associated with a system peer.""" """Get subcloud peer groups associated with a system peer."""
return IMPL.peer_group_get_for_system_peer(context, peer_id) return IMPL.peer_group_get_for_system_peer(context, peer_id)
@ -656,22 +656,24 @@ def subcloud_peer_group_destroy(context, group_id):
return IMPL.subcloud_peer_group_destroy(context, group_id) return IMPL.subcloud_peer_group_destroy(context, group_id)
def subcloud_peer_group_get(context, group_id): def subcloud_peer_group_get(context, group_id) -> models.SubcloudPeerGroup:
"""Retrieve a subcloud_peer_group or raise if it does not exist.""" """Retrieve a subcloud_peer_group or raise if it does not exist."""
return IMPL.subcloud_peer_group_get(context, group_id) return IMPL.subcloud_peer_group_get(context, group_id)
def subcloud_peer_group_get_by_name(context, name): def subcloud_peer_group_get_by_name(context, name) -> models.SubcloudPeerGroup:
"""Retrieve a subcloud_peer_group by name or raise if it does not exist.""" """Retrieve a subcloud_peer_group by name or raise if it does not exist."""
return IMPL.subcloud_peer_group_get_by_name(context, name) return IMPL.subcloud_peer_group_get_by_name(context, name)
def subcloud_peer_group_get_by_leader_id(context, system_leader_id): def subcloud_peer_group_get_by_leader_id(
context, system_leader_id
) -> list[models.SubcloudPeerGroup]:
"""Retrieve subcloud peer groups by system_leader_id.""" """Retrieve subcloud peer groups by system_leader_id."""
return IMPL.subcloud_peer_group_get_by_leader_id(context, system_leader_id) return IMPL.subcloud_peer_group_get_by_leader_id(context, system_leader_id)
def subcloud_get_for_peer_group(context, group_id): def subcloud_get_for_peer_group(context, group_id) -> list[models.Subcloud]:
"""Retrieve all subclouds belonging to a subcloud_peer_group """Retrieve all subclouds belonging to a subcloud_peer_group
or raise if it does not exist. or raise if it does not exist.
@ -679,7 +681,7 @@ def subcloud_get_for_peer_group(context, group_id):
return IMPL.subcloud_get_for_peer_group(context, group_id) return IMPL.subcloud_get_for_peer_group(context, group_id)
def subcloud_peer_group_get_all(context): def subcloud_peer_group_get_all(context) -> list[models.SubcloudPeerGroup]:
"""Retrieve all subcloud peer groups.""" """Retrieve all subcloud peer groups."""
return IMPL.subcloud_peer_group_get_all(context) return IMPL.subcloud_peer_group_get_all(context)
@ -765,31 +767,35 @@ def peer_group_association_destroy(context, id):
return IMPL.peer_group_association_destroy(context, id) return IMPL.peer_group_association_destroy(context, id)
def peer_group_association_get(context, id): def peer_group_association_get(context, id) -> models.PeerGroupAssociation:
"""Retrieve a peer_group_association or raise if it does not exist.""" """Retrieve a peer_group_association or raise if it does not exist."""
return IMPL.peer_group_association_get(context, id) return IMPL.peer_group_association_get(context, id)
def peer_group_association_get_all(context): def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]:
"""Retrieve all peer_group_associations.""" """Retrieve all peer_group_associations."""
return IMPL.peer_group_association_get_all(context) return IMPL.peer_group_association_get_all(context)
def peer_group_association_get_by_peer_group_and_system_peer_id( def peer_group_association_get_by_peer_group_and_system_peer_id(
context, peer_group_id, system_peer_id context, peer_group_id, system_peer_id
): ) -> list[models.PeerGroupAssociation]:
"""Get peer group associations by peer_group_id and system_peer_id.""" """Get peer group associations by peer_group_id and system_peer_id."""
return IMPL.peer_group_association_get_by_peer_group_and_system_peer_id( return IMPL.peer_group_association_get_by_peer_group_and_system_peer_id(
context, peer_group_id, system_peer_id context, peer_group_id, system_peer_id
) )
def peer_group_association_get_by_peer_group_id(context, peer_group_id): def peer_group_association_get_by_peer_group_id(
context, peer_group_id
) -> list[models.PeerGroupAssociation]:
"""Get the peer_group_association list by peer_group_id""" """Get the peer_group_association list by peer_group_id"""
return IMPL.peer_group_association_get_by_peer_group_id(context, peer_group_id) return IMPL.peer_group_association_get_by_peer_group_id(context, peer_group_id)
def peer_group_association_get_by_system_peer_id(context, system_peer_id): def peer_group_association_get_by_system_peer_id(
context, system_peer_id
) -> list[models.PeerGroupAssociation]:
"""Get the peer_group_association list by system_peer_id""" """Get the peer_group_association list by system_peer_id"""
return IMPL.peer_group_association_get_by_system_peer_id(context, system_peer_id) return IMPL.peer_group_association_get_by_system_peer_id(context, system_peer_id)

View File

@ -1647,7 +1647,7 @@ def peer_group_association_destroy(context, association_id):
@require_context @require_context
def peer_group_association_get(context, association_id): def peer_group_association_get(context, association_id) -> models.PeerGroupAssociation:
try: try:
result = ( result = (
model_query(context, models.PeerGroupAssociation) model_query(context, models.PeerGroupAssociation)
@ -1666,7 +1666,7 @@ def peer_group_association_get(context, association_id):
@require_context @require_context
def peer_group_association_get_all(context): def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]:
result = ( result = (
model_query(context, models.PeerGroupAssociation) model_query(context, models.PeerGroupAssociation)
.filter_by(deleted=0) .filter_by(deleted=0)
@ -1682,7 +1682,7 @@ def peer_group_association_get_all(context):
@require_context @require_context
def peer_group_association_get_by_peer_group_and_system_peer_id( def peer_group_association_get_by_peer_group_and_system_peer_id(
context, peer_group_id, system_peer_id context, peer_group_id, system_peer_id
): ) -> models.PeerGroupAssociation:
try: try:
result = ( result = (
model_query(context, models.PeerGroupAssociation) model_query(context, models.PeerGroupAssociation)
@ -1705,7 +1705,9 @@ def peer_group_association_get_by_peer_group_and_system_peer_id(
@require_context @require_context
def peer_group_association_get_by_peer_group_id(context, peer_group_id): def peer_group_association_get_by_peer_group_id(
context, peer_group_id
) -> models.PeerGroupAssociation:
result = ( result = (
model_query(context, models.PeerGroupAssociation) model_query(context, models.PeerGroupAssociation)
.filter_by(deleted=0) .filter_by(deleted=0)
@ -1718,7 +1720,9 @@ def peer_group_association_get_by_peer_group_id(context, peer_group_id):
@require_context @require_context
def peer_group_association_get_by_system_peer_id(context, system_peer_id): def peer_group_association_get_by_system_peer_id(
context, system_peer_id
) -> models.PeerGroupAssociation:
result = ( result = (
model_query(context, models.PeerGroupAssociation) model_query(context, models.PeerGroupAssociation)
.filter_by(deleted=0) .filter_by(deleted=0)

View File

@ -3,8 +3,10 @@
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #
from __future__ import annotations
import json
import threading import threading
from typing import TYPE_CHECKING
from fm_api import constants as fm_const from fm_api import constants as fm_const
from fm_api import fm_api from fm_api import fm_api
@ -12,14 +14,20 @@ from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
from dccommon import consts as dccommon_consts from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.dcmanager_v1 import DcmanagerClient
from dcmanager.common import consts from dcmanager.common import consts
from dcmanager.common import context from dcmanager.common import context
from dcmanager.common.i18n import _ from dcmanager.common.i18n import _
from dcmanager.common import manager from dcmanager.common import manager
from dcmanager.common import utils from dcmanager.common import utils
from dcmanager.db import api as db_api from dcmanager.db import api as db_api
from dcmanager.db.sqlalchemy import models
from dcmanager.manager.system_peer_manager import SystemPeerManager from dcmanager.manager.system_peer_manager import SystemPeerManager
# Use TYPE_CHECKING to avoid circular import
if TYPE_CHECKING:
from dcmanager.manager.subcloud_manager import SubcloudManager
CONF = cfg.CONF CONF = cfg.CONF
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
@ -27,7 +35,9 @@ LOG = logging.getLogger(__name__)
class PeerGroupAuditManager(manager.Manager): class PeerGroupAuditManager(manager.Manager):
"""Manages audit related tasks.""" """Manages audit related tasks."""
def __init__(self, subcloud_manager, peer_group_id, *args, **kwargs): def __init__(
self, subcloud_manager: SubcloudManager, peer_group_id: int, *args, **kwargs
):
LOG.debug(_("PeerGroupAuditManager initialization...")) LOG.debug(_("PeerGroupAuditManager initialization..."))
super().__init__(service_name="peer_group_audit_manager", *args, **kwargs) super().__init__(service_name="peer_group_audit_manager", *args, **kwargs)
self.context = context.get_admin_context() self.context = context.get_admin_context()
@ -52,8 +62,8 @@ class PeerGroupAuditManager(manager.Manager):
@staticmethod @staticmethod
def _get_association_sync_status_from_peer_site( def _get_association_sync_status_from_peer_site(
dc_client, system_peer, peer_group_id dc_client: DcmanagerClient, system_peer: models.SystemPeer, peer_group_id: int
): ) -> str:
try: try:
# Get peer site system peer # Get peer site system peer
dc_peer_system_peer = dc_client.get_system_peer( dc_peer_system_peer = dc_client.get_system_peer(
@ -81,8 +91,11 @@ class PeerGroupAuditManager(manager.Manager):
) )
def _get_local_subclouds_to_update_and_delete( def _get_local_subclouds_to_update_and_delete(
self, local_peer_group, remote_subclouds, remote_sync_status self,
): local_peer_group: models.SubcloudPeerGroup,
remote_subclouds: list[dict],
remote_sync_status: str,
) -> tuple[list[models.Subcloud], list[models.Subcloud], bool]:
local_subclouds_to_update = list() local_subclouds_to_update = list()
local_subclouds_to_delete = list() local_subclouds_to_delete = list()
any_rehome_failed = False any_rehome_failed = False
@ -112,8 +125,13 @@ class PeerGroupAuditManager(manager.Manager):
# indicating any bootstrap values/address updates to # indicating any bootstrap values/address updates to
# the subcloud on the remote site. # the subcloud on the remote site.
if remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: if remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
LOG.info(
"Peer association is out-of-sync, syncing rehome "
f"data of subcloud '{local_subcloud.name}' from "
"peer to current site"
)
self._sync_rehome_data( self._sync_rehome_data(
local_subcloud.id, remote_subcloud.get("rehome_data") local_subcloud, remote_subcloud.get("rehome_data")
) )
elif remote_subcloud.get("deploy-status") in ( elif remote_subcloud.get("deploy-status") in (
consts.DEPLOY_STATE_REHOME_FAILED, consts.DEPLOY_STATE_REHOME_FAILED,
@ -134,7 +152,7 @@ class PeerGroupAuditManager(manager.Manager):
return local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed return local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed
def _set_local_subcloud_to_secondary(self, subcloud): def _set_local_subcloud_to_secondary(self, subcloud: models.Subcloud) -> None:
try: try:
LOG.info("Set local subcloud %s to secondary" % subcloud.name) LOG.info("Set local subcloud %s to secondary" % subcloud.name)
# There will be an exception when unmanage # There will be an exception when unmanage
@ -155,18 +173,42 @@ class PeerGroupAuditManager(manager.Manager):
) )
raise e raise e
def _sync_rehome_data(self, subcloud_id, rehome_data): def _sync_rehome_data(self, subcloud: models.Subcloud, rehome_data: str) -> None:
db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data) try:
remote_rehome_data = json.loads(rehome_data)
local_rehome_data = json.loads(subcloud.rehome_data)
def audit(self, system_peer, remote_peer_group, local_peer_group): # The systemcontroller_gateway_address can't be synced from the
# peer to the local site as it's specific to each site
remote_rehome_data["saved_payload"]["systemcontroller_gateway_address"] = (
local_rehome_data["saved_payload"]["systemcontroller_gateway_address"]
)
new_rehome_data = json.dumps(remote_rehome_data)
db_api.subcloud_update(
self.context, subcloud.id, rehome_data=new_rehome_data
)
except Exception as e:
LOG.error(
"Unable to sync rehome data of subcloud "
f"'{subcloud.name}' from peer to current site: {str(e)}"
)
raise
def audit(
self,
system_peer: models.SystemPeer,
remote_peer_group: dict,
local_peer_group: models.SubcloudPeerGroup,
) -> None:
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING: if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
LOG.info("Local peer group in migrating state, quit audit") LOG.info("Local peer group in migrating state, quit audit")
return return
LOG.info( LOG.info(
"Auditing remote subcloud peer group:[%s] migration_status:[%s] " "Auditing remote subcloud peer group: [%s], migration_status: [%s], "
"group_priority[%s], local subcloud peer group:[%s] " "group_priority: [%s], local subcloud peer group: [%s], "
"migration_status:[%s] group_priority[%s]" "migration_status: [%s], group_priority: [%s]"
% ( % (
remote_peer_group.get("peer_group_name"), remote_peer_group.get("peer_group_name"),
remote_peer_group.get("migration_status"), remote_peer_group.get("migration_status"),
@ -277,8 +319,8 @@ class PeerGroupAuditManager(manager.Manager):
) )
LOG.exception( LOG.exception(
f"Failed to delete local subcloud [{subcloud.name}] that does " f"Failed to delete local subcloud [{subcloud.name}] that does "
"not exist under the same subcloud_peer_group on peer site, " "not exist under the same subcloud_peer_group on peer site "
f"err: {e}" f"{system_peer.peer_name}, err: {e}"
) )
raise e raise e
@ -313,7 +355,12 @@ class PeerGroupAuditManager(manager.Manager):
# If remote peer group migration_status is 'None' # If remote peer group migration_status is 'None'
self.require_audit_flag = False self.require_audit_flag = False
def _clear_or_raise_alarm(self, system_peer, local_peer_group, remote_peer_group): def _clear_or_raise_alarm(
self,
system_peer: models.SystemPeer,
local_peer_group: models.SubcloudPeerGroup,
remote_peer_group: dict,
) -> None:
# If local subcloud peer group's group_priority is # If local subcloud peer group's group_priority is
# lower than remote subcloud peer group's group_priority, # lower than remote subcloud peer group's group_priority,
# an alarm will be raised. # an alarm will be raised.
@ -325,7 +372,7 @@ class PeerGroupAuditManager(manager.Manager):
if local_peer_group.group_priority < remote_peer_group.get("group_priority"): if local_peer_group.group_priority < remote_peer_group.get("group_priority"):
LOG.warning( LOG.warning(
f"Alarm: local subcloud peer group [{local_peer_group.peer_group_name}]" f"Alarm: local subcloud peer group [{local_peer_group.peer_group_name}]"
f" is managed by remote system [{system_peer.peer_name}]" f" is managed by remote system peer [{system_peer.peer_name}]"
) )
try: try:
fault = fm_api.Fault( fault = fm_api.Fault(
@ -336,7 +383,7 @@ class PeerGroupAuditManager(manager.Manager):
severity=fm_const.FM_ALARM_SEVERITY_MAJOR, severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
reason_text=( reason_text=(
"Subcloud peer group (peer_group_name=%s) is managed by " "Subcloud peer group (peer_group_name=%s) is managed by "
"remote system (peer_uuid=%s) with a lower priority." "remote system peer (peer_uuid=%s) with a lower priority."
% (local_peer_group.peer_group_name, system_peer.peer_uuid) % (local_peer_group.peer_group_name, system_peer.peer_uuid)
), ),
alarm_type=fm_const.FM_ALARM_TYPE_0, alarm_type=fm_const.FM_ALARM_TYPE_0,
@ -375,12 +422,12 @@ class PeerGroupAuditManager(manager.Manager):
try: try:
self.audit(system_peer, remote_peer_group, local_peer_group) self.audit(system_peer, remote_peer_group, local_peer_group)
except Exception as e: except Exception as e:
LOG.exception("audit error occured: %s" % e) LOG.exception("audit error occurred: %s" % e)
def stop(self): def stop(self):
if self.thread: if self.thread:
self.thread.join() self.thread.join()
LOG.info(f"stopped peer group {self.peer_group_id} audit thread") LOG.info(f"Stopped peer group {self.peer_group_id} audit thread")
else: else:
LOG.info(f"No peer group {self.peer_group_id} audit thread to stop") LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")
@ -402,12 +449,14 @@ class PeerGroupAuditManager(manager.Manager):
): ):
LOG.info( LOG.info(
f"Audit peer group [{local_peer_group.peer_group_name}] " f"Audit peer group [{local_peer_group.peer_group_name}] "
f"with remote system {system_peer.peer_name}" f"with remote system peer {system_peer.peer_name}"
) )
self.start(system_peer, remote_peer_group, local_peer_group) self.start(system_peer, remote_peer_group, local_peer_group)
@staticmethod @staticmethod
def send_audit_peer_group(system_peers, peer_group): def send_audit_peer_group(
system_peers: list[models.SystemPeer], peer_group: models.SubcloudPeerGroup
):
if not system_peers: if not system_peers:
return return
local_system = utils.get_local_system() local_system = utils.get_local_system()

View File

@ -9,6 +9,7 @@ import threading
from fm_api import constants as fm_const from fm_api import constants as fm_const
from fm_api import fm_api from fm_api import fm_api
from keystoneauth1 import exceptions as ks_exceptions
from oslo_config import cfg from oslo_config import cfg
from oslo_log import log as logging from oslo_log import log as logging
@ -16,15 +17,58 @@ from dcmanager.common import consts
from dcmanager.common import context from dcmanager.common import context
from dcmanager.common import manager from dcmanager.common import manager
from dcmanager.db import api as db_api from dcmanager.db import api as db_api
from dcmanager.db.sqlalchemy import models
from dcmanager.manager import peer_group_audit_manager as pgam from dcmanager.manager import peer_group_audit_manager as pgam
from dcmanager.manager.subcloud_manager import SubcloudManager
from dcmanager.manager.system_peer_manager import SystemPeerManager from dcmanager.manager.system_peer_manager import SystemPeerManager
CONF = cfg.CONF CONF = cfg.CONF
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
# Upon detecting that the secondary site is REACHABLE again, the PGA sync_status
# will be set for BOTH sites by the primary site monitor thread as follows:
SECONDARY_SITE_REACHABLE_TRANSITIONS = {
# The UNKNOWN and FAILED are set when the secondary site becomes unreachable
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC,
# The SYNCING and IN_SYNC status can happen when first creating the peer and the
# association. The association creation will set the status to SYNCING/IN_SYNC,
# and the monitor thread might detect the peer availability after that, so we
# don't modify the status
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_SYNCING,
# A similar situation can happen with OUT_OF_SYNC, if the association/peer group
# is updated before the monitor thread runs its first check, so we don't modify
# the status
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: (
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
),
}
# Upon detecting that the secondary site is UNREACHABLE, the PGA sync_status
# will be set for the PIRMARY SITE ONLY by the primary site monitor thread as follows:
SECONDARY_SITE_UNREACHABLE_TRANSITIONS = {
# After the secondary site becomes unreachable, the only valid sync statuses
# are UNKNOWN or FAILED
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_FAILED,
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: consts.ASSOCIATION_SYNC_STATUS_FAILED,
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_FAILED,
}
class PeerMonitor(object): class PeerMonitor(object):
def __init__(self, peer, context, subcloud_manager): """Monitors and manages the connection status of a single DC system peer
The monitor runs in a separate thread and periodically checks the peer's
availability using heartbeat mechanisms. It handles both failure and recovery
scenarios, updating system states and managing fault alarms accordingly.
"""
def __init__(
self, peer: models.SystemPeer, context, subcloud_manager: SubcloudManager
):
self.peer = peer self.peer = peer
self.thread = None self.thread = None
self.exit_flag = threading.Event() self.exit_flag = threading.Event()
@ -32,21 +76,22 @@ class PeerMonitor(object):
self.context = context self.context = context
self.subcloud_manager = subcloud_manager self.subcloud_manager = subcloud_manager
self.peer_group_id_set = set() self.peer_group_id_set = set()
self.failure_count = 0
# key: peer_group_id # key: peer_group_id
# value: PeerGroupAuditManager object # value: PeerGroupAuditManager object
self.peer_group_audit_obj_map = dict() self.peer_group_audit_obj_map: dict[int, pgam.PeerGroupAuditManager] = dict()
def _clear_failure(self): def _clear_failure(self):
alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED
entity_instance_id = "peer=%s" % self.peer.peer_uuid entity_instance_id = f"peer={self.peer.peer_uuid}"
try: try:
fault = self.fm_api.get_fault(alarm_id, entity_instance_id) fault = self.fm_api.get_fault(alarm_id, entity_instance_id)
if fault: if fault:
self.fm_api.clear_fault(alarm_id, entity_instance_id) self.fm_api.clear_fault(alarm_id, entity_instance_id)
except Exception as e: except Exception as e:
LOG.exception( LOG.exception(
"Problem clearing fault for peer %s, alarm_id=%s error: %s" f"Problem clearing fault for peer {self.peer.peer_name}, "
% (self.peer.peer_uuid, alarm_id, e) f"alarm_id={alarm_id} error: {str(e)}"
) )
def _raise_failure(self): def _raise_failure(self):
@ -114,125 +159,166 @@ class PeerMonitor(object):
if not dc_peer_subcloud_peer_group_list: if not dc_peer_subcloud_peer_group_list:
LOG.warning( LOG.warning(
"Resource subcloud peer group of dc:%s not found" f"No subcloud peer groups found for DC peer: {self.peer.peer_name} "
% self.peer.manager_endpoint f"(endpoint: {self.peer.manager_endpoint})"
) )
except (ks_exceptions.ConnectFailure, ks_exceptions.ConnectTimeout) as e:
# Use warning level for common connection failures to reduce log spam
LOG.warning(f"Failed to access DC peer {self.peer.peer_name}: {str(e)}")
except Exception: except Exception:
LOG.exception("Failed to access the dc: %s" % self.peer.peer_name) LOG.exception(
f"Unexpected error while accessing DC peer {self.peer.peer_name}"
)
return failed, dc_peer_subcloud_peer_group_list return failed, dc_peer_subcloud_peer_group_list
def _update_sync_status_secondary_site_becomes_unreachable(self): def _update_sync_status_secondary_site_becomes_unreachable(self) -> None:
"""Update sync status on local site when secondary site becomes unreachable."""
# Get associations by system peer # Get associations by system peer
associations = SystemPeerManager.get_local_associations(self.context, self.peer) associations = SystemPeerManager.get_local_associations(self.context, self.peer)
for association in associations: for association in associations:
# If the association is not primary, skip it. # If the association is not primary, skip it.
if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
LOG.debug( LOG.info(
"Skip update the Association sync_status as it is not primary." f"Skip updating local association (id={association.id}) "
"sync_status as it's not primary"
) )
continue continue
# If the secondary site is down, set the association sync status
# "in-sync" -> "unknown" try:
# "unknown" -> "unknown" new_sync_status = SECONDARY_SITE_UNREACHABLE_TRANSITIONS[
# "out-of-sync" -> "failed" association.sync_status
# "syncing" -> "failed" ]
# "failed" -> "failed" except KeyError:
sync_status = consts.ASSOCIATION_SYNC_STATUS_UNKNOWN # This should never happen
message = f"Peer site ({self.peer.peer_name}) is unreachable." LOG.error(
if association.sync_status not in [ f"Unexpected sync status on association id={association.id}: "
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, f"{association.sync_status}. Updating the sync status to "
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}"
]: )
sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
LOG.info(
f"Updating local association (id={association.id}) sync_status "
f"from {association.sync_status} to {new_sync_status} due "
"to secondary site becoming unreachable"
)
db_api.peer_group_association_update( db_api.peer_group_association_update(
self.context, self.context,
association.id, association.id,
sync_status=sync_status, sync_status=new_sync_status,
sync_message=message, sync_message=f"Peer site ({self.peer.peer_name}) is unreachable.",
) )
def _update_sync_status_secondary_site_becomes_reachable(self): def _update_sync_status_secondary_site_becomes_reachable(self) -> None:
"""Update sync status on both sites when secondary site becomes reachable."""
# Get associations by system peer # Get associations by system peer
associations = SystemPeerManager.get_local_associations(self.context, self.peer) associations = SystemPeerManager.get_local_associations(self.context, self.peer)
for association in associations: for association in associations:
# If the association is not primary, skip it. # If the association is not primary, skip it.
if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
LOG.debug( LOG.info(
"Skip update Peer Site Association sync_status as " "Skip updating association sync_status on both sites as "
"current site Association is not primary." "current site association is not primary."
) )
continue continue
# Upon detecting that the secondary site is reachable again,
# the PGA sync_status will be set for both sites by the primary try:
# site monitor thread as follows: new_sync_status = SECONDARY_SITE_REACHABLE_TRANSITIONS[
# "unknown" -> "in-sync" association.sync_status
# "failed" -> "out-of-sync" ]
sync_status = consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC except KeyError:
if association.sync_status == consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: # This should never happen
sync_status = consts.ASSOCIATION_SYNC_STATUS_IN_SYNC LOG.error(
f"Unexpected sync status on association id={association.id}: "
f"{association.sync_status}. Updating the sync status to "
f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}"
)
new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
dc_local_pg = db_api.subcloud_peer_group_get( dc_local_pg = db_api.subcloud_peer_group_get(
self.context, association.peer_group_id self.context, association.peer_group_id
) )
LOG.info(
f"Updating local association (id={association.id}) sync_status "
f"from {association.sync_status} to {new_sync_status} due "
"to secondary site becoming reachable"
)
SystemPeerManager.update_sync_status( SystemPeerManager.update_sync_status(
self.context, self.context,
self.peer, self.peer,
sync_status, new_sync_status,
dc_local_pg, dc_local_pg,
association=association, association=association,
) )
def _do_monitor_peer(self): def _update_peer_state(self, state: str) -> models.SystemPeer:
failure_count = 0 """Update peer availability state in database."""
LOG.info("Start monitoring thread for peer %s" % self.peer.peer_name) return db_api.system_peer_update(
UNAVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE self.context, self.peer.id, availability_state=state
AVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE )
# Do the actual peer monitor.
while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval): def _handle_peer_failure(self) -> None:
try: """Handle peer failure by raising alarms and updating states."""
# Get system peer from DB self.failure_count += 1
self.peer = db_api.system_peer_get(self.context, self.peer.id) if self.failure_count >= self.peer.heartbeat_failure_threshold:
failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list()
if failed:
failure_count += 1
if failure_count >= self.peer.heartbeat_failure_threshold:
# heartbeat_failure_threshold reached.
LOG.warning( LOG.warning(
"DC %s heartbeat failed, Raising alarm" f"DC peer '{self.peer.peer_name}' heartbeat failed, raising alarm"
% self.peer.peer_name
) )
self._raise_failure() self._raise_failure()
db_api.system_peer_update( self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE)
self.context,
self.peer.id,
availability_state=UNAVAILABLE_STATE,
)
# pylint: disable=line-too-long
self._update_sync_status_secondary_site_becomes_unreachable() self._update_sync_status_secondary_site_becomes_unreachable()
failure_count = 0 self.failure_count = 0
self._set_require_audit_flag_to_associated_peer_groups() self._set_require_audit_flag_to_associated_peer_groups()
else:
failure_count = 0 def _handle_peer_recovery(self, remote_pg_list: list) -> None:
"""Handle peer recovery by clearing alarms and restoring states."""
self.failure_count = 0
self._audit_local_peer_groups(remote_pg_list) self._audit_local_peer_groups(remote_pg_list)
if self.peer.availability_state != AVAILABLE_STATE: if (
db_api.system_peer_update( self.peer.availability_state
self.context, != consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
self.peer.id, ):
availability_state=AVAILABLE_STATE, LOG.info(f"DC peer '{self.peer.peer_name}' is back online, clearing alarm")
) self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE)
# pylint: disable=line-too-long
self._update_sync_status_secondary_site_becomes_reachable() self._update_sync_status_secondary_site_becomes_reachable()
LOG.info("DC %s back online, clear alarm" % self.peer.peer_name)
self._clear_failure() self._clear_failure()
def _check_peer_status(self) -> None:
"""Perform a single peer status check and handle the result."""
self.peer = db_api.system_peer_get(self.context, self.peer.id)
failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list()
if failed:
self._handle_peer_failure()
else:
self._handle_peer_recovery(remote_pg_list)
def _do_monitor_peer(self) -> None:
"""Monitor peer connectivity and handle state changes."""
LOG.info(f"Start monitoring thread for peer '{self.peer.peer_name}'")
# Run first check immediately
try:
self._check_peer_status()
except Exception as e:
LOG.exception(f"Initial check failed for peer '{self.peer.peer_name}': {e}")
# Continue with periodic checks
while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval):
try:
self._check_peer_status()
except Exception as e: except Exception as e:
LOG.exception( LOG.exception(
"Got exception monitoring peer %s error: %s" f"Unexpected error monitoring peer '{self.peer.peer_name}': {e}"
% (self.peer.peer_name, e)
)
LOG.info(
"Caught graceful exit signal for peer monitor %s" % self.peer.peer_name
) )
LOG.info(f"Gracefully exiting monitor thread for peer '{self.peer.peer_name}'")
def _audit_local_peer_groups(self, remote_pg_list): def _audit_local_peer_groups(self, remote_pg_list):
# Generate a dict index by remote peer group name # Generate a dict index by remote peer group name
remote_pg_dict = { remote_pg_dict = {
@ -266,7 +352,9 @@ class PeerMonitor(object):
for pgam_obj in self.peer_group_audit_obj_map.values(): for pgam_obj in self.peer_group_audit_obj_map.values():
pgam_obj.require_audit_flag = True pgam_obj.require_audit_flag = True
def audit_specific_local_peer_group(self, peer_group, remote_peer_group): def audit_specific_local_peer_group(
self, peer_group: models.SubcloudPeerGroup, remote_peer_group
):
msg = None msg = None
if peer_group.id in self.peer_group_audit_obj_map: if peer_group.id in self.peer_group_audit_obj_map:
pgam_obj = self.peer_group_audit_obj_map[peer_group.id] pgam_obj = self.peer_group_audit_obj_map[peer_group.id]
@ -324,17 +412,16 @@ class PeerMonitor(object):
class PeerMonitorManager(manager.Manager): class PeerMonitorManager(manager.Manager):
"""Manages tasks related to peer monitor.""" """Manages tasks related to peer monitor."""
def __init__(self, subcloud_manager): def __init__(self, subcloud_manager: SubcloudManager):
LOG.debug("PeerMonitorManager initialization...") LOG.debug("PeerMonitorManager initialization...")
super(PeerMonitorManager, self).__init__(service_name="peer_monitor_manager") super(PeerMonitorManager, self).__init__(service_name="peer_monitor_manager")
self.peer_monitor = dict()
self.context = context.get_admin_context() self.context = context.get_admin_context()
self.subcloud_manager = subcloud_manager self.subcloud_manager = subcloud_manager
# key: system_peer_id # key: system_peer_id
# value: PeerMonitor object # value: PeerMonitor object
self.peer_monitor_thread_map = dict() self.peer_monitor_thread_map: dict[int, PeerMonitor] = dict()
def _remove_peer_monitor_task(self, system_peer_id): def _remove_peer_monitor_task(self, system_peer_id):
peer_mon_obj = self.peer_monitor_thread_map[system_peer_id] peer_mon_obj = self.peer_monitor_thread_map[system_peer_id]

View File

@ -3674,6 +3674,11 @@ class SubcloudManager(manager.Manager):
and systemcontroller_gateway_ip.split(",")[0] and systemcontroller_gateway_ip.split(",")[0]
!= subcloud.systemcontroller_gateway_ip != subcloud.systemcontroller_gateway_ip
): ):
LOG.info(
f"The systemcontroller_gateway_ip for subcloud {subcloud.name} "
f"was updated from {subcloud.systemcontroller_gateway_ip} to "
f"{systemcontroller_gateway_ip.split(',')[0]}. Replacing routes..."
)
m_ks_client = OpenStackDriver( m_ks_client = OpenStackDriver(
region_name=dccommon_consts.DEFAULT_REGION_NAME, region_name=dccommon_consts.DEFAULT_REGION_NAME,
region_clients=None, region_clients=None,

View File

@ -9,6 +9,7 @@ from contextlib import nullcontext
import functools import functools
import json import json
import tempfile import tempfile
from typing import Optional
from eventlet import greenpool from eventlet import greenpool
from oslo_log import log as logging from oslo_log import log as logging
@ -54,7 +55,9 @@ class SystemPeerManager(manager.Manager):
) )
@staticmethod @staticmethod
def get_local_associations(ctx, peer, local_pg=None): def get_local_associations(
ctx, peer: models.SystemPeer, local_pg: models.SubcloudPeerGroup = None
) -> list[models.PeerGroupAssociation]:
if local_pg is None: if local_pg is None:
# Get associations by system peer id # Get associations by system peer id
return db_api.peer_group_association_get_by_system_peer_id(ctx, peer.id) return db_api.peer_group_association_get_by_system_peer_id(ctx, peer.id)
@ -93,8 +96,12 @@ class SystemPeerManager(manager.Manager):
""" """
def _update_association_on_peer_site( def _update_association_on_peer_site(
peer, sync_status, local_pg, remote_pg, message peer: models.SystemPeer,
): sync_status: str,
local_pg: models.SubcloudPeerGroup,
remote_pg: dict,
message: str,
) -> tuple[str, str]:
try: try:
# Get peer site dcmanager client # Get peer site dcmanager client
dc_client = SystemPeerManager.get_peer_dc_client(peer) dc_client = SystemPeerManager.get_peer_dc_client(peer)
@ -118,21 +125,21 @@ class SystemPeerManager(manager.Manager):
# Update peer site association sync_status only if the # Update peer site association sync_status only if the
# sync_status is different from the current sync_status # sync_status is different from the current sync_status
if dc_peer_association.get("sync_status") != sync_status: if dc_peer_association.get("sync-status") != sync_status:
# Update peer site association sync_status # Update peer site association sync_status
dc_peer_association_id = dc_peer_association.get("id") dc_peer_association_id = dc_peer_association.get("id")
dc_client.update_peer_group_association_sync_status( dc_client.update_peer_group_association_sync_status(
dc_peer_association_id, sync_status dc_peer_association_id, sync_status
) )
LOG.info( LOG.info(
f"Updated Peer site {dc_peer_system_peer.get('id')} " f"Updated peer site '{dc_peer_system_peer.get('peer-name')}' "
f"Peer Group Association {dc_peer_association_id} " f"peer group association {dc_peer_association_id} "
f"sync_status to {sync_status}." f"sync_status to '{sync_status}'"
) )
except Exception as e: except Exception as e:
message = ( message = (
f"Failed to Update Peer Site ({peer.peer_uuid}) " f"Failed to update peer site '{peer.peer_name}' "
f"Association sync_status to {sync_status}." f"association sync_status to '{sync_status}'"
) )
LOG.exception(f"{message} Error: {e}") LOG.exception(f"{message} Error: {e}")
sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
@ -146,9 +153,9 @@ class SystemPeerManager(manager.Manager):
for association in associations: for association in associations:
if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY: if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
LOG.debug( LOG.info(
f"Skip update Peer Site association sync_status to {sync_status} " "Skipping updating peer and local site association "
"as current site Association is not primary." f"sync_status to '{sync_status}' as local site is not primary"
) )
continue continue
@ -165,13 +172,16 @@ class SystemPeerManager(manager.Manager):
association.sync_status == sync_status association.sync_status == sync_status
and sync_status != consts.ASSOCIATION_SYNC_STATUS_FAILED and sync_status != consts.ASSOCIATION_SYNC_STATUS_FAILED
): ):
LOG.debug( LOG.info(
"Skip update current site association sync_status to " f"Local association {association.id} sync_status "
f"{sync_status} as current site Association is already " f"already set to '{sync_status}', skipping update"
"in the same status."
) )
continue continue
# Update primary site association sync_status # Update primary site association sync_status
LOG.info(
f"Updating local association {association.id} sync_status "
f"from '{association.sync_status}' to '{sync_status}'"
)
db_api.peer_group_association_update( db_api.peer_group_association_update(
ctx, association.id, sync_status=sync_status, sync_message=message ctx, association.id, sync_status=sync_status, sync_message=message
) )
@ -222,10 +232,11 @@ class SystemPeerManager(manager.Manager):
dccommon_consts.SYSTEM_CONTROLLER_NAME, dccommon_consts.SYSTEM_CONTROLLER_NAME,
p_ks_client.session, p_ks_client.session,
endpoint=dc_endpoint, endpoint=dc_endpoint,
peer=peer,
) )
@staticmethod @staticmethod
def get_peer_subcloud(dc_client, subcloud_name): def get_peer_subcloud(dc_client: DcmanagerClient, subcloud_name: str) -> dict:
"""Get subcloud on peer site if exist. """Get subcloud on peer site if exist.
:param dc_client: the dcmanager client object :param dc_client: the dcmanager client object
@ -235,7 +246,10 @@ class SystemPeerManager(manager.Manager):
peer_subcloud = dc_client.get_subcloud(subcloud_name) peer_subcloud = dc_client.get_subcloud(subcloud_name)
return peer_subcloud return peer_subcloud
except dccommon_exceptions.SubcloudNotFound: except dccommon_exceptions.SubcloudNotFound:
LOG.warn(f"Subcloud {subcloud_name} does not exist on peer site.") LOG.warn(
f"Subcloud '{subcloud_name}' does not exist on "
f"peer site '{dc_client.peer.peer_name}'"
)
@staticmethod @staticmethod
def get_subcloud_deploy_status(subcloud): def get_subcloud_deploy_status(subcloud):
@ -258,7 +272,9 @@ class SystemPeerManager(manager.Manager):
return True return True
@staticmethod @staticmethod
def delete_peer_secondary_subcloud(dc_client, subcloud_ref): def delete_peer_secondary_subcloud(
dc_client: DcmanagerClient, subcloud_ref: str
) -> None:
"""Delete secondary subcloud on peer site. """Delete secondary subcloud on peer site.
:param dc_client: the dcmanager client object :param dc_client: the dcmanager client object
@ -284,14 +300,17 @@ class SystemPeerManager(manager.Manager):
return return
dc_client.delete_subcloud(subcloud_ref) dc_client.delete_subcloud(subcloud_ref)
LOG.info(f"Deleted Subcloud {subcloud_ref} on peer site.") LOG.info(
f"Deleted subcloud '{subcloud_ref}' on peer site "
f"'{dc_client.peer.peer_name}'"
)
@staticmethod @staticmethod
def _run_parallel_group_operation(op_type, op_function, thread_pool, subclouds): def _run_parallel_group_operation(op_type, op_function, thread_pool, subclouds):
"""Run parallel group operation on subclouds.""" """Run parallel group operation on subclouds."""
failed_subclouds = [] failed_subclouds = []
processed = 0 processed = 0
error_msg = {} # Dictinary to store error message for each subcloud error_msg = {} # Dictionary to store error message for each subcloud
for subcloud, success in thread_pool.imap(op_function, subclouds): for subcloud, success in thread_pool.imap(op_function, subclouds):
processed += 1 processed += 1
@ -365,9 +384,9 @@ class SystemPeerManager(manager.Manager):
region_name, files, data, is_region_name=True region_name, files, data, is_region_name=True
) )
LOG.info( LOG.info(
f"Updated Subcloud {dc_peer_subcloud.get('name')} " f"Updated subcloud '{dc_peer_subcloud.get('name')}' "
f"(region_name: {dc_peer_subcloud.get('region-name')}) " f"(region_name: {dc_peer_subcloud.get('region-name')}) "
"on peer site." f"on peer site '{dc_client.peer.peer_name}'"
) )
else: else:
# Create subcloud on peer site if not exist # Create subcloud on peer site if not exist
@ -375,13 +394,15 @@ class SystemPeerManager(manager.Manager):
files, data files, data
) )
LOG.info( LOG.info(
f"Created Subcloud {dc_peer_subcloud.get('name')} " f"Created subcloud '{dc_peer_subcloud.get('name')}' "
f"(region_name: {dc_peer_subcloud.get('region-name')}) " f"(region_name: {dc_peer_subcloud.get('region-name')}) "
"on peer site." f"on peer site '{dc_client.peer.peer_name}'"
) )
LOG.debug( LOG.info(
f"Updating subcloud {subcloud_name} (region_name: {region_name}) " f"Updating subcloud '{subcloud_name}' (region_name: "
f"with subcloud peer group id {dc_peer_pg_id} on peer site." f"{dc_peer_subcloud.get('region-name')}) with subcloud "
f"peer group id {dc_peer_pg_id} on peer site "
f"'{dc_client.peer.peer_name}'"
) )
# Update subcloud associated peer group on peer site. # Update subcloud associated peer group on peer site.
# The peer_group update will check the header and should # The peer_group update will check the header and should
@ -413,12 +434,15 @@ class SystemPeerManager(manager.Manager):
except Exception as e: except Exception as e:
subcloud.msg = str(e) # Store error message for subcloud subcloud.msg = str(e) # Store error message for subcloud
LOG.error( LOG.error(
f"Failed to add/update Subcloud {subcloud_name} " f"Failed to add/update subcloud '{subcloud_name}' "
f"(region_name: {region_name}) on peer site: {str(e)}" f"(region_name: {region_name}) on peer site "
f"'{dc_client.peer.peer_name}': {str(e)}"
) )
return subcloud, False return subcloud, False
def _delete_subcloud(self, dc_client, subcloud): def _delete_subcloud(
self, dc_client: DcmanagerClient, subcloud: models.Subcloud
) -> tuple[models.Subcloud, bool]:
"""Delete subcloud on peer site in parallel.""" """Delete subcloud on peer site in parallel."""
try: try:
subcloud_name = subcloud.get("name") subcloud_name = subcloud.get("name")
@ -427,11 +451,12 @@ class SystemPeerManager(manager.Manager):
except Exception as e: except Exception as e:
subcloud.msg = str(e) subcloud.msg = str(e)
LOG.exception( LOG.exception(
f"Failed to delete Subcloud {subcloud_name} on peer site: {str(e)}" f"Failed to delete subcloud '{subcloud_name}' on "
f"peer site '{dc_client.peer.peer_name}': {str(e)}"
) )
return subcloud, False return subcloud, False
def _is_valid_for_subcloud_sync(self, subcloud): def _is_valid_for_subcloud_sync(self, subcloud: models.Subcloud) -> str:
"""Verify subcloud data for sync.""" """Verify subcloud data for sync."""
subcloud_name = subcloud.get("name") subcloud_name = subcloud.get("name")
region_name = subcloud.get("region_name") region_name = subcloud.get("region_name")
@ -485,10 +510,12 @@ class SystemPeerManager(manager.Manager):
return VERIFY_SUBCLOUD_SYNC_VALID return VERIFY_SUBCLOUD_SYNC_VALID
def _validate_subclouds_for_sync(self, subclouds, dc_client): def _validate_subclouds_for_sync(
self, subclouds: list[models.Subcloud], dc_client: DcmanagerClient
) -> tuple[list[models.Subcloud], dict[str, str]]:
"""Validate subclouds for sync.""" """Validate subclouds for sync."""
valid_subclouds = [] valid_subclouds = []
error_msg = {} # Dictinary to store error message for each subcloud error_msg = {} # Dictionary to store error message for each subcloud
for subcloud in subclouds: for subcloud in subclouds:
subcloud_name = subcloud.get("name") subcloud_name = subcloud.get("name")
@ -510,8 +537,8 @@ class SystemPeerManager(manager.Manager):
peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name) peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name)
if not peer_subcloud: if not peer_subcloud:
LOG.info( LOG.info(
f"Subcloud {subcloud_name} (region_name: {region_name}) does " f"Subcloud '{subcloud_name}' (region_name: {region_name}) does "
"not exist on peer site." f"not exist on peer site '{dc_client.peer.peer_name}'"
) )
valid_subclouds.append(subcloud) valid_subclouds.append(subcloud)
continue continue
@ -541,7 +568,9 @@ class SystemPeerManager(manager.Manager):
return valid_subclouds, error_msg return valid_subclouds, error_msg
def _sync_subclouds(self, context, peer, dc_local_pg_id, dc_peer_pg_id): def _sync_subclouds(
self, context, peer: models.SystemPeer, dc_local_pg_id: int, dc_peer_pg_id: int
) -> dict[str, str]:
"""Sync subclouds of local peer group to peer site. """Sync subclouds of local peer group to peer site.
:param context: request context object :param context: request context object
@ -572,7 +601,10 @@ class SystemPeerManager(manager.Manager):
) )
error_msg.update(sync_error_msg) error_msg.update(sync_error_msg)
LOG.info("Subcloud peer sync operation finished") LOG.info(
f"Completed subcloud peer sync with peer '{peer.peer_name}': processed "
f"{len(subclouds_to_sync)} subclouds, {len(failed_subclouds)} failed"
)
dc_local_region_names = set() dc_local_region_names = set()
for subcloud in subclouds: for subcloud in subclouds:
@ -591,7 +623,10 @@ class SystemPeerManager(manager.Manager):
dc_peer_subcloud_diff_names = dc_peer_region_names - dc_local_region_names dc_peer_subcloud_diff_names = dc_peer_region_names - dc_local_region_names
for subcloud_to_delete in dc_peer_subcloud_diff_names: for subcloud_to_delete in dc_peer_subcloud_diff_names:
try: try:
LOG.debug(f"Deleting Subcloud name {subcloud_to_delete} on peer site.") LOG.info(
f"Deleting subcloud '{subcloud_to_delete}' on "
f"peer site '{dc_client.peer.peer_name}'"
)
self.delete_peer_secondary_subcloud(dc_client, subcloud_to_delete) self.delete_peer_secondary_subcloud(dc_client, subcloud_to_delete)
except Exception as e: except Exception as e:
msg = f"Subcloud delete failed: {str(e)}" msg = f"Subcloud delete failed: {str(e)}"
@ -664,7 +699,7 @@ class SystemPeerManager(manager.Manager):
context, peer_group_id context, peer_group_id
) )
if not associations: if not associations:
LOG.debug("No association found for peer group %s" % peer_group_id) LOG.info(f"No association found for peer group '{peer_group_id}'")
else: else:
for association in associations: for association in associations:
if sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: if sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
@ -722,7 +757,7 @@ class SystemPeerManager(manager.Manager):
sync_status=new_sync_status, sync_status=new_sync_status,
sync_message=new_sync_message, sync_message=new_sync_message,
) )
LOG.debug( LOG.info(
f"Updated Local Peer Group Association {association.id} " f"Updated Local Peer Group Association {association.id} "
f"sync_status to {new_sync_status}." f"sync_status to {new_sync_status}."
) )
@ -786,8 +821,11 @@ class SystemPeerManager(manager.Manager):
return success_peer_ids, failed_peer_ids return success_peer_ids, failed_peer_ids
def _get_non_primary_association( def _get_non_primary_association(
self, dc_client, dc_peer_system_peer_id, dc_peer_pg_id self,
): dc_client: DcmanagerClient,
dc_peer_system_peer_id: int,
dc_peer_pg_id: int,
) -> Optional[dict]:
"""Get non-primary Association from peer site.""" """Get non-primary Association from peer site."""
try: try:
return dc_client.get_peer_group_association_with_peer_id_and_pg_id( return dc_client.get_peer_group_association_with_peer_id_and_pg_id(
@ -795,18 +833,23 @@ class SystemPeerManager(manager.Manager):
) )
except dccommon_exceptions.PeerGroupAssociationNotFound: except dccommon_exceptions.PeerGroupAssociationNotFound:
LOG.error( LOG.error(
"Peer Group association does not exist on peer site. " "Peer group association does not exist on peer site "
f"Peer Group ID: {dc_peer_pg_id}, Peer System Peer ID: " f"'{dc_client.peer.peer_name}'. Peer group ID: {dc_peer_pg_id}, "
f"{dc_peer_system_peer_id}" f"peer system peer ID: {dc_peer_system_peer_id}"
) )
return None return None
def _get_peer_site_pg_by_name(self, dc_client, peer_group_name): def _get_peer_site_pg_by_name(
self, dc_client: DcmanagerClient, peer_group_name: str
) -> Optional[dict]:
"""Get remote Peer Group from peer site by name.""" """Get remote Peer Group from peer site by name."""
try: try:
return dc_client.get_subcloud_peer_group(peer_group_name) return dc_client.get_subcloud_peer_group(peer_group_name)
except dccommon_exceptions.SubcloudPeerGroupNotFound: except dccommon_exceptions.SubcloudPeerGroupNotFound:
LOG.error(f"Peer Group {peer_group_name} does not exist on peer site.") LOG.error(
f"Peer group '{peer_group_name}' does not exist on "
f"peer site '{dc_client.peer.peer_name}'"
)
return None return None
def _get_peer_site_system_peer(self, dc_client, peer_uuid=None): def _get_peer_site_system_peer(self, dc_client, peer_uuid=None):
@ -838,11 +881,6 @@ class SystemPeerManager(manager.Manager):
:param association_id: id of association to sync :param association_id: id of association to sync
:param sync_subclouds: Enabled to sync subclouds to peer site :param sync_subclouds: Enabled to sync subclouds to peer site
""" """
LOG.info(
f"Synchronize the association {association_id} of the "
"Subcloud Peer Group with the System Peer pointing to the "
"peer site."
)
association = db_api.peer_group_association_get(context, association_id) association = db_api.peer_group_association_get(context, association_id)
peer = db_api.system_peer_get(context, association.system_peer_id) peer = db_api.system_peer_get(context, association.system_peer_id)
@ -850,14 +888,19 @@ class SystemPeerManager(manager.Manager):
peer_group_name = dc_local_pg.peer_group_name peer_group_name = dc_local_pg.peer_group_name
dc_peer_association_id = None dc_peer_association_id = None
LOG.info(
f"Syncing peer group association {association_id} that links subcloud "
f"peer group '{peer_group_name}' with system peer '{peer.peer_name}'"
)
try: try:
# Check if the system_uuid of the peer site matches with the # Check if the system_uuid of the peer site matches with the
# peer_uuid # peer_uuid
system = self.get_peer_sysinv_client(peer).get_system() system = self.get_peer_sysinv_client(peer).get_system()
if system.uuid != peer.peer_uuid: if system.uuid != peer.peer_uuid:
LOG.error( LOG.error(
f"Peer site system uuid {system.uuid} does not match " f"Peer site '{peer.peer_name}' system uuid {system.uuid} does "
f"with the peer_uuid {peer.peer_uuid}" f"not match the configured peer_uuid: {peer.peer_uuid}"
) )
raise exceptions.PeerGroupAssociationTargetNotMatch(uuid=system.uuid) raise exceptions.PeerGroupAssociationTargetNotMatch(uuid=system.uuid)
@ -873,7 +916,8 @@ class SystemPeerManager(manager.Manager):
if dc_peer_system_peer is None: if dc_peer_system_peer is None:
failed_message = ( failed_message = (
f"System Peer {local_system_uuid} does not exist on peer site." f"A system peer with UUID of {local_system_uuid} "
f"does not exist on peer site '{peer.peer_name}'"
) )
return db_api.peer_group_association_db_model_to_dict( return db_api.peer_group_association_db_model_to_dict(
self._update_sync_status_to_failed( self._update_sync_status_to_failed(
@ -896,7 +940,7 @@ class SystemPeerManager(manager.Manager):
dc_peer_pg = dc_client.add_subcloud_peer_group(**peer_group_kwargs) dc_peer_pg = dc_client.add_subcloud_peer_group(**peer_group_kwargs)
LOG.info( LOG.info(
f"Created Subcloud Peer Group {peer_group_name} on " f"Created Subcloud Peer Group {peer_group_name} on "
f"peer site. ID is {dc_peer_pg.get('id')}." f"peer site '{peer.peer_name}'. ID is {dc_peer_pg.get('id')}."
) )
dc_peer_pg_id = dc_peer_pg.get("id") dc_peer_pg_id = dc_peer_pg.get("id")
dc_peer_pg_priority = dc_peer_pg.get("group_priority") dc_peer_pg_priority = dc_peer_pg.get("group_priority")
@ -921,7 +965,7 @@ class SystemPeerManager(manager.Manager):
) )
LOG.info( LOG.info(
"Created 'non-primary' Peer Group Association " "Created 'non-primary' Peer Group Association "
f"{dc_peer_association.get('id')} on peer site." f"{dc_peer_association.get('id')} on peer site '{peer.peer_name}'"
) )
dc_peer_association_id = dc_peer_association.get("id") dc_peer_association_id = dc_peer_association.get("id")
@ -943,7 +987,7 @@ class SystemPeerManager(manager.Manager):
) )
LOG.info( LOG.info(
f"Updated Subcloud Peer Group {peer_group_name} on " f"Updated Subcloud Peer Group {peer_group_name} on "
f"peer site, ID is {dc_peer_pg.get('id')}." f"peer site '{peer.peer_name}', ID is {dc_peer_pg.get('id')}"
) )
association_update = { association_update = {
@ -998,7 +1042,6 @@ class SystemPeerManager(manager.Manager):
:param context: request context object. :param context: request context object.
:param association_id: id of association to delete :param association_id: id of association to delete
""" """
LOG.info(f"Deleting association peer group {association_id}.")
# Retrieve the peer group association details from the database # Retrieve the peer group association details from the database
association = db_api.peer_group_association_get(context, association_id) association = db_api.peer_group_association_get(context, association_id)
@ -1006,6 +1049,11 @@ class SystemPeerManager(manager.Manager):
dc_local_pg = db_api.subcloud_peer_group_get(context, association.peer_group_id) dc_local_pg = db_api.subcloud_peer_group_get(context, association.peer_group_id)
peer_group_name = dc_local_pg.peer_group_name peer_group_name = dc_local_pg.peer_group_name
LOG.info(
f"Deleting peer group association ({association_id=}, "
f"peer={peer.peer_name}, peer_group_id={dc_local_pg.id})"
)
try: try:
# Check if the system_uuid of the peer site matches with the # Check if the system_uuid of the peer site matches with the
# peer_uuid # peer_uuid
@ -1035,7 +1083,7 @@ class SystemPeerManager(manager.Manager):
# be deleted # be deleted
LOG.warning( LOG.warning(
f"Subcloud Peer Group {peer_group_name} does " f"Subcloud Peer Group {peer_group_name} does "
"not exist on peer site." f"not exist on peer site '{peer.peer_name}'"
) )
return self._delete_primary_association(context, association_id) return self._delete_primary_association(context, association_id)
@ -1045,7 +1093,7 @@ class SystemPeerManager(manager.Manager):
if dc_peer_pg_priority == 0: if dc_peer_pg_priority == 0:
LOG.error( LOG.error(
f"Failed to delete peer_group_association. Peer Group: " f"Failed to delete peer_group_association. Peer Group: "
f"{peer_group_name} has priority 0 on peer site." f"{peer_group_name} has priority 0 on peer site '{peer.peer_name}'"
) )
raise exceptions.SubcloudPeerGroupHasWrongPriority( raise exceptions.SubcloudPeerGroupHasWrongPriority(
priority=dc_peer_pg_priority priority=dc_peer_pg_priority
@ -1064,11 +1112,13 @@ class SystemPeerManager(manager.Manager):
if delete_error_msg: if delete_error_msg:
LOG.error( LOG.error(
"Failed to delete subcloud(s) from the Subcloud Peer Group " "Failed to delete subcloud(s) from the Subcloud Peer Group "
f"{peer_group_name} on peer site: {json.dumps(delete_error_msg)}" f"'{peer_group_name}' on peer site '{peer.peer_name}': "
f"{json.dumps(delete_error_msg)}"
) )
sync_message = ( sync_message = (
f"Deletion of {list(delete_error_msg.keys())} from the " f"Deletion of {list(delete_error_msg.keys())} from the "
f"Subcloud Peer Group: {peer_group_name} on the peer site failed." f"Subcloud Peer Group: {peer_group_name} on the peer site "
f"'{peer.peer_name}' failed."
) )
self._update_sync_status_to_failed( self._update_sync_status_to_failed(
context, association_id, sync_message context, association_id, sync_message
@ -1080,12 +1130,14 @@ class SystemPeerManager(manager.Manager):
try: try:
dc_client.delete_subcloud_peer_group(peer_group_name) dc_client.delete_subcloud_peer_group(peer_group_name)
LOG.info( LOG.info(
f"Deleted Subcloud Peer Group {peer_group_name} on peer site." f"Deleted Subcloud Peer Group {peer_group_name} on "
f"peer site '{peer.peer_name}'"
) )
except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated:
LOG.error( LOG.error(
f"Subcloud Peer Group {peer_group_name} delete failed " f"Subcloud Peer Group {peer_group_name} delete failed "
"as it is associated with System Peer on peer site." f"as it is associated with System Peer on peer site "
f"'{peer.peer_name}'"
) )
return self._delete_primary_association(context, association_id) return self._delete_primary_association(context, association_id)
dc_peer_system_peer_id = dc_peer_system_peer.get("id") dc_peer_system_peer_id = dc_peer_system_peer.get("id")
@ -1100,18 +1152,21 @@ class SystemPeerManager(manager.Manager):
dc_client.delete_peer_group_association(dc_peer_association_id) dc_client.delete_peer_group_association(dc_peer_association_id)
elif dc_peer_association is None: elif dc_peer_association is None:
LOG.warning( LOG.warning(
f"PeerGroupAssociation does not exist on peer site. " f"PeerGroupAssociation does not exist on peer site "
f"Peer Group ID: {dc_peer_pg_id}, peer site System " f"'{peer.peer_name}'. Peer Group ID: {dc_peer_pg_id}, "
f"Peer ID: {dc_peer_system_peer_id}" f"peer site System Peer ID: {dc_peer_system_peer_id}"
) )
try: try:
dc_client.delete_subcloud_peer_group(peer_group_name) dc_client.delete_subcloud_peer_group(peer_group_name)
LOG.info(f"Deleted Subcloud Peer Group {peer_group_name} on peer site.") LOG.info(
f"Deleted Subcloud Peer Group {peer_group_name} on "
f"peer site '{peer.peer_name}'"
)
except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated: except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated:
failed_message = ( failed_message = (
f"Subcloud Peer Group {peer_group_name} delete failed as it " f"Subcloud Peer Group {peer_group_name} delete failed as it "
"is associated with system peer on peer site." f"is associated with system peer on peer site '{peer.peer_name}'"
) )
self._update_sync_status_to_failed( self._update_sync_status_to_failed(
context, association_id, failed_message context, association_id, failed_message

View File

@ -107,26 +107,57 @@ class TestPeerMonitor(base.DCManagerTestCase):
) )
def test_update_sync_status_and_association_is_non_primary(self): def test_update_sync_status_and_association_is_non_primary(self):
# Delete the primary association created during setUp
db_api.peer_group_association_destroy(self.ctx, self.association.id)
# Create a non-primary association
association = self.system_peer_manager.create_peer_group_association_static( association = self.system_peer_manager.create_peer_group_association_static(
self.ctx, self.ctx,
system_peer_id=self.peer.id, system_peer_id=self.peer.id,
peer_group_id=self.peer_group2.id, peer_group_id=self.peer_group2.id,
association_type=consts.ASSOCIATION_TYPE_NON_PRIMARY, association_type=consts.ASSOCIATION_TYPE_NON_PRIMARY,
sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
) )
self.mock_get_peer_dc_client().get_subcloud_peer_group.return_value = {
"id": FAKE_SITE1_PEER_GROUP_ID
}
# Test the case where the association is non-primary # Mock update_sync_status
self.peer_monitor._update_sync_status_secondary_site_becomes_reachable() mock_patch = mock.patch.object(
self.mock_get_peer_dc_client().get_subcloud_peer_group.assert_called_once_with( peer_monitor_manager.SystemPeerManager, "update_sync_status"
self.peer_group1.peer_group_name
) )
mock_update_sync_status = mock_patch.start()
self.addCleanup(mock_patch.stop)
self.peer_monitor._update_sync_status_secondary_site_becomes_reachable()
# Assert that the association sync status was not updated
mock_update_sync_status.assert_not_called()
association_new = db_api.peer_group_association_get(self.ctx, association.id) association_new = db_api.peer_group_association_get(self.ctx, association.id)
self.assertEqual(
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, association_new.sync_status
)
def test_update_sync_status_and_association_is_in_sync(self):
# Mock update_sync_status
mock_patch = mock.patch.object(
peer_monitor_manager.SystemPeerManager, "update_sync_status"
)
mock_update_sync_status = mock_patch.start()
self.addCleanup(mock_patch.stop)
self.peer_monitor._update_sync_status_secondary_site_becomes_reachable()
# Assert that the association sync status was not updated as it's
# already in-sync
association_new = db_api.peer_group_association_get(
self.ctx, self.association.id
)
self.assertEqual( self.assertEqual(
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, association_new.sync_status consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, association_new.sync_status
) )
# But the update_sync_status still has to be triggered so it handles any
# sync message update
mock_update_sync_status.assert_called_once()
def test_update_sync_status_secondary_site_becomes_reachable(self): def test_update_sync_status_secondary_site_becomes_reachable(self):
self.mock_get_local_system.return_value = test_system_peer_manager.FakeSystem( self.mock_get_local_system.return_value = test_system_peer_manager.FakeSystem(
FAKE_SITE0_SYSTEM_UUID FAKE_SITE0_SYSTEM_UUID
@ -286,7 +317,7 @@ class TestPeerMonitor(base.DCManagerTestCase):
mock_event.side_effect = [False, False, True] mock_event.side_effect = [False, False, True]
self.peer_monitor._do_monitor_peer() self.peer_monitor._do_monitor_peer()
self.mock_log.exception.assert_called_with( self.mock_log.exception.assert_called_with(
f"Problem clearing fault for peer {self.peer.peer_uuid}, alarm_id=" f"Problem clearing fault for peer {self.peer.peer_name}, alarm_id="
f"{fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED} error: boom" f"{fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED} error: boom"
) )
@ -319,7 +350,7 @@ class TestPeerMonitor(base.DCManagerTestCase):
mock_event.side_effect = [False, False, True] mock_event.side_effect = [False, False, True]
self.peer_monitor._do_monitor_peer() self.peer_monitor._do_monitor_peer()
self.mock_log.exception.assert_called_with( self.mock_log.exception.assert_called_with(
"Got exception monitoring peer PeerSite1 error: boom" "Unexpected error monitoring peer 'PeerSite1': boom"
) )
def test_heartbeat_check_via_get_peer_group_list_pg_not_found(self): def test_heartbeat_check_via_get_peer_group_list_pg_not_found(self):
@ -327,8 +358,8 @@ class TestPeerMonitor(base.DCManagerTestCase):
ret = self.peer_monitor._heartbeat_check_via_get_peer_group_list() ret = self.peer_monitor._heartbeat_check_via_get_peer_group_list()
self.mock_get_peer_dc_client.assert_called() self.mock_get_peer_dc_client.assert_called()
self.mock_log.warning.assert_called_once_with( self.mock_log.warning.assert_called_once_with(
"Resource subcloud peer group of dc:" "No subcloud peer groups found for DC peer: PeerSite1 "
"http://128.128.128.128:5000/v3 not found" "(endpoint: http://128.128.128.128:5000/v3)"
) )
self.assertEqual((False, []), ret) self.assertEqual((False, []), ret)

View File

@ -143,6 +143,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
mock_patch = mock.patch.object(system_peer_manager, "DcmanagerClient") mock_patch = mock.patch.object(system_peer_manager, "DcmanagerClient")
self.mock_dc_client = mock_patch.start() self.mock_dc_client = mock_patch.start()
self.mock_dc_client().peer.peer_name = "SystemPeer1"
self.addCleanup(mock_patch.stop) self.addCleanup(mock_patch.stop)
def _mock_system_peer_manager_peersitedriver(self): def _mock_system_peer_manager_peersitedriver(self):
@ -474,7 +475,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
"dcmanager.manager.system_peer_manager.SystemPeerManager.get_peer_dc_client" "dcmanager.manager.system_peer_manager.SystemPeerManager.get_peer_dc_client"
) )
def test_update_sync_status_exception(self, mock_client): def test_update_sync_status_exception(self, mock_client):
mock_client.return_value = Exception("boom") mock_client.side_effect = Exception("boom")
self.spm.update_sync_status( self.spm.update_sync_status(
self.ctx, self.peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC self.ctx, self.peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC
) )
@ -673,8 +674,8 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.ctx, self.peer, self.peer_group.id, FAKE_SITE1_PEER_GROUP_ID self.ctx, self.peer, self.peer_group.id, FAKE_SITE1_PEER_GROUP_ID
) )
self.mock_log.error.assert_called_once_with( self.mock_log.error.assert_called_once_with(
f"Failed to add/update Subcloud {subcloud1.name} " f"Failed to add/update subcloud '{subcloud1.name}' "
f"(region_name: {subcloud1.region_name}) on peer site: boom" f"(region_name: {subcloud1.region_name}) on peer site 'SystemPeer1': boom"
) )
def test_sync_subclouds_delete_subcloud_exception(self): def test_sync_subclouds_delete_subcloud_exception(self):
@ -743,8 +744,11 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.mock_dc_client().get_subcloud.side_effect = [peer_subcloud4] self.mock_dc_client().get_subcloud.side_effect = [peer_subcloud4]
self.peer_group_association.return_value = {"id": FAKE_SITE1_ASSOCIATION_ID} self.peer_group_association.return_value = {"id": FAKE_SITE1_ASSOCIATION_ID}
self.spm.delete_peer_group_association(self.ctx, self.association.id) self.spm.delete_peer_group_association(self.ctx, self.association.id)
Calls = [ calls = [
mock.call("Deleting association peer group 1."), mock.call(
"Deleting peer group association (association_id=1, "
"peer=SystemPeer1, peer_group_id=1)"
),
mock.call( mock.call(
f"Ignoring delete Peer Site Subcloud {subcloud.name} as " f"Ignoring delete Peer Site Subcloud {subcloud.name} as "
"is not in secondary or rehome failed state." "is not in secondary or rehome failed state."
@ -755,10 +759,10 @@ class TestSystemPeerManager(base.DCManagerTestCase):
), ),
mock.call( mock.call(
f"Deleted Subcloud Peer Group {self.peer_group.peer_group_name} " f"Deleted Subcloud Peer Group {self.peer_group.peer_group_name} "
"on peer site." "on peer site 'SystemPeer1'"
), ),
] ]
self.mock_log.info.assert_has_calls(Calls) self.mock_log.info.assert_has_calls(calls)
def test_delete_peer_group_association_uuid_does_not_match(self): def test_delete_peer_group_association_uuid_does_not_match(self):
peer = self.create_system_peer_static( peer = self.create_system_peer_static(
@ -806,7 +810,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
associations = db_api.peer_group_association_get_all(self.ctx) associations = db_api.peer_group_association_get_all(self.ctx)
self.assertEqual(1, len(associations)) self.assertEqual(1, len(associations))
self.mock_log.exception.assert_called_once_with( self.mock_log.exception.assert_called_once_with(
"Failed to delete Subcloud subcloud1 on peer site: boom" "Failed to delete subcloud 'subcloud1' on peer site 'SystemPeer1': boom"
) )
def test_delete_peer_group_association_failed(self): def test_delete_peer_group_association_failed(self):
@ -820,8 +824,8 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.association.id, self.association.id,
) )
self.mock_log.error.assert_called_once_with( self.mock_log.error.assert_called_once_with(
f"Subcloud Peer Group {self.peer_group.peer_group_name} " f"Subcloud Peer Group {self.peer_group.peer_group_name} delete "
"delete failed as it is associated with system peer on peer site." "failed as it is associated with system peer on peer site 'SystemPeer1'"
) )
def test_delete_peer_group_association_subcloud_pg_notfound(self): def test_delete_peer_group_association_subcloud_pg_notfound(self):
@ -831,7 +835,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.spm.delete_peer_group_association(self.ctx, self.association.id) self.spm.delete_peer_group_association(self.ctx, self.association.id)
self.mock_log.warning.assert_called_once_with( self.mock_log.warning.assert_called_once_with(
f"Subcloud Peer Group {self.peer_group.peer_group_name} " f"Subcloud Peer Group {self.peer_group.peer_group_name} "
"does not exist on peer site." "does not exist on peer site 'SystemPeer1'"
) )
associations = db_api.peer_group_association_get_all(self.ctx) associations = db_api.peer_group_association_get_all(self.ctx)
self.assertEqual(0, len(associations)) self.assertEqual(0, len(associations))