Merge "Fix DC geo-redundancy systemcontroller_gateway_address syncing"

This commit is contained in:
Zuul 2024-11-28 21:41:59 +00:00 committed by Gerrit Code Review
commit 59924988cd
10 changed files with 473 additions and 226 deletions

View File

@ -10,6 +10,7 @@ from requests_toolbelt import MultipartEncoder
from dccommon import consts
from dccommon.drivers import base
from dccommon import exceptions
from dcmanager.db.sqlalchemy import models
LOG = log.getLogger(__name__)
@ -27,6 +28,7 @@ class DcmanagerClient(base.DriverBase):
timeout=DCMANAGER_CLIENT_REST_DEFAULT_TIMEOUT,
endpoint_type=consts.KS_ENDPOINT_PUBLIC,
endpoint=None,
peer: models.SystemPeer = None,
):
if endpoint is None:
endpoint = session.get_endpoint(
@ -35,6 +37,7 @@ class DcmanagerClient(base.DriverBase):
self.endpoint = endpoint
self.timeout = timeout
self.session = session
self.peer = peer
def get_system_peer(self, system_peer_uuid):
"""Get system peer."""

View File

@ -1076,8 +1076,11 @@ class SubcloudsController(object):
systemcontroller_gateway_address.split(",")[0]
!= subcloud.systemcontroller_gateway_ip
):
# Pass bootstrap_values_dict for patch operations where these
# values aren't in payload, unlike subcloud add where they are.
# Function needs management/admin_subnet for validation
psd_common.validate_systemcontroller_gateway_address(
systemcontroller_gateway_address, payload
systemcontroller_gateway_address, bootstrap_values_dict
)
management_state = payload.get("management-state")

View File

@ -541,27 +541,27 @@ def system_peer_create(
)
def system_peer_get(context, peer_id):
def system_peer_get(context, peer_id) -> models.SystemPeer:
"""Retrieve a system_peer or raise if it does not exist."""
return IMPL.system_peer_get(context, peer_id)
def system_peer_get_by_uuid(context, uuid):
def system_peer_get_by_uuid(context, uuid) -> models.SystemPeer:
"""Retrieve a system_peer by uuid or raise if it does not exist."""
return IMPL.system_peer_get_by_uuid(context, uuid)
def system_peer_get_by_name(context, uuid):
def system_peer_get_by_name(context, uuid) -> models.SystemPeer:
"""Retrieve a system_peer by name or raise if it does not exist."""
return IMPL.system_peer_get_by_name(context, uuid)
def system_peer_get_all(context):
def system_peer_get_all(context) -> list[models.SystemPeer]:
"""Retrieve all system peers."""
return IMPL.system_peer_get_all(context)
def peer_group_get_for_system_peer(context, peer_id):
def peer_group_get_for_system_peer(context, peer_id) -> list[models.SubcloudPeerGroup]:
"""Get subcloud peer groups associated with a system peer."""
return IMPL.peer_group_get_for_system_peer(context, peer_id)
@ -656,22 +656,24 @@ def subcloud_peer_group_destroy(context, group_id):
return IMPL.subcloud_peer_group_destroy(context, group_id)
def subcloud_peer_group_get(context, group_id):
def subcloud_peer_group_get(context, group_id) -> models.SubcloudPeerGroup:
"""Retrieve a subcloud_peer_group or raise if it does not exist."""
return IMPL.subcloud_peer_group_get(context, group_id)
def subcloud_peer_group_get_by_name(context, name):
def subcloud_peer_group_get_by_name(context, name) -> models.SubcloudPeerGroup:
"""Retrieve a subcloud_peer_group by name or raise if it does not exist."""
return IMPL.subcloud_peer_group_get_by_name(context, name)
def subcloud_peer_group_get_by_leader_id(context, system_leader_id):
def subcloud_peer_group_get_by_leader_id(
context, system_leader_id
) -> list[models.SubcloudPeerGroup]:
"""Retrieve subcloud peer groups by system_leader_id."""
return IMPL.subcloud_peer_group_get_by_leader_id(context, system_leader_id)
def subcloud_get_for_peer_group(context, group_id):
def subcloud_get_for_peer_group(context, group_id) -> list[models.Subcloud]:
"""Retrieve all subclouds belonging to a subcloud_peer_group
or raise if it does not exist.
@ -679,7 +681,7 @@ def subcloud_get_for_peer_group(context, group_id):
return IMPL.subcloud_get_for_peer_group(context, group_id)
def subcloud_peer_group_get_all(context):
def subcloud_peer_group_get_all(context) -> list[models.SubcloudPeerGroup]:
"""Retrieve all subcloud peer groups."""
return IMPL.subcloud_peer_group_get_all(context)
@ -765,31 +767,35 @@ def peer_group_association_destroy(context, id):
return IMPL.peer_group_association_destroy(context, id)
def peer_group_association_get(context, id):
def peer_group_association_get(context, id) -> models.PeerGroupAssociation:
"""Retrieve a peer_group_association or raise if it does not exist."""
return IMPL.peer_group_association_get(context, id)
def peer_group_association_get_all(context):
def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]:
"""Retrieve all peer_group_associations."""
return IMPL.peer_group_association_get_all(context)
def peer_group_association_get_by_peer_group_and_system_peer_id(
context, peer_group_id, system_peer_id
):
) -> list[models.PeerGroupAssociation]:
"""Get peer group associations by peer_group_id and system_peer_id."""
return IMPL.peer_group_association_get_by_peer_group_and_system_peer_id(
context, peer_group_id, system_peer_id
)
def peer_group_association_get_by_peer_group_id(context, peer_group_id):
def peer_group_association_get_by_peer_group_id(
context, peer_group_id
) -> list[models.PeerGroupAssociation]:
"""Get the peer_group_association list by peer_group_id"""
return IMPL.peer_group_association_get_by_peer_group_id(context, peer_group_id)
def peer_group_association_get_by_system_peer_id(context, system_peer_id):
def peer_group_association_get_by_system_peer_id(
context, system_peer_id
) -> list[models.PeerGroupAssociation]:
"""Get the peer_group_association list by system_peer_id"""
return IMPL.peer_group_association_get_by_system_peer_id(context, system_peer_id)

View File

@ -1647,7 +1647,7 @@ def peer_group_association_destroy(context, association_id):
@require_context
def peer_group_association_get(context, association_id):
def peer_group_association_get(context, association_id) -> models.PeerGroupAssociation:
try:
result = (
model_query(context, models.PeerGroupAssociation)
@ -1666,7 +1666,7 @@ def peer_group_association_get(context, association_id):
@require_context
def peer_group_association_get_all(context):
def peer_group_association_get_all(context) -> list[models.PeerGroupAssociation]:
result = (
model_query(context, models.PeerGroupAssociation)
.filter_by(deleted=0)
@ -1682,7 +1682,7 @@ def peer_group_association_get_all(context):
@require_context
def peer_group_association_get_by_peer_group_and_system_peer_id(
context, peer_group_id, system_peer_id
):
) -> models.PeerGroupAssociation:
try:
result = (
model_query(context, models.PeerGroupAssociation)
@ -1705,7 +1705,9 @@ def peer_group_association_get_by_peer_group_and_system_peer_id(
@require_context
def peer_group_association_get_by_peer_group_id(context, peer_group_id):
def peer_group_association_get_by_peer_group_id(
context, peer_group_id
) -> models.PeerGroupAssociation:
result = (
model_query(context, models.PeerGroupAssociation)
.filter_by(deleted=0)
@ -1718,7 +1720,9 @@ def peer_group_association_get_by_peer_group_id(context, peer_group_id):
@require_context
def peer_group_association_get_by_system_peer_id(context, system_peer_id):
def peer_group_association_get_by_system_peer_id(
context, system_peer_id
) -> models.PeerGroupAssociation:
result = (
model_query(context, models.PeerGroupAssociation)
.filter_by(deleted=0)

View File

@ -3,8 +3,10 @@
#
# SPDX-License-Identifier: Apache-2.0
#
from __future__ import annotations
import json
import threading
from typing import TYPE_CHECKING
from fm_api import constants as fm_const
from fm_api import fm_api
@ -12,14 +14,20 @@ from oslo_config import cfg
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.dcmanager_v1 import DcmanagerClient
from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common.i18n import _
from dcmanager.common import manager
from dcmanager.common import utils
from dcmanager.db import api as db_api
from dcmanager.db.sqlalchemy import models
from dcmanager.manager.system_peer_manager import SystemPeerManager
# Use TYPE_CHECKING to avoid circular import
if TYPE_CHECKING:
from dcmanager.manager.subcloud_manager import SubcloudManager
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
@ -27,7 +35,9 @@ LOG = logging.getLogger(__name__)
class PeerGroupAuditManager(manager.Manager):
"""Manages audit related tasks."""
def __init__(self, subcloud_manager, peer_group_id, *args, **kwargs):
def __init__(
self, subcloud_manager: SubcloudManager, peer_group_id: int, *args, **kwargs
):
LOG.debug(_("PeerGroupAuditManager initialization..."))
super().__init__(service_name="peer_group_audit_manager", *args, **kwargs)
self.context = context.get_admin_context()
@ -52,8 +62,8 @@ class PeerGroupAuditManager(manager.Manager):
@staticmethod
def _get_association_sync_status_from_peer_site(
dc_client, system_peer, peer_group_id
):
dc_client: DcmanagerClient, system_peer: models.SystemPeer, peer_group_id: int
) -> str:
try:
# Get peer site system peer
dc_peer_system_peer = dc_client.get_system_peer(
@ -81,8 +91,11 @@ class PeerGroupAuditManager(manager.Manager):
)
def _get_local_subclouds_to_update_and_delete(
self, local_peer_group, remote_subclouds, remote_sync_status
):
self,
local_peer_group: models.SubcloudPeerGroup,
remote_subclouds: list[dict],
remote_sync_status: str,
) -> tuple[list[models.Subcloud], list[models.Subcloud], bool]:
local_subclouds_to_update = list()
local_subclouds_to_delete = list()
any_rehome_failed = False
@ -112,8 +125,13 @@ class PeerGroupAuditManager(manager.Manager):
# indicating any bootstrap values/address updates to
# the subcloud on the remote site.
if remote_sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
LOG.info(
"Peer association is out-of-sync, syncing rehome "
f"data of subcloud '{local_subcloud.name}' from "
"peer to current site"
)
self._sync_rehome_data(
local_subcloud.id, remote_subcloud.get("rehome_data")
local_subcloud, remote_subcloud.get("rehome_data")
)
elif remote_subcloud.get("deploy-status") in (
consts.DEPLOY_STATE_REHOME_FAILED,
@ -134,7 +152,7 @@ class PeerGroupAuditManager(manager.Manager):
return local_subclouds_to_update, local_subclouds_to_delete, any_rehome_failed
def _set_local_subcloud_to_secondary(self, subcloud):
def _set_local_subcloud_to_secondary(self, subcloud: models.Subcloud) -> None:
try:
LOG.info("Set local subcloud %s to secondary" % subcloud.name)
# There will be an exception when unmanage
@ -155,18 +173,42 @@ class PeerGroupAuditManager(manager.Manager):
)
raise e
def _sync_rehome_data(self, subcloud_id, rehome_data):
db_api.subcloud_update(self.context, subcloud_id, rehome_data=rehome_data)
def _sync_rehome_data(self, subcloud: models.Subcloud, rehome_data: str) -> None:
try:
remote_rehome_data = json.loads(rehome_data)
local_rehome_data = json.loads(subcloud.rehome_data)
def audit(self, system_peer, remote_peer_group, local_peer_group):
# The systemcontroller_gateway_address can't be synced from the
# peer to the local site as it's specific to each site
remote_rehome_data["saved_payload"]["systemcontroller_gateway_address"] = (
local_rehome_data["saved_payload"]["systemcontroller_gateway_address"]
)
new_rehome_data = json.dumps(remote_rehome_data)
db_api.subcloud_update(
self.context, subcloud.id, rehome_data=new_rehome_data
)
except Exception as e:
LOG.error(
"Unable to sync rehome data of subcloud "
f"'{subcloud.name}' from peer to current site: {str(e)}"
)
raise
def audit(
self,
system_peer: models.SystemPeer,
remote_peer_group: dict,
local_peer_group: models.SubcloudPeerGroup,
) -> None:
if local_peer_group.migration_status == consts.PEER_GROUP_MIGRATING:
LOG.info("Local peer group in migrating state, quit audit")
return
LOG.info(
"Auditing remote subcloud peer group:[%s] migration_status:[%s] "
"group_priority[%s], local subcloud peer group:[%s] "
"migration_status:[%s] group_priority[%s]"
"Auditing remote subcloud peer group: [%s], migration_status: [%s], "
"group_priority: [%s], local subcloud peer group: [%s], "
"migration_status: [%s], group_priority: [%s]"
% (
remote_peer_group.get("peer_group_name"),
remote_peer_group.get("migration_status"),
@ -277,8 +319,8 @@ class PeerGroupAuditManager(manager.Manager):
)
LOG.exception(
f"Failed to delete local subcloud [{subcloud.name}] that does "
"not exist under the same subcloud_peer_group on peer site, "
f"err: {e}"
"not exist under the same subcloud_peer_group on peer site "
f"{system_peer.peer_name}, err: {e}"
)
raise e
@ -313,7 +355,12 @@ class PeerGroupAuditManager(manager.Manager):
# If remote peer group migration_status is 'None'
self.require_audit_flag = False
def _clear_or_raise_alarm(self, system_peer, local_peer_group, remote_peer_group):
def _clear_or_raise_alarm(
self,
system_peer: models.SystemPeer,
local_peer_group: models.SubcloudPeerGroup,
remote_peer_group: dict,
) -> None:
# If local subcloud peer group's group_priority is
# lower than remote subcloud peer group's group_priority,
# an alarm will be raised.
@ -325,7 +372,7 @@ class PeerGroupAuditManager(manager.Manager):
if local_peer_group.group_priority < remote_peer_group.get("group_priority"):
LOG.warning(
f"Alarm: local subcloud peer group [{local_peer_group.peer_group_name}]"
f" is managed by remote system [{system_peer.peer_name}]"
f" is managed by remote system peer [{system_peer.peer_name}]"
)
try:
fault = fm_api.Fault(
@ -336,7 +383,7 @@ class PeerGroupAuditManager(manager.Manager):
severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
reason_text=(
"Subcloud peer group (peer_group_name=%s) is managed by "
"remote system (peer_uuid=%s) with a lower priority."
"remote system peer (peer_uuid=%s) with a lower priority."
% (local_peer_group.peer_group_name, system_peer.peer_uuid)
),
alarm_type=fm_const.FM_ALARM_TYPE_0,
@ -375,12 +422,12 @@ class PeerGroupAuditManager(manager.Manager):
try:
self.audit(system_peer, remote_peer_group, local_peer_group)
except Exception as e:
LOG.exception("audit error occured: %s" % e)
LOG.exception("audit error occurred: %s" % e)
def stop(self):
if self.thread:
self.thread.join()
LOG.info(f"stopped peer group {self.peer_group_id} audit thread")
LOG.info(f"Stopped peer group {self.peer_group_id} audit thread")
else:
LOG.info(f"No peer group {self.peer_group_id} audit thread to stop")
@ -402,12 +449,14 @@ class PeerGroupAuditManager(manager.Manager):
):
LOG.info(
f"Audit peer group [{local_peer_group.peer_group_name}] "
f"with remote system {system_peer.peer_name}"
f"with remote system peer {system_peer.peer_name}"
)
self.start(system_peer, remote_peer_group, local_peer_group)
@staticmethod
def send_audit_peer_group(system_peers, peer_group):
def send_audit_peer_group(
system_peers: list[models.SystemPeer], peer_group: models.SubcloudPeerGroup
):
if not system_peers:
return
local_system = utils.get_local_system()

View File

@ -9,6 +9,7 @@ import threading
from fm_api import constants as fm_const
from fm_api import fm_api
from keystoneauth1 import exceptions as ks_exceptions
from oslo_config import cfg
from oslo_log import log as logging
@ -16,15 +17,58 @@ from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import manager
from dcmanager.db import api as db_api
from dcmanager.db.sqlalchemy import models
from dcmanager.manager import peer_group_audit_manager as pgam
from dcmanager.manager.subcloud_manager import SubcloudManager
from dcmanager.manager.system_peer_manager import SystemPeerManager
CONF = cfg.CONF
LOG = logging.getLogger(__name__)
# Upon detecting that the secondary site is REACHABLE again, the PGA sync_status
# will be set for BOTH sites by the primary site monitor thread as follows:
SECONDARY_SITE_REACHABLE_TRANSITIONS = {
# The UNKNOWN and FAILED are set when the secondary site becomes unreachable
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC,
# The SYNCING and IN_SYNC status can happen when first creating the peer and the
# association. The association creation will set the status to SYNCING/IN_SYNC,
# and the monitor thread might detect the peer availability after that, so we
# don't modify the status
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_SYNCING,
# A similar situation can happen with OUT_OF_SYNC, if the association/peer group
# is updated before the monitor thread runs its first check, so we don't modify
# the status
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: (
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
),
}
# Upon detecting that the secondary site is UNREACHABLE, the PGA sync_status
# will be set for the PIRMARY SITE ONLY by the primary site monitor thread as follows:
SECONDARY_SITE_UNREACHABLE_TRANSITIONS = {
# After the secondary site becomes unreachable, the only valid sync statuses
# are UNKNOWN or FAILED
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
consts.ASSOCIATION_SYNC_STATUS_FAILED: consts.ASSOCIATION_SYNC_STATUS_FAILED,
consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC: consts.ASSOCIATION_SYNC_STATUS_FAILED,
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC: consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
consts.ASSOCIATION_SYNC_STATUS_SYNCING: consts.ASSOCIATION_SYNC_STATUS_FAILED,
}
class PeerMonitor(object):
def __init__(self, peer, context, subcloud_manager):
"""Monitors and manages the connection status of a single DC system peer
The monitor runs in a separate thread and periodically checks the peer's
availability using heartbeat mechanisms. It handles both failure and recovery
scenarios, updating system states and managing fault alarms accordingly.
"""
def __init__(
self, peer: models.SystemPeer, context, subcloud_manager: SubcloudManager
):
self.peer = peer
self.thread = None
self.exit_flag = threading.Event()
@ -32,21 +76,22 @@ class PeerMonitor(object):
self.context = context
self.subcloud_manager = subcloud_manager
self.peer_group_id_set = set()
self.failure_count = 0
# key: peer_group_id
# value: PeerGroupAuditManager object
self.peer_group_audit_obj_map = dict()
self.peer_group_audit_obj_map: dict[int, pgam.PeerGroupAuditManager] = dict()
def _clear_failure(self):
alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED
entity_instance_id = "peer=%s" % self.peer.peer_uuid
entity_instance_id = f"peer={self.peer.peer_uuid}"
try:
fault = self.fm_api.get_fault(alarm_id, entity_instance_id)
if fault:
self.fm_api.clear_fault(alarm_id, entity_instance_id)
except Exception as e:
LOG.exception(
"Problem clearing fault for peer %s, alarm_id=%s error: %s"
% (self.peer.peer_uuid, alarm_id, e)
f"Problem clearing fault for peer {self.peer.peer_name}, "
f"alarm_id={alarm_id} error: {str(e)}"
)
def _raise_failure(self):
@ -114,124 +159,165 @@ class PeerMonitor(object):
if not dc_peer_subcloud_peer_group_list:
LOG.warning(
"Resource subcloud peer group of dc:%s not found"
% self.peer.manager_endpoint
f"No subcloud peer groups found for DC peer: {self.peer.peer_name} "
f"(endpoint: {self.peer.manager_endpoint})"
)
except (ks_exceptions.ConnectFailure, ks_exceptions.ConnectTimeout) as e:
# Use warning level for common connection failures to reduce log spam
LOG.warning(f"Failed to access DC peer {self.peer.peer_name}: {str(e)}")
except Exception:
LOG.exception("Failed to access the dc: %s" % self.peer.peer_name)
LOG.exception(
f"Unexpected error while accessing DC peer {self.peer.peer_name}"
)
return failed, dc_peer_subcloud_peer_group_list
def _update_sync_status_secondary_site_becomes_unreachable(self):
def _update_sync_status_secondary_site_becomes_unreachable(self) -> None:
"""Update sync status on local site when secondary site becomes unreachable."""
# Get associations by system peer
associations = SystemPeerManager.get_local_associations(self.context, self.peer)
for association in associations:
# If the association is not primary, skip it.
if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
LOG.debug(
"Skip update the Association sync_status as it is not primary."
LOG.info(
f"Skip updating local association (id={association.id}) "
"sync_status as it's not primary"
)
continue
# If the secondary site is down, set the association sync status
# "in-sync" -> "unknown"
# "unknown" -> "unknown"
# "out-of-sync" -> "failed"
# "syncing" -> "failed"
# "failed" -> "failed"
sync_status = consts.ASSOCIATION_SYNC_STATUS_UNKNOWN
message = f"Peer site ({self.peer.peer_name}) is unreachable."
if association.sync_status not in [
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
]:
sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
try:
new_sync_status = SECONDARY_SITE_UNREACHABLE_TRANSITIONS[
association.sync_status
]
except KeyError:
# This should never happen
LOG.error(
f"Unexpected sync status on association id={association.id}: "
f"{association.sync_status}. Updating the sync status to "
f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}"
)
new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
LOG.info(
f"Updating local association (id={association.id}) sync_status "
f"from {association.sync_status} to {new_sync_status} due "
"to secondary site becoming unreachable"
)
db_api.peer_group_association_update(
self.context,
association.id,
sync_status=sync_status,
sync_message=message,
sync_status=new_sync_status,
sync_message=f"Peer site ({self.peer.peer_name}) is unreachable.",
)
def _update_sync_status_secondary_site_becomes_reachable(self):
def _update_sync_status_secondary_site_becomes_reachable(self) -> None:
"""Update sync status on both sites when secondary site becomes reachable."""
# Get associations by system peer
associations = SystemPeerManager.get_local_associations(self.context, self.peer)
for association in associations:
# If the association is not primary, skip it.
if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
LOG.debug(
"Skip update Peer Site Association sync_status as "
"current site Association is not primary."
LOG.info(
"Skip updating association sync_status on both sites as "
"current site association is not primary."
)
continue
# Upon detecting that the secondary site is reachable again,
# the PGA sync_status will be set for both sites by the primary
# site monitor thread as follows:
# "unknown" -> "in-sync"
# "failed" -> "out-of-sync"
sync_status = consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
if association.sync_status == consts.ASSOCIATION_SYNC_STATUS_UNKNOWN:
sync_status = consts.ASSOCIATION_SYNC_STATUS_IN_SYNC
try:
new_sync_status = SECONDARY_SITE_REACHABLE_TRANSITIONS[
association.sync_status
]
except KeyError:
# This should never happen
LOG.error(
f"Unexpected sync status on association id={association.id}: "
f"{association.sync_status}. Updating the sync status to "
f"{consts.ASSOCIATION_SYNC_STATUS_FAILED}"
)
new_sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
dc_local_pg = db_api.subcloud_peer_group_get(
self.context, association.peer_group_id
)
LOG.info(
f"Updating local association (id={association.id}) sync_status "
f"from {association.sync_status} to {new_sync_status} due "
"to secondary site becoming reachable"
)
SystemPeerManager.update_sync_status(
self.context,
self.peer,
sync_status,
new_sync_status,
dc_local_pg,
association=association,
)
def _do_monitor_peer(self):
failure_count = 0
LOG.info("Start monitoring thread for peer %s" % self.peer.peer_name)
UNAVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE
AVAILABLE_STATE = consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
# Do the actual peer monitor.
def _update_peer_state(self, state: str) -> models.SystemPeer:
"""Update peer availability state in database."""
return db_api.system_peer_update(
self.context, self.peer.id, availability_state=state
)
def _handle_peer_failure(self) -> None:
"""Handle peer failure by raising alarms and updating states."""
self.failure_count += 1
if self.failure_count >= self.peer.heartbeat_failure_threshold:
LOG.warning(
f"DC peer '{self.peer.peer_name}' heartbeat failed, raising alarm"
)
self._raise_failure()
self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE)
self._update_sync_status_secondary_site_becomes_unreachable()
self.failure_count = 0
self._set_require_audit_flag_to_associated_peer_groups()
def _handle_peer_recovery(self, remote_pg_list: list) -> None:
"""Handle peer recovery by clearing alarms and restoring states."""
self.failure_count = 0
self._audit_local_peer_groups(remote_pg_list)
if (
self.peer.availability_state
!= consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
):
LOG.info(f"DC peer '{self.peer.peer_name}' is back online, clearing alarm")
self._update_peer_state(consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE)
self._update_sync_status_secondary_site_becomes_reachable()
self._clear_failure()
def _check_peer_status(self) -> None:
"""Perform a single peer status check and handle the result."""
self.peer = db_api.system_peer_get(self.context, self.peer.id)
failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list()
if failed:
self._handle_peer_failure()
else:
self._handle_peer_recovery(remote_pg_list)
def _do_monitor_peer(self) -> None:
"""Monitor peer connectivity and handle state changes."""
LOG.info(f"Start monitoring thread for peer '{self.peer.peer_name}'")
# Run first check immediately
try:
self._check_peer_status()
except Exception as e:
LOG.exception(f"Initial check failed for peer '{self.peer.peer_name}': {e}")
# Continue with periodic checks
while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval):
try:
# Get system peer from DB
self.peer = db_api.system_peer_get(self.context, self.peer.id)
failed, remote_pg_list = self._heartbeat_check_via_get_peer_group_list()
if failed:
failure_count += 1
if failure_count >= self.peer.heartbeat_failure_threshold:
# heartbeat_failure_threshold reached.
LOG.warning(
"DC %s heartbeat failed, Raising alarm"
% self.peer.peer_name
)
self._raise_failure()
db_api.system_peer_update(
self.context,
self.peer.id,
availability_state=UNAVAILABLE_STATE,
)
# pylint: disable=line-too-long
self._update_sync_status_secondary_site_becomes_unreachable()
failure_count = 0
self._set_require_audit_flag_to_associated_peer_groups()
else:
failure_count = 0
self._audit_local_peer_groups(remote_pg_list)
if self.peer.availability_state != AVAILABLE_STATE:
db_api.system_peer_update(
self.context,
self.peer.id,
availability_state=AVAILABLE_STATE,
)
# pylint: disable=line-too-long
self._update_sync_status_secondary_site_becomes_reachable()
LOG.info("DC %s back online, clear alarm" % self.peer.peer_name)
self._clear_failure()
self._check_peer_status()
except Exception as e:
LOG.exception(
"Got exception monitoring peer %s error: %s"
% (self.peer.peer_name, e)
f"Unexpected error monitoring peer '{self.peer.peer_name}': {e}"
)
LOG.info(
"Caught graceful exit signal for peer monitor %s" % self.peer.peer_name
)
LOG.info(f"Gracefully exiting monitor thread for peer '{self.peer.peer_name}'")
def _audit_local_peer_groups(self, remote_pg_list):
# Generate a dict index by remote peer group name
@ -266,7 +352,9 @@ class PeerMonitor(object):
for pgam_obj in self.peer_group_audit_obj_map.values():
pgam_obj.require_audit_flag = True
def audit_specific_local_peer_group(self, peer_group, remote_peer_group):
def audit_specific_local_peer_group(
self, peer_group: models.SubcloudPeerGroup, remote_peer_group
):
msg = None
if peer_group.id in self.peer_group_audit_obj_map:
pgam_obj = self.peer_group_audit_obj_map[peer_group.id]
@ -324,17 +412,16 @@ class PeerMonitor(object):
class PeerMonitorManager(manager.Manager):
"""Manages tasks related to peer monitor."""
def __init__(self, subcloud_manager):
def __init__(self, subcloud_manager: SubcloudManager):
LOG.debug("PeerMonitorManager initialization...")
super(PeerMonitorManager, self).__init__(service_name="peer_monitor_manager")
self.peer_monitor = dict()
self.context = context.get_admin_context()
self.subcloud_manager = subcloud_manager
# key: system_peer_id
# value: PeerMonitor object
self.peer_monitor_thread_map = dict()
self.peer_monitor_thread_map: dict[int, PeerMonitor] = dict()
def _remove_peer_monitor_task(self, system_peer_id):
peer_mon_obj = self.peer_monitor_thread_map[system_peer_id]

View File

@ -3674,6 +3674,11 @@ class SubcloudManager(manager.Manager):
and systemcontroller_gateway_ip.split(",")[0]
!= subcloud.systemcontroller_gateway_ip
):
LOG.info(
f"The systemcontroller_gateway_ip for subcloud {subcloud.name} "
f"was updated from {subcloud.systemcontroller_gateway_ip} to "
f"{systemcontroller_gateway_ip.split(',')[0]}. Replacing routes..."
)
m_ks_client = OpenStackDriver(
region_name=dccommon_consts.DEFAULT_REGION_NAME,
region_clients=None,

View File

@ -9,6 +9,7 @@ from contextlib import nullcontext
import functools
import json
import tempfile
from typing import Optional
from eventlet import greenpool
from oslo_log import log as logging
@ -54,7 +55,9 @@ class SystemPeerManager(manager.Manager):
)
@staticmethod
def get_local_associations(ctx, peer, local_pg=None):
def get_local_associations(
ctx, peer: models.SystemPeer, local_pg: models.SubcloudPeerGroup = None
) -> list[models.PeerGroupAssociation]:
if local_pg is None:
# Get associations by system peer id
return db_api.peer_group_association_get_by_system_peer_id(ctx, peer.id)
@ -93,8 +96,12 @@ class SystemPeerManager(manager.Manager):
"""
def _update_association_on_peer_site(
peer, sync_status, local_pg, remote_pg, message
):
peer: models.SystemPeer,
sync_status: str,
local_pg: models.SubcloudPeerGroup,
remote_pg: dict,
message: str,
) -> tuple[str, str]:
try:
# Get peer site dcmanager client
dc_client = SystemPeerManager.get_peer_dc_client(peer)
@ -118,21 +125,21 @@ class SystemPeerManager(manager.Manager):
# Update peer site association sync_status only if the
# sync_status is different from the current sync_status
if dc_peer_association.get("sync_status") != sync_status:
if dc_peer_association.get("sync-status") != sync_status:
# Update peer site association sync_status
dc_peer_association_id = dc_peer_association.get("id")
dc_client.update_peer_group_association_sync_status(
dc_peer_association_id, sync_status
)
LOG.info(
f"Updated Peer site {dc_peer_system_peer.get('id')} "
f"Peer Group Association {dc_peer_association_id} "
f"sync_status to {sync_status}."
f"Updated peer site '{dc_peer_system_peer.get('peer-name')}' "
f"peer group association {dc_peer_association_id} "
f"sync_status to '{sync_status}'"
)
except Exception as e:
message = (
f"Failed to Update Peer Site ({peer.peer_uuid}) "
f"Association sync_status to {sync_status}."
f"Failed to update peer site '{peer.peer_name}' "
f"association sync_status to '{sync_status}'"
)
LOG.exception(f"{message} Error: {e}")
sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
@ -146,9 +153,9 @@ class SystemPeerManager(manager.Manager):
for association in associations:
if association.association_type == consts.ASSOCIATION_TYPE_NON_PRIMARY:
LOG.debug(
f"Skip update Peer Site association sync_status to {sync_status} "
"as current site Association is not primary."
LOG.info(
"Skipping updating peer and local site association "
f"sync_status to '{sync_status}' as local site is not primary"
)
continue
@ -165,13 +172,16 @@ class SystemPeerManager(manager.Manager):
association.sync_status == sync_status
and sync_status != consts.ASSOCIATION_SYNC_STATUS_FAILED
):
LOG.debug(
"Skip update current site association sync_status to "
f"{sync_status} as current site Association is already "
"in the same status."
LOG.info(
f"Local association {association.id} sync_status "
f"already set to '{sync_status}', skipping update"
)
continue
# Update primary site association sync_status
LOG.info(
f"Updating local association {association.id} sync_status "
f"from '{association.sync_status}' to '{sync_status}'"
)
db_api.peer_group_association_update(
ctx, association.id, sync_status=sync_status, sync_message=message
)
@ -222,10 +232,11 @@ class SystemPeerManager(manager.Manager):
dccommon_consts.SYSTEM_CONTROLLER_NAME,
p_ks_client.session,
endpoint=dc_endpoint,
peer=peer,
)
@staticmethod
def get_peer_subcloud(dc_client, subcloud_name):
def get_peer_subcloud(dc_client: DcmanagerClient, subcloud_name: str) -> dict:
"""Get subcloud on peer site if exist.
:param dc_client: the dcmanager client object
@ -235,7 +246,10 @@ class SystemPeerManager(manager.Manager):
peer_subcloud = dc_client.get_subcloud(subcloud_name)
return peer_subcloud
except dccommon_exceptions.SubcloudNotFound:
LOG.warn(f"Subcloud {subcloud_name} does not exist on peer site.")
LOG.warn(
f"Subcloud '{subcloud_name}' does not exist on "
f"peer site '{dc_client.peer.peer_name}'"
)
@staticmethod
def get_subcloud_deploy_status(subcloud):
@ -258,7 +272,9 @@ class SystemPeerManager(manager.Manager):
return True
@staticmethod
def delete_peer_secondary_subcloud(dc_client, subcloud_ref):
def delete_peer_secondary_subcloud(
dc_client: DcmanagerClient, subcloud_ref: str
) -> None:
"""Delete secondary subcloud on peer site.
:param dc_client: the dcmanager client object
@ -284,14 +300,17 @@ class SystemPeerManager(manager.Manager):
return
dc_client.delete_subcloud(subcloud_ref)
LOG.info(f"Deleted Subcloud {subcloud_ref} on peer site.")
LOG.info(
f"Deleted subcloud '{subcloud_ref}' on peer site "
f"'{dc_client.peer.peer_name}'"
)
@staticmethod
def _run_parallel_group_operation(op_type, op_function, thread_pool, subclouds):
"""Run parallel group operation on subclouds."""
failed_subclouds = []
processed = 0
error_msg = {} # Dictinary to store error message for each subcloud
error_msg = {} # Dictionary to store error message for each subcloud
for subcloud, success in thread_pool.imap(op_function, subclouds):
processed += 1
@ -365,9 +384,9 @@ class SystemPeerManager(manager.Manager):
region_name, files, data, is_region_name=True
)
LOG.info(
f"Updated Subcloud {dc_peer_subcloud.get('name')} "
f"Updated subcloud '{dc_peer_subcloud.get('name')}' "
f"(region_name: {dc_peer_subcloud.get('region-name')}) "
"on peer site."
f"on peer site '{dc_client.peer.peer_name}'"
)
else:
# Create subcloud on peer site if not exist
@ -375,13 +394,15 @@ class SystemPeerManager(manager.Manager):
files, data
)
LOG.info(
f"Created Subcloud {dc_peer_subcloud.get('name')} "
f"Created subcloud '{dc_peer_subcloud.get('name')}' "
f"(region_name: {dc_peer_subcloud.get('region-name')}) "
"on peer site."
f"on peer site '{dc_client.peer.peer_name}'"
)
LOG.debug(
f"Updating subcloud {subcloud_name} (region_name: {region_name}) "
f"with subcloud peer group id {dc_peer_pg_id} on peer site."
LOG.info(
f"Updating subcloud '{subcloud_name}' (region_name: "
f"{dc_peer_subcloud.get('region-name')}) with subcloud "
f"peer group id {dc_peer_pg_id} on peer site "
f"'{dc_client.peer.peer_name}'"
)
# Update subcloud associated peer group on peer site.
# The peer_group update will check the header and should
@ -413,12 +434,15 @@ class SystemPeerManager(manager.Manager):
except Exception as e:
subcloud.msg = str(e) # Store error message for subcloud
LOG.error(
f"Failed to add/update Subcloud {subcloud_name} "
f"(region_name: {region_name}) on peer site: {str(e)}"
f"Failed to add/update subcloud '{subcloud_name}' "
f"(region_name: {region_name}) on peer site "
f"'{dc_client.peer.peer_name}': {str(e)}"
)
return subcloud, False
def _delete_subcloud(self, dc_client, subcloud):
def _delete_subcloud(
self, dc_client: DcmanagerClient, subcloud: models.Subcloud
) -> tuple[models.Subcloud, bool]:
"""Delete subcloud on peer site in parallel."""
try:
subcloud_name = subcloud.get("name")
@ -427,11 +451,12 @@ class SystemPeerManager(manager.Manager):
except Exception as e:
subcloud.msg = str(e)
LOG.exception(
f"Failed to delete Subcloud {subcloud_name} on peer site: {str(e)}"
f"Failed to delete subcloud '{subcloud_name}' on "
f"peer site '{dc_client.peer.peer_name}': {str(e)}"
)
return subcloud, False
def _is_valid_for_subcloud_sync(self, subcloud):
def _is_valid_for_subcloud_sync(self, subcloud: models.Subcloud) -> str:
"""Verify subcloud data for sync."""
subcloud_name = subcloud.get("name")
region_name = subcloud.get("region_name")
@ -485,10 +510,12 @@ class SystemPeerManager(manager.Manager):
return VERIFY_SUBCLOUD_SYNC_VALID
def _validate_subclouds_for_sync(self, subclouds, dc_client):
def _validate_subclouds_for_sync(
self, subclouds: list[models.Subcloud], dc_client: DcmanagerClient
) -> tuple[list[models.Subcloud], dict[str, str]]:
"""Validate subclouds for sync."""
valid_subclouds = []
error_msg = {} # Dictinary to store error message for each subcloud
error_msg = {} # Dictionary to store error message for each subcloud
for subcloud in subclouds:
subcloud_name = subcloud.get("name")
@ -510,8 +537,8 @@ class SystemPeerManager(manager.Manager):
peer_subcloud = self.get_peer_subcloud(dc_client, subcloud_name)
if not peer_subcloud:
LOG.info(
f"Subcloud {subcloud_name} (region_name: {region_name}) does "
"not exist on peer site."
f"Subcloud '{subcloud_name}' (region_name: {region_name}) does "
f"not exist on peer site '{dc_client.peer.peer_name}'"
)
valid_subclouds.append(subcloud)
continue
@ -541,7 +568,9 @@ class SystemPeerManager(manager.Manager):
return valid_subclouds, error_msg
def _sync_subclouds(self, context, peer, dc_local_pg_id, dc_peer_pg_id):
def _sync_subclouds(
self, context, peer: models.SystemPeer, dc_local_pg_id: int, dc_peer_pg_id: int
) -> dict[str, str]:
"""Sync subclouds of local peer group to peer site.
:param context: request context object
@ -572,7 +601,10 @@ class SystemPeerManager(manager.Manager):
)
error_msg.update(sync_error_msg)
LOG.info("Subcloud peer sync operation finished")
LOG.info(
f"Completed subcloud peer sync with peer '{peer.peer_name}': processed "
f"{len(subclouds_to_sync)} subclouds, {len(failed_subclouds)} failed"
)
dc_local_region_names = set()
for subcloud in subclouds:
@ -591,7 +623,10 @@ class SystemPeerManager(manager.Manager):
dc_peer_subcloud_diff_names = dc_peer_region_names - dc_local_region_names
for subcloud_to_delete in dc_peer_subcloud_diff_names:
try:
LOG.debug(f"Deleting Subcloud name {subcloud_to_delete} on peer site.")
LOG.info(
f"Deleting subcloud '{subcloud_to_delete}' on "
f"peer site '{dc_client.peer.peer_name}'"
)
self.delete_peer_secondary_subcloud(dc_client, subcloud_to_delete)
except Exception as e:
msg = f"Subcloud delete failed: {str(e)}"
@ -664,7 +699,7 @@ class SystemPeerManager(manager.Manager):
context, peer_group_id
)
if not associations:
LOG.debug("No association found for peer group %s" % peer_group_id)
LOG.info(f"No association found for peer group '{peer_group_id}'")
else:
for association in associations:
if sync_status == consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC:
@ -722,7 +757,7 @@ class SystemPeerManager(manager.Manager):
sync_status=new_sync_status,
sync_message=new_sync_message,
)
LOG.debug(
LOG.info(
f"Updated Local Peer Group Association {association.id} "
f"sync_status to {new_sync_status}."
)
@ -786,8 +821,11 @@ class SystemPeerManager(manager.Manager):
return success_peer_ids, failed_peer_ids
def _get_non_primary_association(
self, dc_client, dc_peer_system_peer_id, dc_peer_pg_id
):
self,
dc_client: DcmanagerClient,
dc_peer_system_peer_id: int,
dc_peer_pg_id: int,
) -> Optional[dict]:
"""Get non-primary Association from peer site."""
try:
return dc_client.get_peer_group_association_with_peer_id_and_pg_id(
@ -795,18 +833,23 @@ class SystemPeerManager(manager.Manager):
)
except dccommon_exceptions.PeerGroupAssociationNotFound:
LOG.error(
"Peer Group association does not exist on peer site. "
f"Peer Group ID: {dc_peer_pg_id}, Peer System Peer ID: "
f"{dc_peer_system_peer_id}"
"Peer group association does not exist on peer site "
f"'{dc_client.peer.peer_name}'. Peer group ID: {dc_peer_pg_id}, "
f"peer system peer ID: {dc_peer_system_peer_id}"
)
return None
def _get_peer_site_pg_by_name(self, dc_client, peer_group_name):
def _get_peer_site_pg_by_name(
self, dc_client: DcmanagerClient, peer_group_name: str
) -> Optional[dict]:
"""Get remote Peer Group from peer site by name."""
try:
return dc_client.get_subcloud_peer_group(peer_group_name)
except dccommon_exceptions.SubcloudPeerGroupNotFound:
LOG.error(f"Peer Group {peer_group_name} does not exist on peer site.")
LOG.error(
f"Peer group '{peer_group_name}' does not exist on "
f"peer site '{dc_client.peer.peer_name}'"
)
return None
def _get_peer_site_system_peer(self, dc_client, peer_uuid=None):
@ -838,11 +881,6 @@ class SystemPeerManager(manager.Manager):
:param association_id: id of association to sync
:param sync_subclouds: Enabled to sync subclouds to peer site
"""
LOG.info(
f"Synchronize the association {association_id} of the "
"Subcloud Peer Group with the System Peer pointing to the "
"peer site."
)
association = db_api.peer_group_association_get(context, association_id)
peer = db_api.system_peer_get(context, association.system_peer_id)
@ -850,14 +888,19 @@ class SystemPeerManager(manager.Manager):
peer_group_name = dc_local_pg.peer_group_name
dc_peer_association_id = None
LOG.info(
f"Syncing peer group association {association_id} that links subcloud "
f"peer group '{peer_group_name}' with system peer '{peer.peer_name}'"
)
try:
# Check if the system_uuid of the peer site matches with the
# peer_uuid
system = self.get_peer_sysinv_client(peer).get_system()
if system.uuid != peer.peer_uuid:
LOG.error(
f"Peer site system uuid {system.uuid} does not match "
f"with the peer_uuid {peer.peer_uuid}"
f"Peer site '{peer.peer_name}' system uuid {system.uuid} does "
f"not match the configured peer_uuid: {peer.peer_uuid}"
)
raise exceptions.PeerGroupAssociationTargetNotMatch(uuid=system.uuid)
@ -873,7 +916,8 @@ class SystemPeerManager(manager.Manager):
if dc_peer_system_peer is None:
failed_message = (
f"System Peer {local_system_uuid} does not exist on peer site."
f"A system peer with UUID of {local_system_uuid} "
f"does not exist on peer site '{peer.peer_name}'"
)
return db_api.peer_group_association_db_model_to_dict(
self._update_sync_status_to_failed(
@ -896,7 +940,7 @@ class SystemPeerManager(manager.Manager):
dc_peer_pg = dc_client.add_subcloud_peer_group(**peer_group_kwargs)
LOG.info(
f"Created Subcloud Peer Group {peer_group_name} on "
f"peer site. ID is {dc_peer_pg.get('id')}."
f"peer site '{peer.peer_name}'. ID is {dc_peer_pg.get('id')}."
)
dc_peer_pg_id = dc_peer_pg.get("id")
dc_peer_pg_priority = dc_peer_pg.get("group_priority")
@ -921,7 +965,7 @@ class SystemPeerManager(manager.Manager):
)
LOG.info(
"Created 'non-primary' Peer Group Association "
f"{dc_peer_association.get('id')} on peer site."
f"{dc_peer_association.get('id')} on peer site '{peer.peer_name}'"
)
dc_peer_association_id = dc_peer_association.get("id")
@ -943,7 +987,7 @@ class SystemPeerManager(manager.Manager):
)
LOG.info(
f"Updated Subcloud Peer Group {peer_group_name} on "
f"peer site, ID is {dc_peer_pg.get('id')}."
f"peer site '{peer.peer_name}', ID is {dc_peer_pg.get('id')}"
)
association_update = {
@ -998,7 +1042,6 @@ class SystemPeerManager(manager.Manager):
:param context: request context object.
:param association_id: id of association to delete
"""
LOG.info(f"Deleting association peer group {association_id}.")
# Retrieve the peer group association details from the database
association = db_api.peer_group_association_get(context, association_id)
@ -1006,6 +1049,11 @@ class SystemPeerManager(manager.Manager):
dc_local_pg = db_api.subcloud_peer_group_get(context, association.peer_group_id)
peer_group_name = dc_local_pg.peer_group_name
LOG.info(
f"Deleting peer group association ({association_id=}, "
f"peer={peer.peer_name}, peer_group_id={dc_local_pg.id})"
)
try:
# Check if the system_uuid of the peer site matches with the
# peer_uuid
@ -1035,7 +1083,7 @@ class SystemPeerManager(manager.Manager):
# be deleted
LOG.warning(
f"Subcloud Peer Group {peer_group_name} does "
"not exist on peer site."
f"not exist on peer site '{peer.peer_name}'"
)
return self._delete_primary_association(context, association_id)
@ -1045,7 +1093,7 @@ class SystemPeerManager(manager.Manager):
if dc_peer_pg_priority == 0:
LOG.error(
f"Failed to delete peer_group_association. Peer Group: "
f"{peer_group_name} has priority 0 on peer site."
f"{peer_group_name} has priority 0 on peer site '{peer.peer_name}'"
)
raise exceptions.SubcloudPeerGroupHasWrongPriority(
priority=dc_peer_pg_priority
@ -1064,11 +1112,13 @@ class SystemPeerManager(manager.Manager):
if delete_error_msg:
LOG.error(
"Failed to delete subcloud(s) from the Subcloud Peer Group "
f"{peer_group_name} on peer site: {json.dumps(delete_error_msg)}"
f"'{peer_group_name}' on peer site '{peer.peer_name}': "
f"{json.dumps(delete_error_msg)}"
)
sync_message = (
f"Deletion of {list(delete_error_msg.keys())} from the "
f"Subcloud Peer Group: {peer_group_name} on the peer site failed."
f"Subcloud Peer Group: {peer_group_name} on the peer site "
f"'{peer.peer_name}' failed."
)
self._update_sync_status_to_failed(
context, association_id, sync_message
@ -1080,12 +1130,14 @@ class SystemPeerManager(manager.Manager):
try:
dc_client.delete_subcloud_peer_group(peer_group_name)
LOG.info(
f"Deleted Subcloud Peer Group {peer_group_name} on peer site."
f"Deleted Subcloud Peer Group {peer_group_name} on "
f"peer site '{peer.peer_name}'"
)
except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated:
LOG.error(
f"Subcloud Peer Group {peer_group_name} delete failed "
"as it is associated with System Peer on peer site."
f"as it is associated with System Peer on peer site "
f"'{peer.peer_name}'"
)
return self._delete_primary_association(context, association_id)
dc_peer_system_peer_id = dc_peer_system_peer.get("id")
@ -1100,18 +1152,21 @@ class SystemPeerManager(manager.Manager):
dc_client.delete_peer_group_association(dc_peer_association_id)
elif dc_peer_association is None:
LOG.warning(
f"PeerGroupAssociation does not exist on peer site. "
f"Peer Group ID: {dc_peer_pg_id}, peer site System "
f"Peer ID: {dc_peer_system_peer_id}"
f"PeerGroupAssociation does not exist on peer site "
f"'{peer.peer_name}'. Peer Group ID: {dc_peer_pg_id}, "
f"peer site System Peer ID: {dc_peer_system_peer_id}"
)
try:
dc_client.delete_subcloud_peer_group(peer_group_name)
LOG.info(f"Deleted Subcloud Peer Group {peer_group_name} on peer site.")
LOG.info(
f"Deleted Subcloud Peer Group {peer_group_name} on "
f"peer site '{peer.peer_name}'"
)
except dccommon_exceptions.SubcloudPeerGroupDeleteFailedAssociated:
failed_message = (
f"Subcloud Peer Group {peer_group_name} delete failed as it "
"is associated with system peer on peer site."
f"is associated with system peer on peer site '{peer.peer_name}'"
)
self._update_sync_status_to_failed(
context, association_id, failed_message

View File

@ -107,26 +107,57 @@ class TestPeerMonitor(base.DCManagerTestCase):
)
def test_update_sync_status_and_association_is_non_primary(self):
# Delete the primary association created during setUp
db_api.peer_group_association_destroy(self.ctx, self.association.id)
# Create a non-primary association
association = self.system_peer_manager.create_peer_group_association_static(
self.ctx,
system_peer_id=self.peer.id,
peer_group_id=self.peer_group2.id,
association_type=consts.ASSOCIATION_TYPE_NON_PRIMARY,
sync_status=consts.ASSOCIATION_SYNC_STATUS_UNKNOWN,
)
self.mock_get_peer_dc_client().get_subcloud_peer_group.return_value = {
"id": FAKE_SITE1_PEER_GROUP_ID
}
# Test the case where the association is non-primary
self.peer_monitor._update_sync_status_secondary_site_becomes_reachable()
self.mock_get_peer_dc_client().get_subcloud_peer_group.assert_called_once_with(
self.peer_group1.peer_group_name
# Mock update_sync_status
mock_patch = mock.patch.object(
peer_monitor_manager.SystemPeerManager, "update_sync_status"
)
mock_update_sync_status = mock_patch.start()
self.addCleanup(mock_patch.stop)
self.peer_monitor._update_sync_status_secondary_site_becomes_reachable()
# Assert that the association sync status was not updated
mock_update_sync_status.assert_not_called()
association_new = db_api.peer_group_association_get(self.ctx, association.id)
self.assertEqual(
consts.ASSOCIATION_SYNC_STATUS_UNKNOWN, association_new.sync_status
)
def test_update_sync_status_and_association_is_in_sync(self):
# Mock update_sync_status
mock_patch = mock.patch.object(
peer_monitor_manager.SystemPeerManager, "update_sync_status"
)
mock_update_sync_status = mock_patch.start()
self.addCleanup(mock_patch.stop)
self.peer_monitor._update_sync_status_secondary_site_becomes_reachable()
# Assert that the association sync status was not updated as it's
# already in-sync
association_new = db_api.peer_group_association_get(
self.ctx, self.association.id
)
self.assertEqual(
consts.ASSOCIATION_SYNC_STATUS_IN_SYNC, association_new.sync_status
)
# But the update_sync_status still has to be triggered so it handles any
# sync message update
mock_update_sync_status.assert_called_once()
def test_update_sync_status_secondary_site_becomes_reachable(self):
self.mock_get_local_system.return_value = test_system_peer_manager.FakeSystem(
FAKE_SITE0_SYSTEM_UUID
@ -286,7 +317,7 @@ class TestPeerMonitor(base.DCManagerTestCase):
mock_event.side_effect = [False, False, True]
self.peer_monitor._do_monitor_peer()
self.mock_log.exception.assert_called_with(
f"Problem clearing fault for peer {self.peer.peer_uuid}, alarm_id="
f"Problem clearing fault for peer {self.peer.peer_name}, alarm_id="
f"{fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED} error: boom"
)
@ -319,7 +350,7 @@ class TestPeerMonitor(base.DCManagerTestCase):
mock_event.side_effect = [False, False, True]
self.peer_monitor._do_monitor_peer()
self.mock_log.exception.assert_called_with(
"Got exception monitoring peer PeerSite1 error: boom"
"Unexpected error monitoring peer 'PeerSite1': boom"
)
def test_heartbeat_check_via_get_peer_group_list_pg_not_found(self):
@ -327,8 +358,8 @@ class TestPeerMonitor(base.DCManagerTestCase):
ret = self.peer_monitor._heartbeat_check_via_get_peer_group_list()
self.mock_get_peer_dc_client.assert_called()
self.mock_log.warning.assert_called_once_with(
"Resource subcloud peer group of dc:"
"http://128.128.128.128:5000/v3 not found"
"No subcloud peer groups found for DC peer: PeerSite1 "
"(endpoint: http://128.128.128.128:5000/v3)"
)
self.assertEqual((False, []), ret)

View File

@ -143,6 +143,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
mock_patch = mock.patch.object(system_peer_manager, "DcmanagerClient")
self.mock_dc_client = mock_patch.start()
self.mock_dc_client().peer.peer_name = "SystemPeer1"
self.addCleanup(mock_patch.stop)
def _mock_system_peer_manager_peersitedriver(self):
@ -474,7 +475,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
"dcmanager.manager.system_peer_manager.SystemPeerManager.get_peer_dc_client"
)
def test_update_sync_status_exception(self, mock_client):
mock_client.return_value = Exception("boom")
mock_client.side_effect = Exception("boom")
self.spm.update_sync_status(
self.ctx, self.peer, consts.ASSOCIATION_SYNC_STATUS_IN_SYNC
)
@ -673,8 +674,8 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.ctx, self.peer, self.peer_group.id, FAKE_SITE1_PEER_GROUP_ID
)
self.mock_log.error.assert_called_once_with(
f"Failed to add/update Subcloud {subcloud1.name} "
f"(region_name: {subcloud1.region_name}) on peer site: boom"
f"Failed to add/update subcloud '{subcloud1.name}' "
f"(region_name: {subcloud1.region_name}) on peer site 'SystemPeer1': boom"
)
def test_sync_subclouds_delete_subcloud_exception(self):
@ -743,8 +744,11 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.mock_dc_client().get_subcloud.side_effect = [peer_subcloud4]
self.peer_group_association.return_value = {"id": FAKE_SITE1_ASSOCIATION_ID}
self.spm.delete_peer_group_association(self.ctx, self.association.id)
Calls = [
mock.call("Deleting association peer group 1."),
calls = [
mock.call(
"Deleting peer group association (association_id=1, "
"peer=SystemPeer1, peer_group_id=1)"
),
mock.call(
f"Ignoring delete Peer Site Subcloud {subcloud.name} as "
"is not in secondary or rehome failed state."
@ -755,10 +759,10 @@ class TestSystemPeerManager(base.DCManagerTestCase):
),
mock.call(
f"Deleted Subcloud Peer Group {self.peer_group.peer_group_name} "
"on peer site."
"on peer site 'SystemPeer1'"
),
]
self.mock_log.info.assert_has_calls(Calls)
self.mock_log.info.assert_has_calls(calls)
def test_delete_peer_group_association_uuid_does_not_match(self):
peer = self.create_system_peer_static(
@ -806,7 +810,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
associations = db_api.peer_group_association_get_all(self.ctx)
self.assertEqual(1, len(associations))
self.mock_log.exception.assert_called_once_with(
"Failed to delete Subcloud subcloud1 on peer site: boom"
"Failed to delete subcloud 'subcloud1' on peer site 'SystemPeer1': boom"
)
def test_delete_peer_group_association_failed(self):
@ -820,8 +824,8 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.association.id,
)
self.mock_log.error.assert_called_once_with(
f"Subcloud Peer Group {self.peer_group.peer_group_name} "
"delete failed as it is associated with system peer on peer site."
f"Subcloud Peer Group {self.peer_group.peer_group_name} delete "
"failed as it is associated with system peer on peer site 'SystemPeer1'"
)
def test_delete_peer_group_association_subcloud_pg_notfound(self):
@ -831,7 +835,7 @@ class TestSystemPeerManager(base.DCManagerTestCase):
self.spm.delete_peer_group_association(self.ctx, self.association.id)
self.mock_log.warning.assert_called_once_with(
f"Subcloud Peer Group {self.peer_group.peer_group_name} "
"does not exist on peer site."
"does not exist on peer site 'SystemPeer1'"
)
associations = db_api.peer_group_association_get_all(self.ctx)
self.assertEqual(0, len(associations))