
Fix the reported issue: the PGA fails to transition from 'in-sync' to
'out-of-sync' after updating subclouds.

This commit includes the following changes:

1. Updates the SPG sync_status to 'out-of-sync' after performing any of
   the following operations and provides an informative message to the
   operator:
   a) Adding a subcloud to the SPG
   b) Removing a subcloud from the SPG
   c) Updating a subcloud in the SPG

2. Ensures that updates to SPG attributes, such as name,
   max_subcloud_rehoming, and group_state, are automatically propagated
   to the peer site.

Test plan:

Pre-Steps:
1. Create the system peer from Site A to Site B
2. Create the system peer from Site B to Site A
3. Create the subcloud peer group on Site A
4. Add subcloud(s) to the peer group
5. Create a peer group association to associate the system peer and
   subcloud peer group - Site A
6. Check the current sync status on Sites A and B. Verify they are
   'in-sync'.

PASS: Verify 'out-of-sync' on both sites after running any of the
      following operations on Site A, the primary and leader site:
      1. Adding a subcloud to the SPG.
      2. Removing a subcloud from the SPG.
      3. Updating any field of a subcloud in the SPG, such as bootstrap
         address, bootstrap values, install values, etc.

PASS: Repeat the above operations while Site B is down and verify that
      the PGA sync_status is set to 'failed'.

PASS: Verify that SPG attribute updates are accepted if the peer site
      is up and that the updates are successfully synced to the peer
      site.

PASS: Verify that SPG attribute updates are rejected if the peer site
      is down.

PASS: Verify that if the subcloud does not belong to any peer group, or
      if it belongs to a peer group that is not yet associated with any
      peer, updating the subcloud does not produce the informative
      message and no attempt is made to update the PGA sync_status.

Closes-Bug: 2054123
Closes-Bug: 2054124

Change-Id: I9f0e44e34c7db5d60d211b70e839606d0361cf83
Signed-off-by: lzhu1 <li.zhu@windriver.com>
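As an illustration, the sync_status transitions described above can be
summarized as the following tables (a paraphrase of the comments in the
monitor code below, using the status names from dcmanager consts; the
dict names are illustrative only, not an actual API):

# Peer site becomes unreachable:
UNREACHABLE_TRANSITIONS = {
    "in-sync": "unknown",
    "unknown": "unknown",
    "out-of-sync": "failed",
    "syncing": "failed",
    "failed": "failed",
}

# Peer site becomes reachable again (applied to both sites by the
# primary site's monitor thread):
REACHABLE_TRANSITIONS = {
    "unknown": "in-sync",
    "failed": "out-of-sync",
}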
379 lines | 17 KiB | Python
#
# Copyright (c) 2023-2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

from collections import defaultdict
import threading

from fm_api import constants as fm_const
from fm_api import fm_api
from oslo_config import cfg
from oslo_log import log as logging

from dcmanager.common import consts
from dcmanager.common import context
from dcmanager.common import manager
from dcmanager.db import api as db_api
from dcmanager.manager import peer_group_audit_manager as pgam
from dcmanager.manager.system_peer_manager import SystemPeerManager

CONF = cfg.CONF
LOG = logging.getLogger(__name__)


class PeerMonitor(object):
    def __init__(self, peer, context, subcloud_manager):
        self.peer = peer
        self.thread = None
        self.exit_flag = threading.Event()
        self.fm_api = fm_api.FaultAPIs()
        self.context = context
        self.subcloud_manager = subcloud_manager
        self.peer_group_id_set = set()
        # key: peer_group_id
        # value: PeerGroupAuditManager object
        self.peer_group_audit_obj_map = dict()

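    # Clear the heartbeat failure alarm for this system peer, if raised.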
    def _clear_failure(self):
        alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED
        entity_instance_id = "peer=%s" % self.peer.peer_uuid
        try:
            fault = self.fm_api.get_fault(alarm_id, entity_instance_id)
            if fault:
                self.fm_api.clear_fault(alarm_id, entity_instance_id)
        except Exception as e:
            LOG.exception(
                "Problem clearing fault for peer %s, alarm_id=%s, "
                "error: %s" % (self.peer.peer_uuid, alarm_id, e))

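    # Raise the heartbeat failure alarm for this system peer. Severity is
    # major by default, and critical when local subcloud peer groups are
    # led by the unreachable peer.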
    def _raise_failure(self):
        alarm_id = fm_const.FM_ALARM_ID_DC_SYSTEM_PEER_HEARTBEAT_FAILED
        entity_instance_id = "peer=%s" % self.peer.peer_uuid
        reason_text = ("Peer %s (peer_uuid=%s) connections in "
                       "disconnected state." % (self.peer.peer_name,
                                                self.peer.peer_uuid))
        severity = fm_const.FM_ALARM_SEVERITY_MAJOR

        peer_groups = db_api.subcloud_peer_group_get_by_leader_id(
            self.context, self.peer.peer_uuid)
        if len(peer_groups) > 0:
            peer_group_names = [peer_group.peer_group_name
                                for peer_group in peer_groups]
            reason_text = ("Peer %s (peer_uuid=%s) is in disconnected "
                           "state. The following subcloud peer groups "
                           "are impacted: %s." %
                           (self.peer.peer_name, self.peer.peer_uuid,
                            ", ".join(peer_group_names)))
            severity = fm_const.FM_ALARM_SEVERITY_CRITICAL

        try:
            fault = fm_api.Fault(
                alarm_id=alarm_id,
                alarm_state=fm_const.FM_ALARM_STATE_SET,
                entity_type_id=fm_const.FM_ENTITY_TYPE_SYSTEM_PEER,
                entity_instance_id=entity_instance_id,
                severity=severity,
                reason_text=reason_text,
                alarm_type=fm_const.FM_ALARM_TYPE_1,
                probable_cause=fm_const.ALARM_PROBABLE_CAUSE_UNKNOWN,
                proposed_repair_action="Check the connectivity between "
                "the current system and the reported peer site. If the "
                "peer system is down, migrate the affected peer group(s) "
                "to the current system for continued subcloud management.",
                service_affecting=False)

            self.fm_api.set_fault(fault)
        except Exception as e:
            LOG.exception(
                "Problem setting fault for peer %s, alarm_id=%s, "
                "error: %s" % (self.peer.peer_uuid, alarm_id, e))

    def _heartbeat_check_via_get_peer_group_list(self):
        """Check the heartbeat of the system peer.

        Returns a (failed, peer_group_list) tuple; 'failed' is True when
        the peer could not be reached.
        """
        failed = True
        dc_peer_subcloud_peer_group_list = list()
        try:
            dc_client = SystemPeerManager.get_peer_dc_client(self.peer)
            dc_peer_subcloud_peer_group_list = \
                dc_client.get_subcloud_peer_group_list()
            failed = False

            if not dc_peer_subcloud_peer_group_list:
                LOG.warning("No subcloud peer groups found on dc: %s" %
                            self.peer.manager_endpoint)

        except Exception:
            LOG.exception("Failed to access the dc: %s" %
                          self.peer.peer_name)
        return failed, dc_peer_subcloud_peer_group_list

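    # When the peer site becomes unreachable, degrade the sync_status of
    # every primary association on this site and record the reason.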
    def _update_sync_status_when_secondary_site_becomes_unreachable(self):
        # Get associations by system peer
        associations = SystemPeerManager.get_local_associations(self.context,
                                                                self.peer)
        for association in associations:
            # If the association is not primary, skip it.
            if association.association_type == consts.\
                    ASSOCIATION_TYPE_NON_PRIMARY:
                LOG.debug("Skip updating the association sync_status as "
                          "it is not primary.")
                continue
            # If the secondary site is down, set the association sync status
            # "in-sync" -> "unknown"
            # "unknown" -> "unknown"
            # "out-of-sync" -> "failed"
            # "syncing" -> "failed"
            # "failed" -> "failed"
            sync_status = consts.ASSOCIATION_SYNC_STATUS_UNKNOWN
            message = f"Peer site ({self.peer.peer_name}) is unreachable."
            if association.sync_status not in [
                    consts.ASSOCIATION_SYNC_STATUS_IN_SYNC,
                    consts.ASSOCIATION_SYNC_STATUS_UNKNOWN]:
                sync_status = consts.ASSOCIATION_SYNC_STATUS_FAILED
            db_api.peer_group_association_update(
                self.context, association.id,
                sync_status=sync_status,
                sync_message=message)

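    # When the peer site becomes reachable again, recover the sync_status
    # of every primary association on this site and push it to the peer.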
    def _update_sync_status_when_secondary_site_becomes_reachable(self):
        # Get associations by system peer
        associations = SystemPeerManager.get_local_associations(self.context,
                                                                self.peer)
        for association in associations:
            # If the association is not primary, skip it.
            if association.association_type == consts.\
                    ASSOCIATION_TYPE_NON_PRIMARY:
                LOG.debug("Skip updating the peer site association "
                          "sync_status as the current site association "
                          "is not primary.")
                continue
            # Upon detecting that the secondary site is reachable again,
            # the PGA sync_status will be set for both sites by the primary
            # site monitor thread as follows:
            # "unknown" -> "in-sync"
            # "failed" -> "out-of-sync"
            sync_status = consts.ASSOCIATION_SYNC_STATUS_OUT_OF_SYNC
            if association.sync_status == \
                    consts.ASSOCIATION_SYNC_STATUS_UNKNOWN:
                sync_status = consts.ASSOCIATION_SYNC_STATUS_IN_SYNC
            dc_local_pg = db_api.subcloud_peer_group_get(
                self.context, association.peer_group_id)
            SystemPeerManager.update_sync_status(
                self.context, self.peer, sync_status, dc_local_pg,
                association=association)

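    # Main loop of the monitoring thread: poll the peer every
    # heartbeat_interval seconds, raise or clear the heartbeat alarm once
    # the failure threshold is crossed, and trigger peer group audits.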
    def _do_monitor_peer(self):
        failure_count = 0
        LOG.info("Start monitoring thread for peer %s" %
                 self.peer.peer_name)
        # Do the actual peer monitor.
        while not self.exit_flag.wait(timeout=self.peer.heartbeat_interval):
            try:
                # Get system peer from DB
                self.peer = db_api.system_peer_get(self.context, self.peer.id)
                failed, remote_pg_list = \
                    self._heartbeat_check_via_get_peer_group_list()
                if failed:
                    failure_count += 1
                    if failure_count >= self.peer.heartbeat_failure_threshold:
                        # heartbeat_failure_threshold reached.
                        LOG.warning("DC %s heartbeat failed, raising alarm" %
                                    self.peer.peer_name)
                        self._raise_failure()
                        db_api.system_peer_update(
                            self.context, self.peer.id,
                            availability_state=  # noqa: E251
                            consts.SYSTEM_PEER_AVAILABILITY_STATE_UNAVAILABLE
                        )
                        # pylint: disable=line-too-long
                        self._update_sync_status_when_secondary_site_becomes_unreachable()  # noqa: E501
                        failure_count = 0
                        self._set_require_audit_flag_to_associated_peer_groups()
                else:
                    failure_count = 0
                    self._audit_local_peer_groups(remote_pg_list)
                    if self.peer.availability_state != \
                            consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE:
                        db_api.system_peer_update(
                            self.context, self.peer.id,
                            availability_state=  # noqa: E251
                            consts.SYSTEM_PEER_AVAILABILITY_STATE_AVAILABLE
                        )
                        # pylint: disable=line-too-long
                        self._update_sync_status_when_secondary_site_becomes_reachable()  # noqa: E501
                        LOG.info("DC %s back online, clearing alarm" %
                                 self.peer.peer_name)
                        self._clear_failure()
            except Exception as e:
                LOG.exception("Got exception monitoring peer %s, error: %s" %
                              (self.peer.peer_name, e))
        LOG.info("Caught graceful exit signal for peer monitor %s" %
                 self.peer.peer_name)

    def _audit_local_peer_groups(self, remote_pg_list):
        # Generate a dict indexed by remote peer group name
        remote_pg_dict = {
            remote_peer_group.get("peer_group_name"): remote_peer_group
            for remote_peer_group in remote_pg_list
        }

        # Only audit peer groups that exist on both sides
        for peer_group_id, pgam_obj in self.peer_group_audit_obj_map.items():
            peer_group = db_api.subcloud_peer_group_get(self.context,
                                                        peer_group_id)
            if peer_group.peer_group_name in remote_pg_dict:
                remote_peer_group = remote_pg_dict[peer_group.peer_group_name]
                # Audit if require_audit_flag is True or the remote peer
                # group is in 'complete' migration state.
                if (pgam_obj.require_audit_flag
                        or remote_peer_group.get("migration_status") ==
                        consts.PEER_GROUP_MIGRATION_COMPLETE):
                    pgam_obj.audit_peer_group_from_system(
                        self.peer, remote_peer_group, peer_group)
            else:
                LOG.warning("Peer group %s not found on remote DC %s; "
                            "nothing to audit, sync operation is needed" %
                            (peer_group.peer_group_name, self.peer.peer_name))

    def _set_require_audit_flag_to_associated_peer_groups(self):
        for pgam_obj in self.peer_group_audit_obj_map.values():
            pgam_obj.require_audit_flag = True

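    # Audit a single local peer group against the remote peer group data
    # supplied by the peer site. Returns an error message if no audit
    # object exists for the group, otherwise None.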
    def audit_specific_local_peer_group(self, peer_group, remote_peer_group):
        msg = None
        if peer_group.id in self.peer_group_audit_obj_map:
            pgam_obj = self.peer_group_audit_obj_map[peer_group.id]
            pgam_obj.audit(self.peer, remote_peer_group, peer_group)
        else:
            msg = ("No peer group %s found" % peer_group.peer_group_name)
        return msg

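    # Stop and discard all peer group audit managers owned by this monitor.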
    def _clean_peer_group_audit_threads(self):
        for pgam_obj in self.peer_group_audit_obj_map.values():
            pgam_obj.stop()
        self.peer_group_audit_obj_map.clear()

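    # Reconcile the set of monitored peer groups: drop audit objects for
    # peer groups no longer associated with this peer and create audit
    # objects for newly associated ones.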
    def update_peer_group_id_set(self, peer_group_id_set):
        removed_peer_groups = self.peer_group_id_set - peer_group_id_set
        new_peer_groups = peer_group_id_set - self.peer_group_id_set

        # Destroy removed peer_group audit objects
        for peer_group_id in removed_peer_groups:
            LOG.info("Peer group [%s] removed from peer [%s]" %
                     (peer_group_id, self.peer.peer_name))
            if peer_group_id in self.peer_group_audit_obj_map:
                self.peer_group_audit_obj_map[peer_group_id].stop()
                del self.peer_group_audit_obj_map[peer_group_id]
        # Add new peer_group audit objects
        for peer_group_id in new_peer_groups:
            LOG.info("New peer group [%s] found for peer [%s]" %
                     (peer_group_id, self.peer.peer_name))
            self.peer_group_audit_obj_map[peer_group_id] = \
                pgam.PeerGroupAuditManager(self.subcloud_manager,
                                           peer_group_id)
        self.peer_group_id_set = peer_group_id_set
        self._set_require_audit_flag_to_associated_peer_groups()

    def start(self):
        if self.thread is not None:
            LOG.error('Peer monitor thread for %s has already started' %
                      self.peer.peer_name)
        else:
            self.thread = threading.Thread(target=self._do_monitor_peer)
            self.thread.start()

    def stop(self):
        self.exit_flag.set()
        self.thread.join()
        self._clear_failure()
        self._clean_peer_group_audit_threads()


class PeerMonitorManager(manager.Manager):
    """Manages tasks related to peer monitor."""

    def __init__(self, subcloud_manager):
        LOG.debug('PeerMonitorManager initialization...')

        super(PeerMonitorManager, self).__init__(
            service_name="peer_monitor_manager")
        self.peer_monitor = dict()
        self.context = context.get_admin_context()
        self.subcloud_manager = subcloud_manager

        # key: system_peer_id
        # value: PeerMonitor object
        self.peer_monitor_thread_map = dict()

    def _remove_peer_monitor_task(self, system_peer_id):
        peer_mon_obj = self.peer_monitor_thread_map[system_peer_id]
        peer_mon_obj.stop()
        del self.peer_monitor_thread_map[system_peer_id]

    def _create_peer_monitor_task(self, system_peer_id):
        peer = db_api.system_peer_get(self.context,
                                      system_peer_id)
        LOG.info("Create monitoring thread for peer: %s" %
                 peer.peer_name)
        self.peer_monitor_thread_map[system_peer_id] = PeerMonitor(
            peer, self.context, self.subcloud_manager)
        self.peer_monitor_thread_map[system_peer_id].start()

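    # Return the entries of dict1 whose keys are absent from dict2.
    # For example: _diff_dict({1: 'a', 2: 'b'}, {2: 'c'}) -> {1: 'a'}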
    @staticmethod
    def _diff_dict(dict1, dict2):
        return {key: value for key, value in dict1.items() if key not in dict2}

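    # Start monitor tasks for newly associated peers, stop tasks for peers
    # with no remaining associations, then refresh each task's peer group
    # id set.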
    def _create_or_destroy_peer_monitor_task(self, peer_system_peer_group_map):
        new_peers = self._diff_dict(peer_system_peer_group_map,
                                    self.peer_monitor_thread_map)
        removed_peers = self._diff_dict(self.peer_monitor_thread_map,
                                        peer_system_peer_group_map)
        for peer_id in new_peers:
            self._create_peer_monitor_task(peer_id)
        for peer_id in removed_peers:
            self._remove_peer_monitor_task(peer_id)

        # Update peer_group_id set
        for peer_id, pm_obj in self.peer_monitor_thread_map.items():
            pm_obj.update_peer_group_id_set(
                peer_system_peer_group_map[peer_id])

    def peer_monitor_notify(self, context):
        LOG.info("Caught peer monitor notify...")
        peer_system_peer_group_map = defaultdict(set)
        # Get local associations
        associations = db_api.peer_group_association_get_all(context)
        for association in associations:
            peer_system_peer_group_map[association.system_peer_id].add(
                association.peer_group_id)

        self._create_or_destroy_peer_monitor_task(peer_system_peer_group_map)

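    # Handle a peer group audit notification from the peer site: look up
    # the local peer group and the monitored system peer, then run the
    # audit with the supplied payload. Returns None on success, otherwise
    # a message describing why the audit was skipped or failed.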
    def peer_group_audit_notify(self, context, peer_group_name, payload):
        LOG.info("Caught peer group audit notification for peer group %s" %
                 peer_group_name)
        msg = None
        try:
            peer_group = db_api.subcloud_peer_group_get_by_name(
                context, peer_group_name)
            system_uuid = payload.get('peer_uuid')
            system_peer = db_api.system_peer_get_by_uuid(context,
                                                         system_uuid)
            if system_peer.id in self.peer_monitor_thread_map:
                pmobj = self.peer_monitor_thread_map[system_peer.id]
                msg = pmobj.audit_specific_local_peer_group(peer_group,
                                                            payload)
            else:
                msg = ("System peer with UUID=%s is not under monitoring. "
                       "Skipping audit for peer group %s" %
                       (system_uuid, peer_group_name))
                LOG.warning(msg)
            return msg
        except Exception as e:
            LOG.exception('Error handling peer group audit notification: %s' %
                          str(e))
            return str(e)