Optimize subcloud state manager's queries

This commit optimizes subcloud state manager's queries used to perform a
bulk update for a subcloud's availability and endpoint status, which
were being used in [1].
Previously, each database query was made separately for every update
for either the availability status and/or the endpoint(s) during the
audit process, which resulted in duplicated state and database calls.
In [1] the RPC state calls were significantly reduced by creating a
single request for each subcloud. Consequently, all queries that
were made in the database in separate steps of the process started to be
executed at once, resulting in approximately 23 [2] queries per subcloud
for a complete audit. With this commit, the maximum number of database
transactions is reduced from 23 to 3.

Test plan:
1. PASS: Unmanage a subcloud and verify that the sync status of all of
   its endpoints becomes unknown.
2. PASS: Manage a subcloud and verify that the sync status of all of
   its endpoints becomes in-sync.
3. PASS: Apply a patch in the system controller and verify that all of
   the subclouds' patching sync status becomes out-of-sync.
4. PASS: Apply the patch in the subclouds and verify that their patching
   sync status becomes in-sync.
5. PASS: Verify that the hourly unconditional update for the subcloud's
   availability status updates the database.

[1] https://review.opendev.org/c/starlingx/distcloud/+/922058
[2] Analysis of the number of requests considering dcmanager's audit
Subcloud becoming online:
- subcloud_get_by_region_name: 1
- fm's get_fault: 1
- fm's set_fault or clear_fault: 1
- subcloud_update: 1
- for each endpoint audited by dcmanager (7):
    - subcloud_get_by_region_name: 1 (removed)
    - subcloud_get_with_status: 1 (removed)
    - subcloud_endpoint_status_db_model_to_dict: 9 (removed)
    - subcloud_status_update: 1 (changed to one query for all endpoints)
    - fm's get_fault: 1
    - fm's set_fault or clear_fault: 1
Total:
    - 39 queries that are now 19, considering fm's database.
    - 23 queries that are now 3 in dcmanager's database

Note that the totals do not include the db_model_to_dict request
because it does not query the database.

Subcloud becoming offline:
- subcloud_get_by_region_name: 2 reduced to 1
- fm's get_fault: 10 (one for each endpoint and the availability)
- fm's set_fault or clear_fault: 10
- subcloud_get_with_status: 1 (removed)
- subcloud_endpoint_status_db_model_to_dict: 9 (removed)
- subcloud_status_update_endpoints: 1
- subcloud_update: 1

Story: 2011106
Task: 50433

Change-Id: I34b8604bf445cc0ebdc02c5959a919221e62de5a
Signed-off-by: Raphael Lima <Raphael.Lima@windriver.com>
This commit is contained in:
Raphael Lima
2024-06-21 17:57:15 -03:00
parent 7cfb86f0cc
commit 8797888d14
8 changed files with 428 additions and 157 deletions

View File

@@ -321,16 +321,8 @@ class SubcloudAuditWorkerManager(manager.Manager):
subcloud_management_ip = subcloud.management_start_ip subcloud_management_ip = subcloud.management_start_ip
audits_done = list() audits_done = list()
failures = list() failures = list()
availability_data = dict()
batch_request_data = { endpoint_data = dict()
"availability": None,
dccommon_consts.ENDPOINT_TYPE_PATCHING: None,
dccommon_consts.ENDPOINT_TYPE_LOAD: None,
dccommon_consts.ENDPOINT_TYPE_FIRMWARE: None,
dccommon_consts.ENDPOINT_TYPE_KUBERNETES: None,
dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA: None,
dccommon_consts.ENDPOINT_TYPE_SOFTWARE: None
}
# Set defaults to None and disabled so we will still set disabled # Set defaults to None and disabled so we will still set disabled
# status if we encounter an error. # status if we encounter an error.
@@ -427,12 +419,10 @@ class SubcloudAuditWorkerManager(manager.Manager):
LOG.debug('Setting new availability status: %s ' LOG.debug('Setting new availability status: %s '
'on subcloud: %s' % 'on subcloud: %s' %
(avail_to_set, subcloud_name)) (avail_to_set, subcloud_name))
batch_request_data.update({ availability_data.update({
"availability": { "availability_status": avail_to_set,
"availability_status": avail_to_set, "update_state_only": False,
"update_state_only": False, "audit_fail_count": audit_fail_count
"audit_fail_count": audit_fail_count
}
}) })
elif audit_fail_count != subcloud.audit_fail_count: elif audit_fail_count != subcloud.audit_fail_count:
@@ -448,12 +438,10 @@ class SubcloudAuditWorkerManager(manager.Manager):
# subcloud as an audit. # subcloud as an audit.
LOG.debug('Updating subcloud state unconditionally for subcloud %s' LOG.debug('Updating subcloud state unconditionally for subcloud %s'
% subcloud_name) % subcloud_name)
batch_request_data.update({ availability_data.update({
"availability": { "availability_status": avail_status_current,
"availability_status": avail_status_current, "update_state_only": True,
"update_state_only": True, "audit_fail_count": None
"audit_fail_count": None
}
}) })
# If subcloud is managed and online and the identity was synced once, # If subcloud is managed and online and the identity was synced once,
@@ -470,7 +458,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
# If we have patch audit data, audit the subcloud # If we have patch audit data, audit the subcloud
if do_patch_audit and patch_audit_data: if do_patch_audit and patch_audit_data:
try: try:
batch_request_data[dccommon_consts.ENDPOINT_TYPE_PATCHING] = ( endpoint_data[dccommon_consts.ENDPOINT_TYPE_PATCHING] = (
self.patch_audit.subcloud_patch_audit( self.patch_audit.subcloud_patch_audit(
keystone_client.session, sysinv_client, keystone_client.session, sysinv_client,
subcloud_management_ip, subcloud_name, subcloud_region, subcloud_management_ip, subcloud_name, subcloud_region,
@@ -484,7 +472,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
# Perform load audit # Perform load audit
if do_load_audit and patch_audit_data: if do_load_audit and patch_audit_data:
try: try:
batch_request_data[dccommon_consts.ENDPOINT_TYPE_LOAD] = ( endpoint_data[dccommon_consts.ENDPOINT_TYPE_LOAD] = (
self.patch_audit.subcloud_load_audit( self.patch_audit.subcloud_load_audit(
sysinv_client, subcloud_name, patch_audit_data sysinv_client, subcloud_name, patch_audit_data
) )
@@ -496,7 +484,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
# Perform firmware audit # Perform firmware audit
if do_firmware_audit: if do_firmware_audit:
try: try:
batch_request_data[dccommon_consts.ENDPOINT_TYPE_FIRMWARE] = ( endpoint_data[dccommon_consts.ENDPOINT_TYPE_FIRMWARE] = (
self.firmware_audit.subcloud_firmware_audit( self.firmware_audit.subcloud_firmware_audit(
sysinv_client, subcloud_name, firmware_audit_data sysinv_client, subcloud_name, firmware_audit_data
) )
@@ -508,7 +496,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
# Perform kubernetes audit # Perform kubernetes audit
if do_kubernetes_audit: if do_kubernetes_audit:
try: try:
batch_request_data[dccommon_consts.ENDPOINT_TYPE_KUBERNETES] = ( endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBERNETES] = (
self.kubernetes_audit.subcloud_kubernetes_audit( self.kubernetes_audit.subcloud_kubernetes_audit(
sysinv_client, subcloud_name, kubernetes_audit_data sysinv_client, subcloud_name, kubernetes_audit_data
) )
@@ -520,7 +508,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
# Perform kube rootca update audit # Perform kube rootca update audit
if do_kube_rootca_update_audit: if do_kube_rootca_update_audit:
try: try:
batch_request_data[dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA] = ( endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA] = (
self.kube_rootca_update_audit.subcloud_kube_rootca_audit( self.kube_rootca_update_audit.subcloud_kube_rootca_audit(
sysinv_client, fm_client, subcloud, sysinv_client, fm_client, subcloud,
kube_rootca_update_audit_data kube_rootca_update_audit_data
@@ -545,7 +533,7 @@ class SubcloudAuditWorkerManager(manager.Manager):
# Perform software audit # Perform software audit
if do_software_audit: if do_software_audit:
try: try:
batch_request_data[dccommon_consts.ENDPOINT_TYPE_SOFTWARE] = ( endpoint_data[dccommon_consts.ENDPOINT_TYPE_SOFTWARE] = (
self.software_audit.subcloud_software_audit( self.software_audit.subcloud_software_audit(
keystone_client, subcloud_management_ip, keystone_client, subcloud_management_ip,
subcloud_name, subcloud_region, software_audit_data subcloud_name, subcloud_region, software_audit_data
@@ -556,19 +544,23 @@ class SubcloudAuditWorkerManager(manager.Manager):
LOG.exception(failmsg % (subcloud.name, 'software')) LOG.exception(failmsg % (subcloud.name, 'software'))
failures.append('software') failures.append('software')
if any(batch_request_data.values()): if availability_data or (endpoint_data and any(endpoint_data.values())):
# If a value is not None, an update should be sent to dcmanager-state # If a value is not None, an update should be sent to the rpc client
try: try:
self.state_rpc_client.\ self.state_rpc_client.\
batch_update_subcloud_availability_and_endpoint_status( bulk_update_subcloud_availability_and_endpoint_status(
self.context, subcloud_name, subcloud_region, self.context, subcloud_name, subcloud_region,
batch_request_data availability_data, endpoint_data
) )
LOG.info('Notifying dcmanager-state, subcloud:%s, batch ' LOG.debug(
'availability and endpoint status update' % subcloud_name) f'Notifying dcmanager-state, subcloud: {subcloud_name}, bulk '
'availability and endpoint status update'
)
except Exception: except Exception:
LOG.exception('Failed to notify dcmanager-state of subcloud ' LOG.exception(
'batch availability and endpoint status update, ' 'Failed to notify dcmanager-state of subcloud batch '
'subcloud: %s' % subcloud_name) 'availability and endpoint status update, '
f'subcloud: {subcloud_name}'
)
return audits_done, failures return audits_done, failures

View File

@@ -296,6 +296,14 @@ def subcloud_status_update_endpoints(context, subcloud_id,
endpoint_type_list, sync_status) endpoint_type_list, sync_status)
def subcloud_status_bulk_update_endpoints(context, subcloud_id, endpoint_list):
    """Bulk update the sync status of a subcloud's endpoints.

    Delegates to the configured database backend (IMPL).

    :param context: request context object
    :param subcloud_id: id of the subcloud whose endpoints are updated
    :param endpoint_list: endpoints to update and the sync status of each
    :return: whatever the backend implementation returns
    """
    backend_result = IMPL.subcloud_status_bulk_update_endpoints(
        context, subcloud_id, endpoint_list
    )
    return backend_result
def subcloud_status_destroy_all(context, subcloud_id): def subcloud_status_destroy_all(context, subcloud_id):
"""Destroy all the statuses for a subcloud """Destroy all the statuses for a subcloud

View File

@@ -30,12 +30,14 @@ from oslo_log import log as logging
from oslo_utils import strutils from oslo_utils import strutils
from oslo_utils import uuidutils from oslo_utils import uuidutils
import sqlalchemy import sqlalchemy
from sqlalchemy import bindparam
from sqlalchemy import desc from sqlalchemy import desc
from sqlalchemy import or_ from sqlalchemy import or_
from sqlalchemy.orm import exc from sqlalchemy.orm import exc
from sqlalchemy.orm import joinedload_all from sqlalchemy.orm import joinedload_all
from sqlalchemy.orm import load_only from sqlalchemy.orm import load_only
from sqlalchemy.sql.expression import true from sqlalchemy.sql.expression import true
from sqlalchemy import update
from dccommon import consts as dccommon_consts from dccommon import consts as dccommon_consts
from dcmanager.common import consts from dcmanager.common import consts
@@ -551,6 +553,14 @@ def subcloud_status_get_all(context, subcloud_id):
filter(models.Subcloud.id == subcloud_id).all() filter(models.Subcloud.id == subcloud_id).all()
@require_context
def _subcloud_status_get_by_endpoint_types(context, subcloud_id, endpoint_types):
    """Return the non-deleted status rows of a subcloud for the given endpoints.

    :param context: request context object
    :param subcloud_id: id of the subcloud that owns the status rows
    :param endpoint_types: endpoint types whose status rows are wanted
    :return: list of matching SubcloudStatus rows
    """
    status_query = model_query(context, models.SubcloudStatus)
    # Skip soft-deleted rows
    status_query = status_query.filter_by(deleted=0)
    status_query = status_query.filter(
        models.SubcloudStatus.subcloud_id == subcloud_id
    )
    status_query = status_query.filter(
        models.SubcloudStatus.endpoint_type.in_(endpoint_types)
    )
    return status_query.all()
@require_context @require_context
def subcloud_status_get_all_by_name(context, name): def subcloud_status_get_all_by_name(context, name):
return model_query(context, models.SubcloudStatus). \ return model_query(context, models.SubcloudStatus). \
@@ -620,6 +630,46 @@ def subcloud_status_update_endpoints(context, subcloud_id,
return result return result
@require_admin_context
def subcloud_status_bulk_update_endpoints(context, subcloud_id, endpoint_list):
    """Update the sync status of several endpoints of a subcloud at once

    :param context: request context object
    :param subcloud_id: id of the subcloud whose endpoint statuses are updated
    :param endpoint_list: mapping of endpoint type to the sync status to set
    :return: the result object returned by the session for the bulk update
    :raises SubcloudStatusNotFound: if no subcloud status row exists for any
        of the requested endpoint types
    """

    # Retrieve the subcloud status rows for all of the endpoints in
    # endpoint_list. The keys are handed over as a plain list.
    subcloud_statuses = _subcloud_status_get_by_endpoint_types(
        context, subcloud_id, list(endpoint_list)
    )

    # Pair the id of each subcloud status row that needs to be updated with
    # its respective sync_status
    update_list = [
        {
            "_id": subcloud_status.id,
            "sync_status": endpoint_list[subcloud_status.endpoint_type]
        }
        for subcloud_status in subcloud_statuses
    ]

    # The truthiness of the object returned by session.execute() does not say
    # whether any row matched, so the "not found" condition is decided from
    # the rows retrieved above. This also avoids issuing the UPDATE with an
    # empty parameter list, which leaves the bind parameters without values.
    if not update_list:
        raise exception.SubcloudStatusNotFound(
            subcloud_id=subcloud_id, endpoint_type="any"
        )

    # Bindparam associates keys from update_list to columns in the database
    # query. This way, for each of the items that needs update, it's possible to
    # set a specific sync_status, i.e. the query is capable of updating many
    # endpoints with each of them having one of three values:
    # in-sync, out-of-sync and unknown.
    with write_session() as session:
        statement = update(models.SubcloudStatus).\
            where(models.SubcloudStatus.id == bindparam("_id")).\
            values(sync_status=bindparam("sync_status"))

        result = session.execute(statement, update_list)

    return result
@require_admin_context @require_admin_context
def subcloud_status_destroy_all(context, subcloud_id): def subcloud_status_destroy_all(context, subcloud_id):
with write_session() as session: with write_session() as session:

View File

@@ -69,15 +69,17 @@ class SubcloudStateClient(RPCClient):
consts.TOPIC_DC_MANAGER_STATE, consts.TOPIC_DC_MANAGER_STATE,
self.BASE_RPC_API_VERSION) self.BASE_RPC_API_VERSION)
def batch_update_subcloud_availability_and_endpoint_status( def bulk_update_subcloud_availability_and_endpoint_status(
self, ctxt, subcloud_name, subcloud_region, availability_and_endpoint_data self, ctxt, subcloud_name, subcloud_region, availability_data,
endpoint_data
): ):
# Note: This is an asynchronous operation. # Note: This is an asynchronous operation.
return self.cast(ctxt, self.make_msg( return self.cast(ctxt, self.make_msg(
'batch_update_subcloud_availability_and_endpoint_status', 'bulk_update_subcloud_availability_and_endpoint_status',
subcloud_name=subcloud_name, subcloud_name=subcloud_name,
subcloud_region=subcloud_region, subcloud_region=subcloud_region,
availability_and_endpoint_data=availability_and_endpoint_data) availability_data=availability_data,
endpoint_data=endpoint_data)
) )
def update_subcloud_availability(self, ctxt, def update_subcloud_availability(self, ctxt,

View File

@@ -191,16 +191,17 @@ class DCManagerStateService(service.Service):
audit_fail_count, audit_fail_count,
) )
def batch_update_subcloud_availability_and_endpoint_status( def bulk_update_subcloud_availability_and_endpoint_status(
self, context, subcloud_name, subcloud_region, availability_and_endpoint_data self, context, subcloud_name, subcloud_region, availability_data,
endpoint_data
): ):
LOG.info( LOG.info(
"Handling batch_update_subcloud_availability_and_endpoint_status request " "Handling bulk_update_subcloud_availability_and_endpoint_status request "
f"for subcloud: {subcloud_name}" f"for subcloud: {subcloud_name}"
) )
self.subcloud_state_manager.\ self.subcloud_state_manager.\
batch_update_subcloud_availability_and_endpoint_status( bulk_update_subcloud_availability_and_endpoint_status(
context, subcloud_name, subcloud_region, context, subcloud_name, subcloud_region, availability_data,
availability_and_endpoint_data endpoint_data
) )

View File

@@ -19,6 +19,7 @@
from fm_api import constants as fm_const from fm_api import constants as fm_const
from fm_api import fm_api from fm_api import fm_api
from oslo_concurrency import lockutils
from oslo_log import log as logging from oslo_log import log as logging
from dccommon import consts as dccommon_consts from dccommon import consts as dccommon_consts
@@ -34,13 +35,14 @@ from dcorch.rpc import client as dcorch_rpc_client
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
ALARM_OUT_OF_SYNC = fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC ALARM_OUT_OF_SYNC = fm_const.FM_ALARM_ID_DC_SUBCLOUD_RESOURCE_OUT_OF_SYNC
LOCK_NAME = "dc-audit-bulk-update"
def sync_update_subcloud_endpoint_status(func): def sync_update_subcloud_endpoint_status(func):
"""Synchronized lock decorator for _update_subcloud_endpoint_status. """ """Synchronized lock decorator for _update_subcloud_endpoint_status."""
def _get_lock_and_call(*args, **kwargs): def _get_lock_and_call(*args, **kwargs):
"""Get a single fair lock per subcloud based on subcloud region. """ """Get a single fair lock per subcloud based on subcloud region."""
# subcloud region is the 3rd argument to # subcloud region is the 3rd argument to
# _update_subcloud_endpoint_status() # _update_subcloud_endpoint_status()
@@ -264,6 +266,43 @@ class SubcloudStateManager(manager.Manager):
else: else:
LOG.error("Subcloud not found:%s" % subcloud_id) LOG.error("Subcloud not found:%s" % subcloud_id)
def _should_update_endpoint_status(self, subcloud, endpoint_type, sync_status):
    """Tell whether an endpoint's sync status should be updated for a subcloud

    :param subcloud: subcloud object whose availability, management and
        deploy states are inspected
    :param endpoint_type: endpoint type the new status applies to
    :param sync_status: sync status that would be set
    :return: True when the update should be applied, False otherwise
    """
    # Rules for updating sync status:
    #
    # For secondary subclouds, only update if the new sync_status is
    # 'unknown'
    #
    # For others, always update if not in-sync.
    #
    # Otherwise, only update the sync status if managed and online
    # (unless dc-cert).
    #
    # Most endpoints are audited only when the subcloud is managed and
    # online. An exception is the dc-cert endpoint, which is audited
    # whenever the subcloud is online (managed or unmanaged).
    #
    # This means if a subcloud is going offline or unmanaged, then
    # the sync status update must be done first.
    #
    wants_in_sync = sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC
    subcloud_online = (
        subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE
    )
    subcloud_managed = (
        subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED
    )
    dc_cert_endpoint = endpoint_type == dccommon_consts.ENDPOINT_TYPE_DC_CERT
    subcloud_secondary = subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY
    wants_unknown = sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN

    # Secondary subclouds only accept the 'unknown' status
    if subcloud_secondary:
        return wants_unknown

    # Any status other than in-sync is always applied
    if not wants_in_sync:
        return True

    # in-sync requires an online subcloud that is managed, or the dc-cert
    # endpoint
    return subcloud_online and (subcloud_managed or dc_cert_endpoint)
@sync_update_subcloud_endpoint_status @sync_update_subcloud_endpoint_status
def _update_subcloud_endpoint_status( def _update_subcloud_endpoint_status(
self, context, self, context,
@@ -297,47 +336,13 @@ class SubcloudStateManager(manager.Manager):
LOG.exception(e) LOG.exception(e)
raise e raise e
# Rules for updating sync status: if self._should_update_endpoint_status(subcloud, endpoint_type, sync_status):
#
# For secondary subclouds, only update if the new sync_status is
# 'unknown'
#
# For others, always update if not in-sync.
#
# Otherwise, only update the sync status if managed and online
# (unless dc-cert).
#
# Most endpoints are audited only when the subcloud is managed and
# online. An exception is the dc-cert endpoint, which is audited
# whenever the subcloud is online (managed or unmanaged).
#
# This means if a subcloud is going offline or unmanaged, then
# the sync status update must be done first.
#
is_in_sync = sync_status == dccommon_consts.SYNC_STATUS_IN_SYNC
is_online = subcloud.availability_status == \
dccommon_consts.AVAILABILITY_ONLINE
is_managed = subcloud.management_state == \
dccommon_consts.MANAGEMENT_MANAGED
is_endpoint_type_dc_cert = endpoint_type == \
dccommon_consts.ENDPOINT_TYPE_DC_CERT
is_secondary = subcloud.deploy_status == consts.DEPLOY_STATE_SECONDARY
is_sync_unknown = sync_status == dccommon_consts.SYNC_STATUS_UNKNOWN
is_secondary_and_sync_unknown = is_secondary and is_sync_unknown
if (
(not is_in_sync
or (is_online and (is_managed or is_endpoint_type_dc_cert)))
and not is_secondary
) or is_secondary_and_sync_unknown:
# update a single subcloud # update a single subcloud
try: try:
self._do_update_subcloud_endpoint_status(context, self._do_update_subcloud_endpoint_status(
subcloud.id, context, subcloud.id, endpoint_type, sync_status,
endpoint_type, alarmable, ignore_endpoints
sync_status, )
alarmable,
ignore_endpoints)
except Exception as e: except Exception as e:
LOG.exception(e) LOG.exception(e)
raise e raise e
@@ -347,23 +352,134 @@ class SubcloudStateManager(manager.Manager):
(subcloud.name, subcloud.availability_status, (subcloud.name, subcloud.availability_status,
subcloud.management_state, endpoint_type, sync_status)) subcloud.management_state, endpoint_type, sync_status))
def batch_update_subcloud_availability_and_endpoint_status( def bulk_update_subcloud_availability_and_endpoint_status(
self, context, subcloud_name, subcloud_region, availability_and_endpoint_data self, context, subcloud_name, subcloud_region, availability_data,
endpoint_data
): ):
for key, value in availability_and_endpoint_data.items(): # This bulk update is executed as part of the audit process in dcmanager and
# If the value is None, that means nothing should be done for that key # its related endpoints. This method is not used by dcorch and cert-mon.
if value is None:
continue
if key == "availability": try:
self.update_subcloud_availability( subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region)
context, subcloud_region, value["availability_status"], except Exception:
value["update_state_only"], value["audit_fail_count"] LOG.exception(
f"Failed to get subcloud by region name {subcloud_region}"
)
raise
if availability_data:
self.update_subcloud_availability(
context, subcloud_region, availability_data["availability_status"],
availability_data["update_state_only"],
availability_data["audit_fail_count"], subcloud
)
if endpoint_data:
self._bulk_update_subcloud_endpoint_status(
context, subcloud, endpoint_data
)
@lockutils.synchronized(LOCK_NAME)
def _do_bulk_update_subcloud_endpoint_status(
    self, context, subcloud, endpoint_list
):
    """Update the out-of-sync alarms and sync status of a subcloud's endpoints

    For every endpoint, the out-of-sync alarm is set or cleared so it matches
    the new sync status, then all sync statuses are written to the database
    in a single call. Failures while handling alarms or updating the database
    are logged and not re-raised.

    :param context: request context object
    :param subcloud: subcloud to update
    :param endpoint_list: mapping of endpoint type to the sync status to set
    """

    # This bulk update is executed as part of the audit process and, because of
    # that, the logic is similar to _do_update_subcloud_endpoint_status but with
    # the difference that only the required endpoints will be updated and
    # that'll happen at once.

    LOG.info(
        f"Updating endpoints on subcloud: {subcloud.name} "
        f"endpoints: {', '.join(endpoint_list.keys())}"
    )

    for endpoint, sync_status in endpoint_list.items():
        entity_instance_id = f"subcloud={subcloud.name}.resource={endpoint}"
        is_out_of_sync = sync_status == dccommon_consts.SYNC_STATUS_OUT_OF_SYNC

        fault = self.fm_api.get_fault(ALARM_OUT_OF_SYNC, entity_instance_id)

        # TODO(yuxing): batch clear all the out-of-sync alarms of a
        # given subcloud if fm_api support it. Be careful with the
        # dc-cert endpoint when adding the above; the endpoint
        # alarm must remain for offline subclouds.
        if not is_out_of_sync and fault:
            # An alarm exists but the endpoint is no longer out-of-sync
            try:
                self.fm_api.clear_fault(ALARM_OUT_OF_SYNC, entity_instance_id)
            except Exception as e:
                LOG.exception(e)
        elif not fault and is_out_of_sync:
            # The endpoint became out-of-sync and has no alarm yet
            entity_type_id = fm_const.FM_ENTITY_TYPE_SUBCLOUD
            try:
                fault = fm_api.Fault(
                    alarm_id=ALARM_OUT_OF_SYNC,
                    alarm_state=fm_const.FM_ALARM_STATE_SET,
                    entity_type_id=entity_type_id,
                    entity_instance_id=entity_instance_id,
                    severity=fm_const.FM_ALARM_SEVERITY_MAJOR,
                    reason_text=("%s %s sync_status is "
                                 "out-of-sync" %
                                 (subcloud.name, endpoint)),
                    alarm_type=fm_const.FM_ALARM_TYPE_0,
                    probable_cause=fm_const.ALARM_PROBABLE_CAUSE_2,
                    proposed_repair_action="If problem persists "
                                           "contact next level "
                                           "of support",
                    service_affecting=False)

                self.fm_api.set_fault(fault)
            except Exception as e:
                LOG.exception(e)

    try:
        db_api.subcloud_status_bulk_update_endpoints(
            context, subcloud.id, endpoint_list,
        )
    except Exception as e:
        # The two f-strings are concatenated, so the first one must end with
        # a space to keep "'s" and "endpoint" apart in the logged message.
        LOG.exception(
            f"An error occurred when updating the subcloud {subcloud.name}'s "
            f"endpoint status: {e}"
        )
def _bulk_update_subcloud_endpoint_status(
self, context, subcloud, endpoint_list
):
"""Update the sync status of a list of subcloud endpoints
:param context: current context object
:param subcloud: subcloud object
:param endpoint_list: list of endpoints to update and their sync status
"""
endpoints_to_update = dict()
for endpoint_type, sync_status in endpoint_list.items():
if self._should_update_endpoint_status(
subcloud, endpoint_type, sync_status
):
endpoints_to_update.update({endpoint_type: sync_status})
# Update all the necessary endpoints for a single subcloud
if endpoints_to_update:
try:
self._do_bulk_update_subcloud_endpoint_status(
context, subcloud, endpoints_to_update
) )
continue except Exception as e:
self.update_subcloud_endpoint_status( LOG.exception(e)
context, subcloud_region=subcloud_region, endpoint_type=key, raise e
sync_status=value else:
LOG.info(
"Ignoring bulk_update_subcloud_endpoint_status for subcloud: "
f"{subcloud.name} availability: {subcloud.availability_status} "
f"management: {subcloud.management_state} endpoints: "
f"{', '.join(endpoint_list.keys())}"
) )
def update_subcloud_endpoint_status( def update_subcloud_endpoint_status(
@@ -461,14 +577,16 @@ class SubcloudStateManager(manager.Manager):
def update_subcloud_availability(self, context, subcloud_region, def update_subcloud_availability(self, context, subcloud_region,
availability_status, availability_status,
update_state_only=False, update_state_only=False,
audit_fail_count=None): audit_fail_count=None, subcloud=None):
try: if subcloud is None:
subcloud = db_api.subcloud_get_by_region_name(context, subcloud_region) try:
except Exception: subcloud = db_api.subcloud_get_by_region_name(context,
LOG.exception( subcloud_region)
"Failed to get subcloud by region name %s" % subcloud_region except Exception:
) LOG.exception(
raise "Failed to get subcloud by region name %s" % subcloud_region
)
raise
if update_state_only: if update_state_only:
# Ensure that the status alarm is consistent with the # Ensure that the status alarm is consistent with the
@@ -502,9 +620,14 @@ class SubcloudStateManager(manager.Manager):
if availability_status == dccommon_consts.AVAILABILITY_OFFLINE: if availability_status == dccommon_consts.AVAILABILITY_OFFLINE:
# Subcloud is going offline, set all endpoint statuses to # Subcloud is going offline, set all endpoint statuses to
# unknown. # unknown.
self._update_subcloud_endpoint_status( endpoint_list = dict()
context, subcloud.region_name, endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_UNKNOWN) for endpoint in dccommon_consts.ENDPOINT_TYPES_LIST:
endpoint_list[endpoint] = dccommon_consts.SYNC_STATUS_UNKNOWN
self._bulk_update_subcloud_endpoint_status(
context, subcloud, endpoint_list
)
try: try:
updated_subcloud = db_api.subcloud_update( updated_subcloud = db_api.subcloud_update(

View File

@@ -364,32 +364,33 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self.mock_sysinv_client().get_applications.return_value = \ self.mock_sysinv_client().get_applications.return_value = \
FAKE_APPLICATIONS FAKE_APPLICATIONS
self.batch_state_request_data = { self.availability_data = dict()
"availability": None, self.endpoint_data = dict()
dccommon_consts.ENDPOINT_TYPE_PATCHING: None,
dccommon_consts.ENDPOINT_TYPE_LOAD: None,
dccommon_consts.ENDPOINT_TYPE_FIRMWARE: None,
dccommon_consts.ENDPOINT_TYPE_KUBERNETES: None,
dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA: None,
dccommon_consts.ENDPOINT_TYPE_SOFTWARE: None
}
def _update_availability( def _update_availability(
self, availability_status, update_status_only, audit_fail_count self, availability_status, update_status_only, audit_fail_count
): ):
self.batch_state_request_data.update({ self.availability_data.update({
"availability": { "availability_status": availability_status,
"availability_status": availability_status, "update_state_only": update_status_only,
"update_state_only": update_status_only, "audit_fail_count": audit_fail_count
"audit_fail_count": audit_fail_count
}
}) })
def _set_all_audits_in_sync(self): def _set_all_audits_in_sync(self):
for key in self.batch_state_request_data: self.endpoint_data.update({
if key != "availability": dccommon_consts.ENDPOINT_TYPE_PATCHING:
self.batch_state_request_data[key] = \ dccommon_consts.SYNC_STATUS_IN_SYNC,
dccommon_consts.SYNC_STATUS_IN_SYNC dccommon_consts.ENDPOINT_TYPE_LOAD:
dccommon_consts.SYNC_STATUS_IN_SYNC,
dccommon_consts.ENDPOINT_TYPE_FIRMWARE:
dccommon_consts.SYNC_STATUS_IN_SYNC,
dccommon_consts.ENDPOINT_TYPE_KUBERNETES:
dccommon_consts.SYNC_STATUS_IN_SYNC,
dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA:
dccommon_consts.SYNC_STATUS_IN_SYNC,
dccommon_consts.ENDPOINT_TYPE_SOFTWARE:
dccommon_consts.SYNC_STATUS_IN_SYNC
})
@staticmethod @staticmethod
def create_subcloud_static(ctxt, **kwargs): def create_subcloud_static(ctxt, **kwargs):
@@ -477,10 +478,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0) self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
self._set_all_audits_in_sync() self._set_all_audits_in_sync()
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_once_with( assert_called_once_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -576,10 +577,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud was set to online # Verify the subcloud was set to online
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0) self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_with( assert_called_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -652,10 +653,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud was set to online # Verify the subcloud was set to online
self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0) self._update_availability(dccommon_consts.AVAILABILITY_ONLINE, False, 0)
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_with( assert_called_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -712,7 +713,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not updated # Verify the subcloud state was not updated
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_not_called() assert_not_called()
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -762,10 +763,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
dccommon_consts.AVAILABILITY_ONLINE, True, None dccommon_consts.AVAILABILITY_ONLINE, True, None
) )
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_with( assert_called_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -880,10 +881,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the state was called only for the audits # Verify the state was called only for the audits
self._set_all_audits_in_sync() self._set_all_audits_in_sync()
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_once_with( assert_called_once_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
# Update the DB like dcmanager would do. # Update the DB like dcmanager would do.
@@ -914,10 +915,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not called # Verify the subcloud state was not called
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_once_with( assert_called_once_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
# Verify alarm update is called only once # Verify alarm update is called only once
@@ -989,7 +990,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not updated # Verify the subcloud state was not updated
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_not_called() assert_not_called()
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -1108,10 +1109,10 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify that the subcloud was updated to offline # Verify that the subcloud was updated to offline
self._update_availability(dccommon_consts.AVAILABILITY_OFFLINE, False, 2) self._update_availability(dccommon_consts.AVAILABILITY_OFFLINE, False, 2)
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_called_with( assert_called_with(
mock.ANY, subcloud.name, subcloud.region_name, mock.ANY, subcloud.name, subcloud.region_name,
self.batch_state_request_data self.availability_data, self.endpoint_data
) )
def test_audit_subcloud_offline_update_audit_fail_count_only(self): def test_audit_subcloud_offline_update_audit_fail_count_only(self):
@@ -1173,7 +1174,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not updated # Verify the subcloud state was not updated
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_not_called() assert_not_called()
# Verify the openstack endpoints were not updated # Verify the openstack endpoints were not updated
@@ -1246,7 +1247,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not updated # Verify the subcloud state was not updated
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_not_called() assert_not_called()
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -1314,7 +1315,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not updated # Verify the subcloud state was not updated
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_not_called() assert_not_called()
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called
@@ -1381,7 +1382,7 @@ class TestAuditWorkerManager(base.DCManagerTestCase):
# Verify the subcloud state was not updated # Verify the subcloud state was not updated
self.mock_dcmanager_state_api().\ self.mock_dcmanager_state_api().\
batch_update_subcloud_availability_and_endpoint_status.\ bulk_update_subcloud_availability_and_endpoint_status.\
assert_not_called() assert_not_called()
# Verify the _update_subcloud_audit_fail_count is not called # Verify the _update_subcloud_audit_fail_count is not called

View File

@@ -2250,6 +2250,104 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
self.mock_dcmanager_api().subcloud_online.\ self.mock_dcmanager_api().subcloud_online.\
assert_called_once_with(self.ctx, self.subcloud.region_name) assert_called_once_with(self.ctx, self.subcloud.region_name)
def test_bulk_update_subcloud_availability_and_endpoint_status(self):
    """Bulk update stores the availability and the requested sync statuses.

    The subcloud is first set to online and managed, then a single bulk
    update requests a new availability status together with new sync
    statuses for a subset of the endpoints. The availability and the listed
    endpoints must match the request, while every endpoint that was not
    listed must keep the sync status it had before the call.
    """
    requested_availability = {
        "availability_status": dccommon_consts.AVAILABILITY_OFFLINE,
        "update_state_only": False,
        "audit_fail_count": 1
    }
    requested_sync_status = {
        dccommon_consts.ENDPOINT_TYPE_LOAD: dccommon_consts.SYNC_STATUS_IN_SYNC,
        dccommon_consts.ENDPOINT_TYPE_FIRMWARE:
            dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
    }

    # Snapshot of the endpoint statuses taken before the bulk update so the
    # untouched endpoints can be compared afterwards
    statuses_before = db_api.subcloud_status_get_all(
        self.ctx, self.subcloud.id
    )

    db_api.subcloud_update(
        self.ctx, self.subcloud.id,
        availability_status=dccommon_consts.AVAILABILITY_ONLINE,
        management_state=dccommon_consts.MANAGEMENT_MANAGED
    )

    state_manager = subcloud_state_manager.SubcloudStateManager()
    state_manager.bulk_update_subcloud_availability_and_endpoint_status(
        self.ctx, self.subcloud.name, self.subcloud.region_name,
        requested_availability, requested_sync_status
    )

    # The availability status must be the requested one
    subcloud_after = db_api.subcloud_get(self.ctx, self.subcloud.id)
    self.assertEqual(
        subcloud_after.availability_status,
        requested_availability["availability_status"]
    )

    statuses_after = db_api.subcloud_status_get_all(
        self.ctx, self.subcloud.id
    )

    for position, before in enumerate(statuses_before):
        after = statuses_after[position]

        # Both queries are expected to list the endpoints in the same order
        self.assertEqual(before.endpoint_type, after.endpoint_type)

        if before.endpoint_type not in requested_sync_status:
            # Endpoints absent from the request must be left unchanged
            self.assertEqual(before.sync_status, after.sync_status)
            continue

        self.assertEqual(
            after.sync_status, requested_sync_status[before.endpoint_type]
        )
@mock.patch.object(
    db_api, "subcloud_status_bulk_update_endpoints",
    wraps=db_api.subcloud_status_bulk_update_endpoints
)
def test_bulk_update_endpoint_status_when_endpoint_status_is_the_same(
    self, mock_db
):
    """Repeating a bulk endpoint update still issues the database query.

    The same endpoint data is sent twice for an online and managed subcloud.
    On the second request the endpoints already hold the requested sync
    status, and the wrapped database bulk update is expected to be invoked
    once more instead of being skipped.
    """
    endpoint_data = {
        dccommon_consts.ENDPOINT_TYPE_LOAD: dccommon_consts.SYNC_STATUS_IN_SYNC,
        dccommon_consts.ENDPOINT_TYPE_FIRMWARE:
            dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
    }

    db_api.subcloud_update(
        self.ctx, self.subcloud.id,
        availability_status=dccommon_consts.AVAILABILITY_ONLINE,
        management_state=dccommon_consts.MANAGEMENT_MANAGED
    )

    ssm = subcloud_state_manager.SubcloudStateManager()

    # Each request, including the repeated one that carries no new values,
    # must add exactly one call to the database bulk update
    for expected_call_count in (1, 2):
        ssm.bulk_update_subcloud_availability_and_endpoint_status(
            self.ctx, self.subcloud.name, self.subcloud.region_name,
            None, endpoint_data
        )
        self.assertEqual(mock_db.call_count, expected_call_count)
def test_bulk_update_fails_with_invalid_region(self):
    """Bulk update raises SubcloudRegionNameNotFound for the region "fake"."""
    state_manager = subcloud_state_manager.SubcloudStateManager()
    bulk_update = (
        state_manager.bulk_update_subcloud_availability_and_endpoint_status
    )
    unknown_region = "fake"

    # No availability or endpoint data is sent; only the region is invalid
    self.assertRaises(
        exceptions.SubcloudRegionNameNotFound,
        bulk_update,
        self.ctx, self.subcloud.name, unknown_region, None, None
    )
@mock.patch.object(subcloud_state_manager.SubcloudStateManager, @mock.patch.object(subcloud_state_manager.SubcloudStateManager,
'_raise_or_clear_subcloud_status_alarm') '_raise_or_clear_subcloud_status_alarm')
def test_update_state_only(self, mock_update_status_alarm): def test_update_state_only(self, mock_update_status_alarm):
@@ -2336,11 +2434,7 @@ class TestSubcloudUpdate(BaseTestSubcloudManager):
ssm = subcloud_state_manager.SubcloudStateManager() ssm = subcloud_state_manager.SubcloudStateManager()
# create sync statuses for endpoints and set them to in-sync # create sync statuses for endpoints and set them to in-sync
for endpoint in [dccommon_consts.ENDPOINT_TYPE_PLATFORM, for endpoint in dccommon_consts.ENDPOINT_TYPES_LIST:
dccommon_consts.ENDPOINT_TYPE_IDENTITY,
dccommon_consts.ENDPOINT_TYPE_PATCHING,
dccommon_consts.ENDPOINT_TYPE_FM,
dccommon_consts.ENDPOINT_TYPE_NFV]:
db_api.subcloud_status_create( db_api.subcloud_status_create(
self.ctx, self.subcloud.id, endpoint) self.ctx, self.subcloud.id, endpoint)
ssm.update_subcloud_endpoint_status( ssm.update_subcloud_endpoint_status(