The master resource cache was previously only invalidated at the start
of an audit cycle. Since sync and audit tasks can be handled by
different workers, a worker that only performs syncs could use a stale
cache for an extended period.
This commit introduces a time-based expiry for the cache, set to the
audit check interval (300 seconds). The cache is now reset if it's
older than the expiry time. This check is performed at the beginning
of each sync and audit, ensuring cache freshness across all workers.
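
For illustration, a minimal sketch of the expiry check introduced here
(simplified from sync_thread.py; the helper name and module-level layout
below are stand-ins, not the exact code of this change):

    import collections
    import datetime
    from oslo_utils import timeutils

    CHECK_AUDIT_INTERVAL = 300  # seconds, same value as the audit check interval

    master_resources_dict = collections.defaultdict(dict)
    master_resources_dict["last_reset_time"] = timeutils.utcnow()

    def reset_cache_if_expired(force_reset=False):
        # Reset the cached master resources when forced, or when the cache
        # is older than the audit check interval.
        last_reset = master_resources_dict.get("last_reset_time")
        expired = last_reset and (
            timeutils.utcnow() - last_reset
            > datetime.timedelta(seconds=CHECK_AUDIT_INTERVAL)
        )
        if force_reset or expired:
            master_resources_dict.clear()
            master_resources_dict["last_reset_time"] = timeutils.utcnow()

In the actual change, this check runs under the audit lock and also skips
the reset when the cache was refreshed within the last few seconds, so
concurrent sync and audit workers do not thrash the cache.
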
Test Plan:
- PASS: Soak subcloud audits and verify the cache is forcibly reset
at the start of each audit cycle.
- PASS: After an audit, create new Keystone roles using dcorch's
proxy. Verify that subsequent sync operations on workers
with expired caches trigger a cache reset, while workers
that recently ran an audit do not reset their cache until
it expires.
- PASS: Create a new identity resource right after the audit in a
        scale environment. Verify that the cache is reset after the
        resource is not found, but is not reset again if the previous
        reset happened less than 5 seconds earlier.
Closes-bug: 2119981
Change-Id: I2859778219814769ccee490b9d6ed5056a4b8028
Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
# Copyright (c) 2017-2025 Wind River Systems, Inc.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import datetime
import eventlet
import threading

from oslo_concurrency import lockutils
from oslo_config import cfg
from oslo_log import log as logging
from oslo_utils import timeutils

from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack import sdk_platform as sdk
from dccommon.endpoint_cache import EndpointCache
from dccommon import utils as cutils
from dcdbsync.dbsyncclient import client as dbsyncclient
from dcmanager.rpc import client as dcmanager_rpc_client
from dcorch.common import consts
from dcorch.common import context
from dcorch.common import exceptions
from dcorch.common import utils
from dcorch.db import api as db_api
from dcorch.engine.fernet_key_manager import FERNET_REPO_MASTER_ID
from dcorch.objects import orchrequest
from dcorch.objects import resource
from dcorch.objects.subcloud import Subcloud
from dcorch.objects import subcloud_resource


from keystoneclient import client as keystoneclient


# Some of the variables defined in this file cannot be recognized by the
# current pylint check, thus will raise error which will fail tox check
# The pylint check is temporarily skipped on this file
# pylint: skip-file
LOG = logging.getLogger(__name__)

SYNC_TIMEOUT = 600  # Timeout for subcloud sync

# sync request states, should be in SyncRequest class
STATE_QUEUED = "queued"
STATE_IN_PROGRESS = "in-progress"
STATE_TIMEDOUT = "timedout"
STATE_ABORTED = "aborted"
STATE_FAILED = "failed"
STATE_COMPLETED = "completed"

# Audit findings
AUDIT_RESOURCE_MISSING = "missing"
AUDIT_RESOURCE_EXTRA = "extra_resource"

AUDIT_LOCK_NAME = "dcorch-audit"


def get_master_os_client(region_clients=None):
    # Used by the master clients only. The subcloud clients don't need to be
    # cached in the openstack driver, because we don't want to hold the admin
    # sessions for the subclouds.
    try:
        os_client = sdk.OpenStackDriver(region_clients=region_clients)
    except Exception as e:
        LOG.error(
            "Failed to get os_client for "
            f"{cutils.get_region_one_name()}/{region_clients}: {e}."
        )
        raise e
    return os_client


class SyncThread(object):
    """Manages tasks related to resource management."""

    MAX_RETRY = 3
    PENDING_SYNC_REQUEST_STATES = (
        consts.ORCH_REQUEST_QUEUED,
        consts.ORCH_REQUEST_IN_PROGRESS,
        consts.ORCH_REQUEST_FAILED,
    )

    # used by the audit to cache the master resources
    master_resources_dict = collections.defaultdict(dict)
    master_resources_dict["last_reset_time"] = timeutils.utcnow()

    def __init__(
        self,
        subcloud_name,
        endpoint_type=None,
        management_ip=None,
        software_version=None,
        subcloud_id=None,
        engine_id=None,
    ):
        self.endpoint_type = endpoint_type  # endpoint type
        self.subcloud_name = subcloud_name  # subcloud name
        self.management_ip = management_ip
        self.software_version = software_version
        self.subcloud_id = subcloud_id
        self.engine_id = engine_id
        self.ctxt = context.get_admin_context()
        self.sync_handler_map = {}
        self.master_region_name = cutils.get_region_one_name()
        self.audit_resources = []

        self.log_extra = {"instance": self.subcloud_name + ": "}
        self.dcmanager_state_rpc_client = dcmanager_rpc_client.SubcloudStateClient()
        self.dcmanager_rpc_client = dcmanager_rpc_client.ManagerClient()

        self.sc_admin_session = None
        self.sc_auth_url = None
        self.admin_session = None
        self.ks_client = None
        self.dbs_client = None

    def should_exit(self):
        # Return whether the sync/audit threads should exit.
        try:
            db_api.subcloud_sync_get(self.ctxt, self.subcloud_name, self.endpoint_type)
        except exceptions.SubcloudSyncNotFound:
            return True

        return False

    def is_subcloud_managed(self):
        # is this subcloud managed
        subcloud = Subcloud.get_by_name(self.ctxt, self.subcloud_name)
        return subcloud.management_state == dccommon_consts.MANAGEMENT_MANAGED

    def is_subcloud_enabled(self):
        # is this subcloud enabled
        subcloud = Subcloud.get_by_name(self.ctxt, self.subcloud_name)

        # We only enable syncing if the subcloud is online and the initial
        # sync has completed.
        if subcloud.availability_status == dccommon_consts.AVAILABILITY_ONLINE and (
            subcloud.initial_sync_state == consts.INITIAL_SYNC_STATE_COMPLETED
        ):
            return True
        else:
            return False

    def initialize(self):
        # base implementation of initializing the master client.
        # The specific SyncThread subclasses may extend this.

        if self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST:
            config = cfg.CONF.endpoint_cache
            self.admin_session = EndpointCache.get_admin_session(
                config.auth_uri,
                timeout=60,
            )
        elif self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST_OS:
            config = cfg.CONF.openstack_cache
            self.admin_session = EndpointCache.get_admin_session(
                config.auth_uri,
                config.admin_username,
                config.admin_user_domain_name,
                config.admin_password,
                config.admin_tenant,
                config.admin_project_domain_name,
                timeout=60,
            )
        else:
            raise exceptions.EndpointNotSupported(endpoint=self.endpoint_type)

        # keystone client
        self.ks_client = keystoneclient.Client(
            session=self.admin_session, region_name=cutils.get_region_one_name()
        )
        # dcdbsync client
        self.dbs_client = dbsyncclient.Client(
            endpoint_type=consts.DBS_ENDPOINT_INTERNAL,
            session=self.admin_session,
            region_name=cutils.get_region_one_name(),
        )

    def initialize_sc_clients(self):
        # base implementation of initializing the subcloud specific
        # clients, only used by the subclasses.
        # The specific SyncThread subclasses may extend this
        if not self.sc_admin_session:
            # Subclouds will use token from the Subcloud specific Keystone,
            # so define a session against that subcloud's keystone endpoint
            self.sc_auth_url = cutils.build_subcloud_endpoint(
                self.management_ip, dccommon_consts.ENDPOINT_NAME_KEYSTONE
            )
            LOG.debug(
                f"Built sc_auth_url {self.sc_auth_url} for subcloud "
                f"{self.subcloud_name}"
            )

            if self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST:
                self.sc_admin_session = EndpointCache.get_admin_session(
                    self.sc_auth_url,
                    timeout=60,
                )
            elif self.endpoint_type in dccommon_consts.ENDPOINT_TYPES_LIST_OS:
                config = cfg.CONF.openstack_cache
                self.sc_admin_session = EndpointCache.get_admin_session(
                    self.sc_auth_url,
                    config.admin_username,
                    config.admin_user_domain_name,
                    config.admin_password,
                    config.admin_tenant,
                    config.admin_project_domain_name,
                    timeout=60,
                )

    def initial_sync(self):
        # Return True to indicate initial sync success
        return True

    def enable(self):
        # Called when DC manager thinks this subcloud is good to go.
        self.run_sync_audit()

    def get_db_subcloud_resource(self, rsrc_id):
        try:
            if self.subcloud_id is None:
                self.subcloud_id = Subcloud.get_by_name(
                    self.ctxt, self.subcloud_name
                ).id
            subcloud_rsrc = (
                subcloud_resource.SubcloudResource.get_by_resource_and_subcloud(
                    self.ctxt, rsrc_id, self.subcloud_id
                )
            )  # pylint: disable=E1101
            return subcloud_rsrc
        except exceptions.SubcloudResourceNotFound:
            LOG.info(
                "{} not found in subcloud {} resource table".format(
                    rsrc_id, self.subcloud_id
                ),
                extra=self.log_extra,
            )
        return None

    def persist_db_subcloud_resource(self, db_rsrc_id, subcloud_rsrc_id):
        # This function can be invoked after creating a subcloud resource.
        # Persist the subcloud resource to the DB for later
        #
        # Parameters:
        #   db_rsrc_id: the "id" field of the resource in the DB
        #   subcloud_rsrc_id: the unique identifier of the subcloud resource

        subcloud_rsrc = self.get_db_subcloud_resource(db_rsrc_id)
        if not subcloud_rsrc:
            if self.subcloud_id is None:
                self.subcloud_id = Subcloud.get_by_name(
                    self.ctxt, self.subcloud_name
                ).id
            subcloud_rsrc = subcloud_resource.SubcloudResource(
                self.ctxt,
                subcloud_resource_id=subcloud_rsrc_id,
                resource_id=db_rsrc_id,
                subcloud_id=self.subcloud_id,
            )  # pylint: disable=E1101
            # There is no race condition for creation of
            # subcloud_resource as it is always done from the same thread.
            subcloud_rsrc.create()
        elif subcloud_rsrc.subcloud_resource_id != subcloud_rsrc_id:
            # May be the resource was manually deleted from the subcloud.
            # So, update the dcorch DB with the new resource id from subcloud.
            subcloud_rsrc.subcloud_resource_id = subcloud_rsrc_id
            LOG.info(
                "Updating {}:{} [{}]".format(
                    db_rsrc_id, subcloud_rsrc.subcloud_resource_id, subcloud_rsrc_id
                ),
                extra=self.log_extra,
            )
            subcloud_rsrc.save()
        else:
            LOG.info(
                "subcloud_rsrc {}:{} [{}] is up-to-date".format(
                    db_rsrc_id, subcloud_rsrc.subcloud_resource_id, subcloud_rsrc_id
                ),
                extra=self.log_extra,
            )
        return subcloud_rsrc.subcloud_resource_id

    def sync_resource(self, sync_request):
        rsrc = resource.Resource.get_by_id(self.ctxt, sync_request.orch_job.resource_id)
        # pylint: disable=E1101
        handler = self.sync_handler_map[rsrc.resource_type]
        LOG.info(
            "{} Invoking {} for {} [{}]".format(
                self.engine_id,
                handler.__name__,
                rsrc.resource_type,
                sync_request.orch_job.operation_type,
            ),
            extra=self.log_extra,
        )
        handler(sync_request, rsrc)

    def set_sync_status(self, sync_status, alarmable=True):
        # Only report sync_status when managed
        subcloud_managed = self.is_subcloud_managed()
        if not subcloud_managed:
            LOG.debug(
                "set_sync_status: skip update sync update for unmanaged "
                "subcloud {}".format(self.subcloud_name)
            )
            return

        subcloud_sync = db_api.subcloud_sync_get(
            self.ctxt, self.subcloud_name, self.endpoint_type
        )

        if subcloud_sync.sync_status_report_time:
            delta = timeutils.delta_seconds(
                subcloud_sync.sync_status_report_time, timeutils.utcnow()
            )
            if delta < 3600:
                if subcloud_sync.sync_status_reported == sync_status:
                    LOG.debug(
                        "skip set_sync_status sync_status_reported={}, "
                        "sync_status={}".format(
                            subcloud_sync.sync_status_reported, sync_status
                        ),
                        extra=self.log_extra,
                    )
                    return

        LOG.info(
            "{}: set_sync_status {}, alarmable: {}".format(
                self.subcloud_name, sync_status, alarmable
            ),
            extra=self.log_extra,
        )

        self.dcmanager_state_rpc_client.update_subcloud_endpoint_status(
            self.ctxt,
            subcloud_region=self.subcloud_name,
            endpoint_type=self.endpoint_type,
            sync_status=sync_status,
            alarmable=alarmable,
        )

        db_api.subcloud_sync_update(
            self.ctxt,
            self.subcloud_name,
            self.endpoint_type,
            values={
                "sync_status_reported": sync_status,
                "sync_status_report_time": timeutils.utcnow(),
            },
        )

    def sync(self):
        LOG.debug(
            "{}: starting sync routine".format(self.subcloud_name), extra=self.log_extra
        )
        region_name = self.subcloud_name

        sync_requests = orchrequest.OrchRequestList.get_by_attrs(
            self.ctxt,
            self.endpoint_type,
            target_region_name=region_name,
            states=self.PENDING_SYNC_REQUEST_STATES,
        )

        # Early exit in case there are no pending sync requests
        if not sync_requests:
            LOG.debug(
                "Sync resources done for subcloud - no sync requests",
                extra=self.log_extra,
            )
            self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
            return

        LOG.info(
            "Got {} sync request(s)".format(len(sync_requests)),
            extra=self.log_extra,
        )

        actual_sync_requests = []
        for req in sync_requests:
            # Failed orch requests were taken into consideration when reporting
            # sync status to the dcmanager. They need to be removed from the
            # orch requests list before proceeding.
            if req.state != consts.ORCH_REQUEST_STATE_FAILED:
                actual_sync_requests.append(req)

        if not actual_sync_requests:
            LOG.info(
                "Sync resources done for subcloud - no valid sync requests",
                extra=self.log_extra,
            )
            # We got FAILED requests, set sync_status=out-of-sync
            self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
            return
        elif not self.is_subcloud_enabled():
            LOG.info(
                "Sync resources done for subcloud - subcloud is disabled",
                extra=self.log_extra,
            )
            self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
            return

        # Subcloud is enabled and there are pending sync requests, so
        # we have work to do.
        request_aborted = False
        timeout = eventlet.timeout.Timeout(SYNC_TIMEOUT)
        try:
            for request in actual_sync_requests:
                if db_api.should_stop_subcloud_sync(
                    self.ctxt, self.subcloud_name, self.endpoint_type
                ):
                    # Oops, someone disabled the endpoint while
                    # we were processing work for it.
                    raise exceptions.EndpointNotReachable()
                request.state = consts.ORCH_REQUEST_STATE_IN_PROGRESS
                try:
                    request.save()  # save to DB
                except exceptions.OrchRequestNotFound:
                    # This case is handled in loop below, but should also be
                    # handled here as well.
                    LOG.info(
                        "Orch request already deleted request uuid=%s state=%s"
                        % (request.uuid, request.state),
                        extra=self.log_extra,
                    )
                    continue

                retry_count = 0
                while retry_count < self.MAX_RETRY:
                    try:
                        self.sync_resource(request)
                        # Sync succeeded, mark the request as
                        # completed for tracking/debugging purpose
                        # and tag it for purge when its deleted
                        # time exceeds the data retention period.
                        request.state = consts.ORCH_REQUEST_STATE_COMPLETED
                        request.deleted = 1
                        request.deleted_at = timeutils.utcnow()
                        request.save()
                        break
                    except exceptions.OrchRequestNotFound:
                        LOG.info(
                            "Orch request already deleted request uuid=%s state=%s"
                            % (request.uuid, request.state),
                            extra=self.log_extra,
                        )
                        break
                    except exceptions.SyncRequestTimeout:
                        request.try_count += 1
                        request.save()
                        retry_count += 1
                        if retry_count >= self.MAX_RETRY:
                            raise exceptions.EndpointNotReachable()
                    except exceptions.SyncRequestFailedRetry:
                        LOG.info(
                            "SyncRequestFailedRetry for {}/{}".format(
                                self.subcloud_name, self.endpoint_type
                            ),
                            extra=self.log_extra,
                        )
                        request.try_count += 1
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count += 1

                        # Incremental backoff retry is implemented to define the wait
                        # time between each attempt to retry the sync.
                        # 1st retry: 1s.
                        # 2nd retry: 3s.
                        if retry_count < self.MAX_RETRY:
                            # Only sleep if this is not the last retry
                            sleep_duration = 1 + (retry_count - 1) * 2
                            eventlet.greenthread.sleep(sleep_duration)
                        else:
                            LOG.error(
                                "SyncRequestFailedRetry: max retries reached "
                                "for {}/{}".format(
                                    self.subcloud_name, self.endpoint_type
                                ),
                                extra=self.log_extra,
                            )
                    except exceptions.SyncRequestFailed:
                        LOG.error(
                            "SyncRequestFailed for {}/{}".format(
                                self.subcloud_name, self.endpoint_type
                            ),
                            extra=self.log_extra,
                        )
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count = self.MAX_RETRY
                        request_aborted = True
                    except exceptions.SyncRequestAbortedBySystem:
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count = self.MAX_RETRY
                        request_aborted = True
                    except Exception as e:
                        LOG.error(
                            f"Unexpected error during sync: {e}",
                            extra=self.log_extra,
                        )
                        request.state = consts.ORCH_REQUEST_STATE_FAILED
                        request.save()
                        retry_count = self.MAX_RETRY

                # If we fall out of the retry loop we either succeeded
                # or failed multiple times and want to move to the next
                # request.

        except eventlet.timeout.Timeout:
            # The entire sync operation timed out, covering all sync requests.
            # Just log the exception and continue to check if there are
            # pending requests.
            LOG.exception(
                f"Sync timed out for {self.subcloud_name}/{self.endpoint_type}."
            )

        except exceptions.EndpointNotReachable:
            # Endpoint not reachable, throw away all the sync requests.
            LOG.info(
                "EndpointNotReachable, {} sync requests pending".format(
                    len(actual_sync_requests)
                ),
                extra=self.log_extra,
            )
            # del sync_requests[:] #This fails due to:
            # 'OrchRequestList' object does not support item deletion

        finally:
            timeout.cancel()

        sync_requests = orchrequest.OrchRequestList.get_by_attrs(
            self.ctxt,
            self.endpoint_type,
            target_region_name=region_name,
            states=self.PENDING_SYNC_REQUEST_STATES,
        )

        alarmable = False
        for req in sync_requests:
            # Any failed state should be alarmable
            if req.state == consts.ORCH_REQUEST_STATE_FAILED:
                alarmable = True

            # Do not raise an alarm if all the sync requests are due to
            # a fernet key rotation, as these are expected to occur
            # periodically.
            if req.orch_job.source_resource_id != FERNET_REPO_MASTER_ID:
                alarmable = True

        # If there are pending requests, update the status to out-of-sync.
        if sync_requests:
            # If the request was aborted due to an expired certificate,
            # update the status to 'out-of-sync' and just return so the
            # sync_request is updated to "completed". This way, the sync
            # job won't attempt to retry the sync in the next cycle.
            if request_aborted:
                self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
                LOG.info(
                    "End of resource sync out-of-sync. {} sync request(s): "
                    "request_aborted".format(len(sync_requests)),
                    extra=self.log_extra,
                )
                return
            # Otherwise, e.g. timeout or EndpointNotReachable,
            # update the status and raise an exception to set the sync_request to
            # 'failed', so the sync job will re-attempt the sync in the next
            # sync cycle.
            else:
                self.set_sync_status(
                    dccommon_consts.SYNC_STATUS_OUT_OF_SYNC, alarmable=alarmable
                )
                LOG.info(
                    "End of resource sync out-of-sync. {} sync request(s)".format(
                        len(sync_requests)
                    ),
                    extra=self.log_extra,
                )
                msg = (
                    f"There are {len(sync_requests)} pending requests to sync. "
                    "Will retry in next sync cycle."
                )
                raise Exception(msg)

        else:
            self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
            LOG.info(
                "End of resource sync in-sync. {} sync request(s)".format(
                    len(sync_requests)
                ),
                extra=self.log_extra,
            )

        LOG.info(
            "Sync resources done for subcloud - "
            "synced {} request(s)".format(len(actual_sync_requests)),
            extra=self.log_extra,
        )

    def run_sync_audit(self, engine_id=None):
        if self.endpoint_type in cfg.CONF.disable_audit_endpoints:
            LOG.warn("Audit disabled!", extra=self.log_extra)
            return
        LOG.debug(
            "Engine id={}: sync_audit started".format(engine_id), extra=self.log_extra
        )
        try:
            self.sync_audit(engine_id)
        finally:
            self.post_audit()

    def sync_audit(self, engine_id):
        LOG.debug(
            "Engine id={}: starting sync audit".format(engine_id), extra=self.log_extra
        )

        most_recent_failed_request = (
            orchrequest.OrchRequest.get_most_recent_failed_request(self.ctxt)
        )

        if most_recent_failed_request:
            LOG.debug(
                "Most recent failed request id=%s, timestamp=%s",
                most_recent_failed_request.id,
                most_recent_failed_request.updated_at,
            )
        else:
            LOG.debug("There are no failed requests.")

        total_num_of_audit_jobs = 0

        # TODO(ecandotti): move this behavior to SysinvSyncThread class

        # If the endpoint is of type Platform and the subcloud has dcagent,
        # retrieve all platform resources with a single dcagent call to avoid
        # making separate get_dcagent_resources calls for each resource type.
        if self.endpoint_type == dccommon_consts.ENDPOINT_TYPE_PLATFORM and (
            self.has_dcagent
        ):
            all_master_resources = dict()
            for resource_type in self.audit_resources:
                all_master_resources[resource_type] = self.get_cached_master_resources(
                    resource_type
                )
            platform_resources = self.get_dcagent_resources(
                self.audit_resources, all_master_resources
            )
            if platform_resources is None:
                # If subcloud is not reachable, abort audit.
                return

        for resource_type in self.audit_resources:
            if db_api.should_stop_subcloud_sync(
                self.ctxt, self.subcloud_name, self.endpoint_type
            ):
                LOG.info(
                    "{}: aborting sync audit, as subcloud is disabled".format(
                        threading.currentThread().getName()
                    ),
                    extra=self.log_extra,
                )
                return

            # Skip resources with outstanding sync requests
            region_name = self.subcloud_name
            sync_requests = []
            states = [
                consts.ORCH_REQUEST_QUEUED,
                consts.ORCH_REQUEST_IN_PROGRESS,
            ]
            sync_requests = orchrequest.OrchRequestList.get_by_attrs(
                self.ctxt,
                self.endpoint_type,
                resource_type=resource_type,
                target_region_name=region_name,
                states=states,
            )
            abort_resources = [req.orch_job.source_resource_id for req in sync_requests]
            if len(sync_requests) > 0:
                LOG.info(
                    "Will not audit {}. {} sync request(s) pending".format(
                        abort_resources, len(sync_requests)
                    ),
                    extra=self.log_extra,
                )

            num_of_audit_jobs = 0
            try:
                m_resources, db_resources, sc_resources = self.get_all_resources(
                    resource_type
                )

                if self.endpoint_type == dccommon_consts.ENDPOINT_TYPE_PLATFORM and (
                    self.has_dcagent
                ):
                    sc_resources = platform_resources[resource_type]

                # todo: delete entries in db_resources with no corresponding
                # entry in m_resources?

                if sc_resources is None or m_resources is None:
                    return
                LOG.debug("Audit {}".format(resource_type), extra=self.log_extra)
                LOG.debug(
                    "Auditing {}: master={} db={} sc={}".format(
                        resource_type, m_resources, db_resources, sc_resources
                    ),
                    extra=self.log_extra,
                )
                num_of_audit_jobs += self.audit_find_missing(
                    resource_type,
                    m_resources,
                    db_resources,
                    sc_resources,
                    abort_resources,
                )
                num_of_audit_jobs += self.audit_find_extra(
                    resource_type,
                    m_resources,
                    db_resources,
                    sc_resources,
                    abort_resources,
                )
            except Exception:
                LOG.exception("Unexpected error while auditing %s", resource_type)

            # Extra resources in subcloud are not impacted by the audit.

            if not num_of_audit_jobs:
                LOG.debug(
                    "Clean audit run for {}".format(resource_type), extra=self.log_extra
                )
            else:
                LOG.info(
                    "{} num_of_audit_jobs for {}".format(
                        num_of_audit_jobs, resource_type
                    ),
                    extra=self.log_extra,
                )

            total_num_of_audit_jobs += num_of_audit_jobs

        if most_recent_failed_request:
            # Soft delete all failed requests in the previous sync audit.
            try:
                orchrequest.OrchRequest.delete_previous_failed_requests(
                    self.ctxt, most_recent_failed_request.updated_at
                )
            except Exception:
                # shouldn't get here
                LOG.exception("Unexpected error!")

        if not total_num_of_audit_jobs:
            self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)

        else:
            db_api.subcloud_sync_update(
                self.ctxt,
                self.subcloud_name,
                self.endpoint_type,
                values={"sync_request": consts.SYNC_STATUS_REQUESTED},
            )

        LOG.debug(
            "{}: done sync audit".format(threading.currentThread().getName()),
            extra=self.log_extra,
        )

    def post_audit(self):
        # Some specific SyncThread subclasses may perform post audit actions
        utils.close_session(
            self.sc_admin_session, "audit", f"{self.subcloud_name}/{self.endpoint_type}"
        )

    def audit_find_missing(
        self, resource_type, m_resources, db_resources, sc_resources, abort_resources
    ):
        """Find missing resources in subcloud.

        - Input param db_resources is modified in this routine
          to remove entries that match the resources in
          master cloud. At the end, db_resources will have a
          list of resources that are present in dcorch DB, but
          not present in the master cloud.
        """
        num_of_audit_jobs = 0
        for m_r in m_resources:
            master_id = self.get_resource_id(resource_type, m_r)
            if master_id in abort_resources:
                LOG.info(
                    "audit_find_missing: Aborting audit for {}".format(master_id),
                    extra=self.log_extra,
                )
                num_of_audit_jobs += 1
                # There are pending jobs for this resource, abort audit
                continue

            missing_resource = False
            m_rsrc_db = None
            for db_resource in db_resources:
                if db_resource.master_id == master_id:
                    m_rsrc_db = db_resource
                    db_resources.remove(db_resource)
                    break

            if m_rsrc_db:
                # resource from master cloud is present in DB.

                # Contents of "m_r" may refer to other master cloud resources.
                # Make a copy with the references updated to refer to subcloud
                # resources.
                try:
                    m_r_updated = self.update_resource_refs(resource_type, m_r)
                except exceptions.SubcloudResourceNotFound:
                    # If we couldn't find the equivalent subcloud resources,
                    # we don't know what to look for in the subcloud so skip
                    # this m_r and go to the next one.
                    continue

                # Now, look for subcloud resource in DB.
                # If present: look for actual resource in the
                # subcloud and compare the resource details.
                # If not present: create resource in subcloud.
                db_sc_resource = self.get_db_subcloud_resource(m_rsrc_db.id)
                if db_sc_resource:
                    if not db_sc_resource.is_managed():
                        LOG.info(
                            "Resource {} is not managed".format(master_id),
                            extra=self.log_extra,
                        )
                        continue
                    sc_rsrc_present = False
                    # The subcloud resource will only have "in-sync" or "out-of-sync"
                    # if returned by dcagent. For platform resources, audit_dependants
                    # will always return 0.
                    if self.is_dcagent_managed_resource():
                        sc_rsrc_present = self.is_resource_present_in_subcloud(
                            resource_type, master_id, sc_resources
                        )
                    else:
                        for sc_r in sc_resources:
                            sc_id = self.get_resource_id(resource_type, sc_r)
                            if sc_id == db_sc_resource.subcloud_resource_id:
                                if self.same_resource(resource_type, m_r_updated, sc_r):
                                    LOG.debug(
                                        "Resource type {} {} is in-sync".format(
                                            resource_type, master_id
                                        ),
                                        extra=self.log_extra,
                                    )
                                    num_of_audit_jobs += self.audit_dependants(
                                        resource_type, m_r, sc_r
                                    )
                                    sc_rsrc_present = True
                                    break
                    if not sc_rsrc_present:
                        LOG.info(
                            "Subcloud resource {} found in master cloud & DB, "
                            "but the exact same resource not found in subcloud".format(
                                db_sc_resource.subcloud_resource_id
                            ),
                            extra=self.log_extra,
                        )
                        # Subcloud resource is present in DB, but the check
                        # for same_resource() was negative. Either the resource
                        # disappeared from subcloud or the resource details
                        # are different from that of master cloud. Let the
                        # resource implementation decide on the audit action.
                        missing_resource = self.audit_discrepancy(
                            resource_type, m_r, sc_resources
                        )
                else:
                    LOG.info(
                        "Subcloud res {} not found in DB, will create".format(
                            master_id
                        ),
                        extra=self.log_extra,
                    )
                    # Check and see if there are any subcloud resources that
                    # match the master resource, and if so set up mappings.
                    # This returns true if it finds a match.
                    if self.map_subcloud_resource(
                        resource_type, m_r_updated, m_rsrc_db, sc_resources
                    ):
                        continue
                    missing_resource = True

            else:  # master_resource not in resource DB
                LOG.info(
                    "{} not found in DB, will create it".format(master_id),
                    extra=self.log_extra,
                )
                # Check and see if there are any subcloud resources that
                # match the master resource, and if so set up mappings.
                # This returns true if it finds a match.
                # This is for the case where the resource is not even in dcorch
                # resource DB (ie, resource has not been tracked by dcorch yet)
                if self.map_subcloud_resource(
                    resource_type, m_r, m_rsrc_db, sc_resources
                ):
                    continue
                missing_resource = True

            if missing_resource:
                # Resource is missing from subcloud, take action
                num_of_audit_jobs += self.audit_action(
                    resource_type, AUDIT_RESOURCE_MISSING, m_r
                )

                # As the subcloud resource is missing, invoke
                # the hook for dependants with no subcloud resource.
                # Resource implementation should handle this.
                num_of_audit_jobs += self.audit_dependants(resource_type, m_r, None)
        if num_of_audit_jobs != 0:
            LOG.info(
                "audit_find_missing {} num_of_audit_jobs".format(num_of_audit_jobs),
                extra=self.log_extra,
            )
        return num_of_audit_jobs

    def audit_find_extra(
        self, resource_type, m_resources, db_resources, sc_resources, abort_resources
    ):
        """Find extra resources in subcloud.

        - Input param db_resources is expected to be a
          list of resources that are present in dcorch DB, but
          not present in the master cloud.
        """

        num_of_audit_jobs = 0
        # At this point, db_resources contains resources present in DB,
        # but not in master cloud
        for db_resource in db_resources:
            if db_resource.master_id:
                if db_resource.master_id in abort_resources:
                    LOG.info(
                        "audit_find_extra: Aborting audit for {}".format(
                            db_resource.master_id
                        ),
                        extra=self.log_extra,
                    )
                    num_of_audit_jobs += 1
                    # There are pending jobs for this resource, abort audit
                    continue

                LOG.debug(
                    "Extra resource ({}) in DB".format(db_resource.id),
                    extra=self.log_extra,
                )
                subcloud_rsrc = self.get_db_subcloud_resource(db_resource.id)
                if subcloud_rsrc:
                    if not subcloud_rsrc.is_managed():
                        LOG.info(
                            "Resource {} is not managed".format(
                                subcloud_rsrc.subcloud_resource_id
                            ),
                            extra=self.log_extra,
                        )
                        continue

                    # check if the resource exists in subcloud, no need to
                    # schedule work if it doesn't exist in subcloud.
                    # This is a precautionary action in case the resource
                    # has already be deleted in the subcloud which can happen
                    # for example, user deletes the resource from master right
                    # after an audit (not through api-proxy), then user deletes
                    # that resource manually in the subcloud before the
                    # next audit.
                    if not self.resource_exists_in_subcloud(
                        subcloud_rsrc, sc_resources
                    ):
                        continue

                    LOG.info(
                        "Resource ({}) and subcloud resource ({}) "
                        "not in sync with master cloud".format(
                            db_resource.master_id, subcloud_rsrc.subcloud_resource_id
                        ),
                        extra=self.log_extra,
                    )
                    # There is extra resource in the subcloud, take action.
                    # Note that the resource is in dcorch DB, but not
                    # actually present in the master cloud.
                    num_of_audit_jobs += self.audit_action(
                        resource_type, AUDIT_RESOURCE_EXTRA, db_resource
                    )
                else:
                    # Resource is present in resource table, but not in
                    # subcloud_resource table. We have also established that
                    # the corresponding OpenStack resource is not present in
                    # the master cloud.
                    # There might be another subcloud with "unmanaged"
                    # subcloud resource corresponding to this resource.
                    # So, just ignore this here!
                    pass

        return num_of_audit_jobs

    def schedule_work(
        self,
        endpoint_type,
        resource_type,
        source_resource_id,
        operation_type,
        resource_info=None,
    ):
        LOG.info(
            "Scheduling {} work for {}/{}".format(
                operation_type, resource_type, source_resource_id
            ),
            extra=self.log_extra,
        )
        try:
            subcloud = Subcloud.get_by_name(self.ctxt, self.subcloud_name)
            utils.enqueue_work(
                self.ctxt,
                endpoint_type,
                resource_type,
                source_resource_id,
                operation_type,
                resource_info,
                subcloud=subcloud,
            )
        except Exception as e:
            LOG.info(
                "Exception in schedule_work: {}".format(str(e)), extra=self.log_extra
            )

    def get_resource_id(self, resource_type, resource):
        if hasattr(resource, "master_id"):
            # If resource from DB, return master resource id
            # from master cloud
            return resource.master_id
        else:
            # Else, return id field (by default)
            return resource.id

    @classmethod
    @lockutils.synchronized(AUDIT_LOCK_NAME)
    def reset_master_resources_cache(cls, force_reset: bool = False) -> None:
        """Reset the cached master resources if needed.

        The cached master resources are reset if:
        1) force_reset is True, or
        2) the last reset time is older than CHECK_AUDIT_INTERVAL (300 seconds).

        The cache is not reset if the last reset time is newer than
        CHECK_SYNC_INTERVAL (5 seconds). This is done to prevent unnecessary
        resets of the cache in a multi-threaded environment where
        multiple threads may call reset_master_resources_cache and
        get_cached_master_resources, so we need to ensure that the cache
        is not reset based on old information in between these calls.

        :param force_reset: Boolean flag to force reset the cache.
        """
        last_reset_time = SyncThread.master_resources_dict.get("last_reset_time")
        fresh_cache = (
            last_reset_time
            and timeutils.utcnow() - last_reset_time
            <= datetime.timedelta(seconds=consts.CHECK_SYNC_INTERVAL)
        )
        if fresh_cache:
            LOG.debug("Cache is still fresh, not resetting it.")
            return
        invalid_cache = (
            last_reset_time
            and timeutils.utcnow() - last_reset_time
            > datetime.timedelta(seconds=consts.CHECK_AUDIT_INTERVAL)
        )
        if force_reset or invalid_cache:
            LOG.debug("Reset the cached master resources.")
            SyncThread.master_resources_dict = collections.defaultdict(dict)
            SyncThread.master_resources_dict["last_reset_time"] = timeutils.utcnow()
        else:
            LOG.debug("Cached master resources are still valid, not resetting it.")

    # Audit functions to be overridden in inherited classes
    def get_all_resources(self, resource_type):
        m_resources = None
        db_resources = None
        sc_resources = None
        # Get resources from dcdbsync if the endpoint is not platform or it is
        # but the subcloud doesn't support dcagent. In case it has dcagent,
        # the subcloud resources have already been retrieved for all platform
        # resources previously
        if self.endpoint_type != dccommon_consts.ENDPOINT_TYPE_PLATFORM or not (
            self.has_dcagent
        ):
            sc_resources = self.get_subcloud_resources(resource_type)
            # If subcloud is not reachable, abort audit.
            if sc_resources is None:
                return m_resources, db_resources, sc_resources
        db_resources = self.get_db_master_resources(resource_type)
        m_resources = self.get_cached_master_resources(resource_type)
        return m_resources, db_resources, sc_resources

    @lockutils.synchronized(AUDIT_LOCK_NAME)
    def get_cached_master_resources(self, resource_type):
        if resource_type in SyncThread.master_resources_dict:
            m_resources = SyncThread.master_resources_dict[resource_type]
        else:
            m_resources = self.get_master_resources(resource_type)
            if m_resources is not None:
                SyncThread.master_resources_dict[resource_type] = m_resources
        return m_resources

    def get_subcloud_resources(self, resource_type):
        return None

    def get_db_master_resources(self, resource_type):
        return list(resource.ResourceList.get_all(self.ctxt, resource_type))

    def get_master_resources(self, resource_type):
        return None

    def same_resource(self, resource_type, m_resource, sc_resource):
        return True

    def has_same_ids(self, resource_type, m_resource, sc_resource):
        return False

    def is_dcagent_managed_resource(self):
        return False

    def is_resource_present_in_subcloud(self, resource_type, master_id, sc_resources):
        return False

    def map_subcloud_resource(self, resource_type, m_r, m_rsrc_db, sc_resources):
        # Child classes can override this function to map an existing subcloud
        # resource to an existing master resource. If a mapping is created
        # the function should return True.
        #
        # It is expected that update_resource_refs() has been called on m_r.
        return False

    def update_resource_refs(self, resource_type, m_r):
        # Child classes can override this function to update any references
        # to other master resources embedded within the info of this resource.
        return m_r

    def audit_dependants(self, resource_type, m_resource, sc_resource):
        return 0

    def audit_discrepancy(self, resource_type, m_resource, sc_resources):
        # Return true to try creating the resource again
        return True

    def audit_action(self, resource_type, finding, resource):
        LOG.info(
            "audit_action: {}/{}".format(finding, resource_type), extra=self.log_extra
        )
        # Default actions are create & delete. Can be overridden
        # in resource implementation
        num_of_audit_jobs = 0
        # resource can be either from dcorch DB or fetched by OpenStack query
        resource_id = self.get_resource_id(resource_type, resource)
        if finding == AUDIT_RESOURCE_MISSING:
            # default action is create for a 'missing' resource
            self.schedule_work(
                self.endpoint_type,
                resource_type,
                resource_id,
                consts.OPERATION_TYPE_CREATE,
                self.get_resource_info(
                    resource_type, resource, consts.OPERATION_TYPE_CREATE
                ),
            )
            num_of_audit_jobs += 1
        elif finding == AUDIT_RESOURCE_EXTRA:
            # default action is delete for an 'extra_resource'
            # resource passed in is db_resource (resource in dcorch DB)
            self.schedule_work(
                self.endpoint_type,
                resource_type,
                resource_id,
                consts.OPERATION_TYPE_DELETE,
            )
            num_of_audit_jobs += 1
        return num_of_audit_jobs

    def get_resource_info(self, resource_type, resource, operation_type=None):
        return ""

    # check if the subcloud resource (from dcorch subcloud_resource table)
    # exists in subcloud resources.
    def resource_exists_in_subcloud(self, subcloud_rsrc, sc_resources):
        return True