Merge "Filter cert-mon for geo-redundancy in audit and DC_CertWatcher"

This commit is contained in:
Zuul 2024-04-04 21:54:21 +00:00 committed by Gerrit Code Review
commit 8ea80c4b27
3 changed files with 102 additions and 39 deletions

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -36,12 +36,13 @@ TASK_NAME_PAUSE_AUDIT = 'pause'
INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES = [
    # Secondary subclouds should not be audited as they are expected
    # to be managed by a peer system controller (geo-redundancy feat.)
    # Each deploy state is listed exactly once, in alphabetical order.
    'create-complete',
    'pre-rehome',
    'rehome-failed',
    'rehome-pending',
    'rehoming',
    'secondary',
    'secondary-failed',
]
cert_mon_opts = [
@ -118,12 +119,19 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
# Do nothing if it is not systemcontroller
return
all_subclouds = utils.get_subclouds()[:]
all_subclouds = utils.get_subclouds_from_dcmanager(
self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
)
LOG.info("Periodic: begin subcloud certificate audit: %d subclouds"
% len(all_subclouds))
for subcloud_name in all_subclouds:
self.sc_audit_queue.enqueue(
subcloud_audit_queue.SubcloudAuditData(subcloud_name))
for sc in all_subclouds:
try:
self.sc_audit_queue.enqueue(
subcloud_audit_queue.SubcloudAuditData(sc['name']))
except subcloud_audit_queue.SubcloudAuditException as exc:
# Log as warn because we can see this if the watch has fired
# near the same time as we are auditing the subcloud
LOG.warn("Failed to enqueue subcloud audit: %s", str(exc))
def on_start_audit(self):
"""
@ -136,14 +144,18 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
return
if CONF.certmon.startup_audit_all:
LOG.info("Service start: audit all subclouds")
LOG.info("Service start startup_audit_all: audit all subclouds")
self.audit_sc_cert_start(None)
return
LOG.info("Service start: begin subcloud certificate audit [batch: %s]"
% CONF.certmon.audit_batch_size)
all_subclouds = utils.get_subclouds_from_dcmanager(
self.token_cache.get_token())
self.token_cache.get_token(), INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES
)
LOG.info(
"Service start: begin subcloud certificate audit [#sc: %d, batch: %s]"
% (len(all_subclouds), CONF.certmon.audit_batch_size)
)
for subcloud in all_subclouds:
if subcloud[utils.ENDPOINT_TYPE_DC_CERT] != utils.SYNC_STATUS_IN_SYNC:
subcloud_name = subcloud['name']
@ -352,7 +364,8 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
self.dc_monitor = watcher.DC_CertWatcher()
self.dc_monitor.initialize(
audit_subcloud=lambda subcloud_name:
self.audit_subcloud(subcloud_name, allow_requeue=True))
self.audit_subcloud(subcloud_name, allow_requeue=True),
invalid_deploy_states=INVALID_SUBCLOUD_AUDIT_DEPLOY_STATES)
def init_restapicert_monitor(self):
self.restapicert_monitor = watcher.RestApiCert_CertWatcher()

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2023 Wind River Systems, Inc.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -204,28 +204,31 @@ def get_subcloud(token, subcloud_name):
return resp
def load_subclouds(resp, invalid_deploy_states=None):
    """Flatten a subcloud listing into a list of summary dicts.

    :param resp: decoded response body; its "subclouds" entry is iterated.
    :param invalid_deploy_states: optional collection of "deploy-status"
        values. Subclouds whose deploy status is in this collection are
        left out. ``None`` or an empty collection disables the filter.
    :returns: one dict per retained subcloud holding "name", "region-name",
        "management-state", "availability-status" and "sync_status", plus
        one key per endpoint type mapped to that endpoint's sync status.
    """
    sc_list = []
    for obj in resp["subclouds"]:
        # Skip filtered subclouds before copying any of their fields.
        if invalid_deploy_states and obj["deploy-status"] in invalid_deploy_states:
            continue
        sc = {
            "name": obj["name"],
            "region-name": obj["region-name"],
            "management-state": obj["management-state"],
            "availability-status": obj["availability-status"],
            "sync_status": obj["sync_status"],
        }
        # Promote each endpoint's sync status to a top-level key so callers
        # can look it up directly by endpoint type.
        for ss in obj["endpoint_sync_status"]:
            sc[ss["endpoint_type"]] = ss["sync_status"]
        sc_list.append(sc)
    return sc_list
def get_subclouds_from_dcmanager(token, invalid_deploy_states=None):
    """Fetch the subcloud list and return it in summarized form.

    :param token: token passed to the endpoint lookup and to the REST call.
    :param invalid_deploy_states: optional collection of deploy states,
        forwarded unchanged to load_subclouds() for filtering.
    :returns: whatever load_subclouds() builds from the GET response.
    """
    api_url = dc_get_service_endpoint_url(token)
    api_cmd = api_url + '/subclouds'
    # Lazy log arguments: the message is only formatted if debug is enabled.
    LOG.debug('api_cmd %s', api_cmd)
    resp = rest_api_request(token, "GET", api_cmd)
    return load_subclouds(resp, invalid_deploy_states)
def is_subcloud_online(subcloud_name, token=None):
@ -239,6 +242,33 @@ def is_subcloud_online(subcloud_name, token=None):
return subcloud_info['availability-status'] == AVAILABILITY_ONLINE
def query_subcloud_online_with_deploy_state(
    subcloud_name, invalid_deploy_states=None, token=None
):
    """Check if subcloud is online and not in an invalid deploy state.

    :param subcloud_name: name handed to get_subcloud() for the lookup.
    :param invalid_deploy_states: optional collection of "deploy-status"
        values that disqualify the subcloud. ``None`` or an empty
        collection means only the availability is checked.
    :param token: token for the lookup; get_token() supplies one when this
        is falsy.
    :returns: tuple ``(valid, availability_status, deploy_status)``.
        ``valid`` is True only when the deploy status is not excluded and
        the availability equals AVAILABILITY_ONLINE. When the lookup
        returns nothing the result is ``(False, None, None)``.
    """
    if not token:
        token = get_token()
    subcloud_info = get_subcloud(token, subcloud_name)
    if not subcloud_info:
        LOG.error("Cannot find subcloud %s", subcloud_name)
        return False, None, None

    availability_status = subcloud_info["availability-status"]
    deploy_status = subcloud_info["deploy-status"]

    # An excluded deploy state overrides availability: such a subcloud is
    # reported as not valid even when it is online.
    if invalid_deploy_states and deploy_status in invalid_deploy_states:
        subcloud_valid_state = False
    else:
        subcloud_valid_state = availability_status == AVAILABILITY_ONLINE

    return subcloud_valid_state, availability_status, deploy_status
def update_subcloud_status(token, subcloud_name, status):
api_url = dc_get_service_endpoint_url(token)
api_cmd = api_url + '/subclouds/%s/update_status' % subcloud_name

View File

@ -1,4 +1,4 @@
# Copyright (c) 2020-2022 Wind River Systems, Inc.
# Copyright (c) 2020-2024 Wind River Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -375,7 +375,7 @@ class DC_CertWatcher(CertWatcher):
# Constructor: only runs the CertWatcher base initializer. Listener
# registration is done separately in initialize().
def __init__(self):
super(DC_CertWatcher, self).__init__()
def initialize(self, audit_subcloud):
def initialize(self, audit_subcloud, invalid_deploy_states):
self.context.initialize()
dc_role = self.context.dc_role
LOG.info('DC role: %s' % dc_role)
@ -390,7 +390,11 @@ class DC_CertWatcher(CertWatcher):
self.context.kubernete_namespace = ns
self.register_listener(AdminEndpointRenew(self.context))
if dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
self.register_listener(DCIntermediateCertRenew(self.context, audit_subcloud))
self.register_listener(
DCIntermediateCertRenew(
self.context, audit_subcloud, invalid_deploy_states
)
)
self.register_listener(RootCARenew(self.context))
@ -520,26 +524,42 @@ class AdminEndpointRenew(CertificateRenew):
class DCIntermediateCertRenew(CertificateRenew):
def __init__(self, context, audit_subcloud, invalid_deploy_states=None):
    """Set up the listener.

    :param context: passed through to the CertificateRenew initializer.
    :param audit_subcloud: stored on the instance as ``audit_subcloud``.
    :param invalid_deploy_states: optional collection of deploy states,
        stored as ``invalid_deploy_states`` and used by check_filter() to
        ignore subclouds. Defaults to ``None`` so the earlier two-argument
        call form keeps working.
    """
    super(DCIntermediateCertRenew, self).__init__(context)
    self.invalid_deploy_states = invalid_deploy_states
    # check_filter() additionally requires the match to start after
    # position 0, i.e. the secret name must have a non-empty prefix.
    self.secret_pattern = re.compile('-adminep-ca-certificate$')
    self.audit_subcloud = audit_subcloud
def check_filter(self, event_data):
m = self.secret_pattern.search(event_data.secret_name)
if m and m.start() > 0:
# Ensure subcloud is online (watch events can fire
# for secrets before the subcloud first comes online)
search_result = self.secret_pattern.search(event_data.secret_name)
if search_result and search_result.start() > 0:
# Ensure subcloud is in a valid deploy-status and online (watch
# events can fire for secrets before the subcloud first comes online)
subcloud_name = self._get_subcloud_name(event_data)
try:
if not utils.is_subcloud_online(subcloud_name,
token=self.context.get_token()):
LOG.info('%s check_filter[%s]: subcloud is not online' %
(self.__class__.__name__, subcloud_name))
(
subcloud_valid_state,
availability_status,
deploy_status,
) = utils.query_subcloud_online_with_deploy_state(
subcloud_name,
invalid_deploy_states=self.invalid_deploy_states,
token=self.context.get_token(),
)
if not subcloud_valid_state:
LOG.info(
"%s check_filter: subcloud %s is ignored, "
"availability=%s, deploy_status: %s",
self.__class__.__name__,
subcloud_name,
availability_status,
deploy_status,
)
return False
except Exception:
LOG.exception('Failed to check subcloud availability: %s'
% subcloud_name)
LOG.exception(
"Failed to check subcloud availability: %s" % subcloud_name
)
return False
return self.certificate_is_ready(event_data)
else: