DC: ensure subcloud is online in watch event

As a subcloud is being bootstrapped, kubernetes secrets
are created before the subcloud is online.

As a result, the DCIntermediateCertRenew watch fires
well before the subcloud is available. Cert-mon attempts
to audit the subcloud as a result of the watch event,
and fails, causing the subcloud to go into a reattempt
loop. There is also some sort of issue with the keystone
token, causing 401 unauthorized errors when the subcloud
eventually does come online.

This behaviour is completely avoided if we simply
check that the subcloud is online before processing
the watch event. If the subcloud is offline, the event
is ignored - the subcloud will be properly audited
when cert-mon receives the subcloud online event
from dcmanager.

Story: 2008960
Task: 42969
Change-Id: I75751e24ae233c53bd00734a7d73d517868c87e8
Signed-off-by: Kyle MacLeod <kyle.macleod@windriver.com>
This commit is contained in:
Kyle MacLeod 2021-08-10 15:47:28 -04:00
parent 1df668d5c6
commit 2f1b1bcfb0
3 changed files with 32 additions and 3 deletions

View File

@ -121,6 +121,11 @@ class CertificateMonManager(periodic_task.PeriodicTasks):
len(self.subclouds_to_audit) - num_pause_tasks,
num_pause_tasks))
if not utils.is_subcloud_online(subcloud_name):
LOG.info("Subcloud is not online, aborting audit: %s" % subcloud_name)
self.subclouds_to_audit.pop(0)
return
try:
subcloud_sysinv_url = utils.dc_get_subcloud_sysinv_url(subcloud_name)
sc_ssl_cert = utils.get_endpoint_certificate(subcloud_sysinv_url)

View File

@ -275,6 +275,17 @@ def get_subclouds_from_dcmanager(token):
return load_subclouds(resp)
def is_subcloud_online(subcloud_name, token=None):
    """Report whether the named subcloud is currently online.

    :param subcloud_name: name of the subcloud to look up.
    :param token: token passed to get_subcloud(); when falsy, a token
        is obtained from get_token() instead.
    :returns: True when the record returned by get_subcloud() has an
        'availability-status' equal to AVAILABILITY_ONLINE. False
        otherwise, including when no record comes back (that case is
        logged as an error).
    """
    lookup_token = token if token else get_token()
    record = get_subcloud(lookup_token, subcloud_name)
    if record:
        return record['availability-status'] == AVAILABILITY_ONLINE
    # No usable record for this subcloud: report it and treat as offline.
    LOG.error('Cannot find subcloud %s' % subcloud_name)
    return False
def update_subcloud_status(token, subcloud_name, status):
service_name = 'dcmanager'
api_url = dc_get_service_endpoint_url(constants.SYSTEM_CONTROLLER_REGION,

View File

@ -413,7 +413,7 @@ class AdminEndpointRenew(CertificateRenew):
role = self.context.dc_role
utils.update_admin_ep_cert(token, event_data.ca_crt, event_data.tls_crt,
event_data.tls_key)
event_data.tls_key)
# In subclouds, it was observed that sometimes old ICA was used
# to sign adminep-cert. Here we run a verification to confirm that
@ -431,6 +431,19 @@ class DCIntermediateCertRenew(CertificateRenew):
def check_filter(self, event_data):
    """Decide whether a secret watch event should be processed.

    The event is accepted only when all of the following hold:
      * self.secret_pattern matches event_data.secret_name at an
        offset greater than zero;
      * utils.is_subcloud_online() reports the subcloud as online;
      * self.certificate_is_ready(event_data) is truthy.

    :param event_data: watch event; its secret_name is matched here and
        the whole object is passed on to the helper methods.
    :returns: False when the name does not match, the subcloud is not
        online, or the availability check raises; otherwise the result
        of self.certificate_is_ready(event_data).
    """
    match = self.secret_pattern.search(event_data.secret_name)
    if not (match and match.start() > 0):
        return False

    # Watch events can fire for secrets before the subcloud first comes
    # online, so confirm availability before accepting the event.
    subcloud = self._get_subcloud_name(event_data)
    try:
        online = utils.is_subcloud_online(subcloud,
                                          token=self.context.get_token())
        if not online:
            LOG.info('%s check_filter[%s]: subcloud is not online' %
                     (self.__class__.__name__, subcloud))
            return False
    except Exception:
        # Any failure while checking availability rejects the event.
        LOG.exception('Failed to check subcloud availability: %s'
                      % subcloud)
        return False

    return self.certificate_is_ready(event_data)
@ -441,7 +454,7 @@ class DCIntermediateCertRenew(CertificateRenew):
def update_certificate(self, event_data):
subcloud_name = self._get_subcloud_name(event_data)
LOG.info('subcloud %s %s' % (subcloud_name, event_data))
LOG.info('update_certificate: subcloud %s %s' % (subcloud_name, event_data))
token = self.context.get_dc_token(subcloud_name)
subcloud_sysinv_url = utils.dc_get_subcloud_sysinv_url(subcloud_name)
@ -586,7 +599,7 @@ class PlatformCertRenew(CertificateRenew):
def check_filter(self, event_data):
LOG.debug('%s: Received event_data %s' % (self.secret_name, event_data))
if self.secret_name == event_data.secret_name:
LOG.info('%s check_filter[%s]: proceed on event_data: %s'
LOG.info('%s check_filter[%s], proceed on event_data: %s'
% (self.__class__.__name__, self.secret_name, event_data))
return self.certificate_is_ready(event_data)
else: