# (file-metadata artifact from extraction: 306 lines, 12 KiB, Python)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# The right to copy, distribute, modify, or otherwise make use
# of this software may be licensed only pursuant to the terms
# of an applicable Wind River license agreement.
#
from eventlet import greenthread
import greenlet
from oslo_config import cfg
from oslo_log import log
from oslo_serialization import base64
from oslo_service import periodic_task
import time

from sysinv.cert_mon import watcher
from sysinv.cert_mon import utils
from sysinv.common import constants
from sysinv.common import utils as cutils

LOG = log.getLogger(__name__)

# Marker value pushed onto the audit queue to space audits apart
# (see CertificateMonManager.requeue_audit / audit_sc_cert_task).
TASK_NAME_PAUSE_AUDIT = 'pause'

# Certificate-monitor configuration, registered under the [certmon] group.
cert_mon_opts = [
    cfg.IntOpt('audit_interval',
               default=86400,  # 24 hours
               help='Interval to run certificate audit'),
    cfg.IntOpt('retry_interval',
               default=10 * 60,  # retry every 10 minutes
               help='interval to reattempt accessing external system '
                    'if failure occurred'),
    cfg.IntOpt('max_retry',
               default=14,  # retry 14 times to give at least 2 hours to recover
               help='interval to reattempt accessing external system '
                    'if failure occurred'),
]

CONF = cfg.CONF
CONF.register_opts(cert_mon_opts, 'certmon')
class CertificateMonManager(periodic_task.PeriodicTasks):
    """Manage certificate watchers and the subcloud certificate audit.

    Spawns greenthreads that run the certificate watchers
    (``monitor_cert``) and the periodic audit loop (``audit_cert``),
    and maintains the queues used by the audit and retry tasks.
    """

    def __init__(self):
        super(CertificateMonManager, self).__init__(CONF)
        # Greenthreads running monitor_cert(), one per watcher.
        self.mon_threads = []
        # Greenthread running audit_cert(); None until start_audit().
        self.audit_thread = None
        # Watcher instances, created by the init_*_monitor() methods.
        self.dc_monitor = None
        self.restapicert_monitor = None
        self.registrycert_monitor = None
        # Failed watcher tasks queued for reattempt (see retry_task).
        self.reattempt_tasks = []
        # FIFO of subcloud names to audit, interleaved with
        # TASK_NAME_PAUSE_AUDIT markers that delay processing.
        self.subclouds_to_audit = []
def periodic_tasks(self, context, raise_on_error=False):
|
|
"""Tasks to be run at a periodic interval."""
|
|
return self.run_periodic_tasks(context, raise_on_error=raise_on_error)
|
|
|
|
@periodic_task.periodic_task(spacing=CONF.certmon.audit_interval)
|
|
def audit_sc_cert_start(self, context):
|
|
"""Kicks an audit of all subclouds.
|
|
This task runs every very long period of time, such as 24 hours.
|
|
"""
|
|
# auditing subcloud certificate
|
|
dc_role = utils.get_dc_role()
|
|
if dc_role != constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
|
|
# Do nothing if it is not systemcontroller
|
|
return
|
|
|
|
self.subclouds_to_audit = utils.get_subclouds()[:]
|
|
LOG.info("Periodic: begin subcloud certificate audit: %d subclouds" % len(self.subclouds_to_audit))
|
|
|
|
def on_start_audit(self):
|
|
"""
|
|
On service start audit
|
|
Audit all subclouds that are out-of-sync
|
|
"""
|
|
dc_role = utils.get_dc_role()
|
|
if dc_role != constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER:
|
|
# Do nothing if it is not systemcontroller
|
|
return
|
|
|
|
LOG.info("Service start: begin subcloud certificate audit")
|
|
number_of_sc_to_audit = 0
|
|
token = utils.get_token()
|
|
subclouds = utils.get_subclouds_from_dcmanager(token)
|
|
for sc in subclouds:
|
|
if sc[utils.ENDPOINT_TYPE_DC_CERT] == utils.SYNC_STATUS_OUT_OF_SYNC:
|
|
self.subclouds_to_audit.append(sc['name'])
|
|
LOG.info('%s is out-of-sync, adding it to audit.' % sc['name'])
|
|
number_of_sc_to_audit = number_of_sc_to_audit + 1
|
|
|
|
if number_of_sc_to_audit > 0:
|
|
LOG.info('%d subcloud(s) found out-of-sync to be audited' %
|
|
number_of_sc_to_audit)
|
|
else:
|
|
LOG.info('All subclouds are in-sync. No startup audit is required')
|
|
|
|
@periodic_task.periodic_task(spacing=5)
|
|
def audit_sc_cert_task(self, context):
|
|
if len(self.subclouds_to_audit) > 0:
|
|
subcloud_name = self.subclouds_to_audit[0]
|
|
if subcloud_name == TASK_NAME_PAUSE_AUDIT:
|
|
LOG.info('Pause audit for ongoing update to complete')
|
|
self.subclouds_to_audit.pop(0)
|
|
return
|
|
|
|
num_pause_tasks = self.subclouds_to_audit.count(TASK_NAME_PAUSE_AUDIT)
|
|
LOG.info('Auditing subcloud %s [#subclouds: %d #pause: %d]'
|
|
% (subcloud_name,
|
|
len(self.subclouds_to_audit) - num_pause_tasks,
|
|
num_pause_tasks))
|
|
|
|
if not utils.is_subcloud_online(subcloud_name):
|
|
LOG.info("Subcloud is not online, aborting audit: %s" % subcloud_name)
|
|
self.subclouds_to_audit.pop(0)
|
|
return
|
|
|
|
try:
|
|
subcloud_sysinv_url = utils.dc_get_subcloud_sysinv_url(subcloud_name)
|
|
sc_ssl_cert = utils.get_endpoint_certificate(subcloud_sysinv_url)
|
|
|
|
secret = utils.get_sc_intermediate_ca_secret(subcloud_name)
|
|
check_list = ['ca.crt', 'tls.crt', 'tls.key']
|
|
for item in check_list:
|
|
if item not in secret.data:
|
|
raise Exception('%s certificate data missing: %s'
|
|
% (subcloud_name, item))
|
|
|
|
txt_ssl_cert = base64.decode_as_text(secret.data['tls.crt'])
|
|
txt_ssl_key = base64.decode_as_text(secret.data['tls.key'])
|
|
txt_ca_cert = base64.decode_as_text(secret.data['ca.crt'])
|
|
except Exception as e:
|
|
LOG.error('Cannot audit ssl certificate on %s' % subcloud_name)
|
|
LOG.exception(e)
|
|
# certificate is not ready, no reaudit. Will be picked up
|
|
# by certificate MODIFIED event if it comes back
|
|
self.subclouds_to_audit.pop(0)
|
|
return
|
|
|
|
cert_chain = txt_ssl_cert + txt_ca_cert
|
|
dc_token = utils.get_dc_token(subcloud_name)
|
|
if not cutils.verify_intermediate_ca_cert(cert_chain, sc_ssl_cert):
|
|
# The subcloud needs renewal.
|
|
LOG.info('Updating {} intermediate CA as it is out-of-sync'.format(subcloud_name))
|
|
# move the subcloud to the end of the queue for reauditing
|
|
self.requeue_audit(subcloud_name)
|
|
|
|
utils.update_subcloud_status(dc_token, subcloud_name,
|
|
utils.SYNC_STATUS_OUT_OF_SYNC)
|
|
try:
|
|
utils.update_subcloud_ca_cert(dc_token,
|
|
subcloud_name,
|
|
subcloud_sysinv_url,
|
|
txt_ca_cert,
|
|
txt_ssl_cert,
|
|
txt_ssl_key)
|
|
except Exception:
|
|
LOG.exception('Failed to update intermediate CA on %s' % subcloud_name)
|
|
else:
|
|
LOG.info('%s intermediate CA cert is in-sync' % subcloud_name)
|
|
utils.update_subcloud_status(dc_token, subcloud_name,
|
|
utils.SYNC_STATUS_IN_SYNC)
|
|
|
|
self.subclouds_to_audit.remove(subcloud_name)
|
|
|
|
@periodic_task.periodic_task(spacing=CONF.certmon.retry_interval)
|
|
def retry_task(self, context):
|
|
# Failed tasks that need to be reattempted will be taken care here
|
|
max_attempts = CONF.certmon.max_retry
|
|
tasks = self.reattempt_tasks[:]
|
|
|
|
num_tasks = len(tasks)
|
|
if num_tasks > 0:
|
|
LOG.info('Starting retry_task: #tasks in reattempt queue: %s' % num_tasks)
|
|
|
|
for task in tasks:
|
|
if task.run():
|
|
self.reattempt_tasks.remove(task)
|
|
LOG.info('Reattempt has succeeded')
|
|
elif task.number_of_reattempt >= max_attempts:
|
|
LOG.error('Maximum attempts (%s) has been reached. Give up' %
|
|
max_attempts)
|
|
if task in self.reattempt_tasks:
|
|
self.reattempt_tasks.remove(task)
|
|
|
|
# task has failed
|
|
task.failed()
|
|
|
|
def start_audit(self):
|
|
LOG.info('Auditing interval %s' % CONF.certmon.audit_interval)
|
|
utils.init_keystone_auth_opts()
|
|
self.audit_thread = greenthread.spawn(self.audit_cert)
|
|
self.on_start_audit()
|
|
|
|
def init_dc_monitor(self):
|
|
self.dc_monitor = watcher.DC_CertWatcher()
|
|
self.dc_monitor.initialize(
|
|
audit_subcloud=lambda subcloud_name: self.requeue_audit(subcloud_name))
|
|
|
|
    def init_restapicert_monitor(self):
        """Create and initialize the REST API certificate watcher."""
        self.restapicert_monitor = watcher.RestApiCert_CertWatcher()
        self.restapicert_monitor.initialize()
    def init_registrycert_monitor(self):
        """Create and initialize the docker registry certificate watcher."""
        self.registrycert_monitor = watcher.RegistryCert_CertWatcher()
        self.registrycert_monitor.initialize()
def start_monitor(self):
|
|
utils.init_keystone_auth_opts()
|
|
dc_role = utils.get_dc_role()
|
|
while True:
|
|
try:
|
|
# init platform cert monitors
|
|
self.init_restapicert_monitor()
|
|
self.init_registrycert_monitor()
|
|
|
|
# init dc monitor only if running in DC role
|
|
if (dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER or
|
|
dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SUBCLOUD):
|
|
self.init_dc_monitor()
|
|
except Exception as e:
|
|
LOG.exception(e)
|
|
time.sleep(5)
|
|
else:
|
|
break
|
|
|
|
# spawn threads (DC thread spawned only if running in DC role)
|
|
self.mon_threads.append(greenthread.spawn(self.monitor_cert, self.restapicert_monitor))
|
|
self.mon_threads.append(greenthread.spawn(self.monitor_cert, self.registrycert_monitor))
|
|
|
|
if (dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SYSTEMCONTROLLER or
|
|
dc_role == constants.DISTRIBUTED_CLOUD_ROLE_SUBCLOUD):
|
|
self.mon_threads.append(greenthread.spawn(self.monitor_cert, self.dc_monitor))
|
|
|
|
def stop_monitor(self):
|
|
for mon_thread in self.mon_threads:
|
|
mon_thread.kill()
|
|
mon_thread.wait()
|
|
|
|
def stop_audit(self):
|
|
if self.audit_thread:
|
|
self.audit_thread.kill()
|
|
self.audit_thread.wait()
|
|
|
|
def audit_cert(self):
|
|
while True:
|
|
try:
|
|
self.run_periodic_tasks(context=None)
|
|
time.sleep(1)
|
|
except greenlet.GreenletExit:
|
|
break
|
|
except Exception as e:
|
|
LOG.exception(e)
|
|
|
|
def monitor_cert(self, monitor):
|
|
while True:
|
|
# never exit until exit signal received
|
|
try:
|
|
monitor.start_watch(
|
|
on_success=lambda task_id: self._purge_reattempt_task(task_id, 'on success'),
|
|
on_error=lambda task: self._add_reattempt_task(task),
|
|
)
|
|
except greenlet.GreenletExit:
|
|
break
|
|
except Exception as e:
|
|
# A bug somewhere?
|
|
# It shouldn't fall to here, but log and restart if it did
|
|
LOG.exception(e)
|
|
|
|
def _add_reattempt_task(self, task):
|
|
id = task.get_id()
|
|
self._purge_reattempt_task(id, 'for new reattempt')
|
|
self.reattempt_tasks.append(task)
|
|
|
|
def _purge_reattempt_task(self, id, reason_msg):
|
|
for t in self.reattempt_tasks:
|
|
if t.get_id() == id:
|
|
self.reattempt_tasks.remove(t)
|
|
LOG.info('Purging reattempt task %s: %s' % (reason_msg, id))
|
|
break
|
|
|
|
def requeue_audit(self, subcloud_name):
|
|
# move the subcloud to the end of the queue for auditing
|
|
# adding enough spaces so that the renewal would complete by
|
|
# next audit
|
|
if subcloud_name in self.subclouds_to_audit:
|
|
self.subclouds_to_audit.remove(subcloud_name)
|
|
for i in range(12, self.subclouds_to_audit.count(TASK_NAME_PAUSE_AUDIT), -1):
|
|
self.subclouds_to_audit.append(TASK_NAME_PAUSE_AUDIT)
|
|
self.subclouds_to_audit.append(subcloud_name)
|
|
|
|
def audit_subcloud(self, subcloud_name):
|
|
if subcloud_name not in self.subclouds_to_audit:
|
|
self.subclouds_to_audit.append(subcloud_name)
|