Stop fernet key rotations from raising out-of-sync alarms
Fernet key rotation is expected to occur periodically. Currently, the 280.002 out-of-sync alarm is raised every time the sync thread receives a fernet key rotation request. This commit makes the sync thread check the type of the requests, setting the alarmable parameter to False if all the requests are due to a fernet key rotation.

It also improves the sync function so it doesn't unnecessarily call the is_subcloud_enabled() function, by providing an early exit when there are no pending sync requests.

Test Plan:
1. PASS - Verify that the out-of-sync alarm is not raised when the fernet keys are rotated.
2. PASS - Check that the initial sync still works as expected.
3. PASS - Verify that identity sync triggered by user changes to identity resources on the central cloud works as expected.
4. PASS - Check that platform resource sync triggered by user changes to platform resources on the central cloud works as expected.
5. PASS - Trigger a fernet key sync and, at the same time, a different sync request, and verify that the alarm gets raised.

Closes-Bug: #2002171
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
Change-Id: I694d6c3791222739921cd0f5141f54791f847414
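In essence, the change derives an alarmable flag from the pending requests before the out-of-sync status is reported. The minimal sketch below condenses that logic for readers who do not want to trace the full diff; it is an illustration rather than the literal patch, the constant values are assumed placeholders, and the real code takes FERNET_REPO_MASTER_ID from dcorch.engine.fernet_key_manager and the request states from dcorch.common.consts.

# Condensed sketch of the alarm decision added to SyncThread.sync().
# The constant values below are placeholders; the real module imports them.
FERNET_REPO_MASTER_ID = "fernet_repo"   # placeholder for the fernet master resource id
ORCH_REQUEST_STATE_FAILED = "failed"    # placeholder for consts.ORCH_REQUEST_STATE_FAILED

def is_alarmable(sync_requests):
    """Return True unless every pending request is a fernet key rotation."""
    alarmable = False
    for req in sync_requests:
        if req.state == ORCH_REQUEST_STATE_FAILED:
            # Any failed request should still raise the 280.002 alarm.
            alarmable = True
        if req.orch_job.source_resource_id != FERNET_REPO_MASTER_ID:
            # Anything other than a fernet key rotation is alarmable.
            alarmable = True
    return alarmable

# The flag is then forwarded when reporting status, e.g.
#   self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
#                        alarmable=alarmable)
# and propagated over RPC via the new alarmable parameter on
# update_subcloud_endpoint_status().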
parent 4b05f5dcc0
commit 18f54a44dd
@@ -1,4 +1,4 @@
# Copyright (c) 2017-2022 Wind River Systems, Inc.
# Copyright (c) 2017-2023 Wind River Systems, Inc.
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
@@ -84,25 +84,29 @@ class SubcloudStateClient(RPCClient):
def update_subcloud_endpoint_status(self, ctxt, subcloud_name=None,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
ignore_endpoints=None):
ignore_endpoints=None,
alarmable=True):
# Note: This is an asynchronous operation.
# See below for synchronous method call
return self.cast(ctxt, self.make_msg('update_subcloud_endpoint_status',
subcloud_name=subcloud_name,
endpoint_type=endpoint_type,
sync_status=sync_status,
ignore_endpoints=ignore_endpoints))
ignore_endpoints=ignore_endpoints,
alarmable=alarmable))

def update_subcloud_endpoint_status_sync(self, ctxt, subcloud_name=None,
endpoint_type=None,
sync_status=dccommon_consts.SYNC_STATUS_OUT_OF_SYNC,
ignore_endpoints=None):
ignore_endpoints=None,
alarmable=True):
# Note: synchronous
return self.call(ctxt, self.make_msg('update_subcloud_endpoint_status',
subcloud_name=subcloud_name,
endpoint_type=endpoint_type,
sync_status=sync_status,
ignore_endpoints=ignore_endpoints))
ignore_endpoints=ignore_endpoints,
alarmable=alarmable))


class ManagerClient(RPCClient):

@@ -1,4 +1,4 @@
# Copyright 2017-2022 Wind River
# Copyright 2017-2023 Wind River
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@ from dcorch.common import context
from dcorch.common import exceptions
from dcorch.common import utils
from dcorch.db import api as db_api
from dcorch.engine.fernet_key_manager import FERNET_REPO_MASTER_ID
from dcorch.objects import orchrequest
from dcorch.objects import resource
from dcorch.objects.subcloud import Subcloud
@@ -64,6 +65,12 @@ class SyncThread(object):
"""Manages tasks related to resource management."""

MAX_RETRY = 3
PENDING_SYNC_REQUEST_STATES = (
consts.ORCH_REQUEST_QUEUED,
consts.ORCH_REQUEST_IN_PROGRESS,
consts.ORCH_REQUEST_FAILED,
)

# used by the audit to cache the master resources
master_resources_dict = collections.defaultdict(dict)

@@ -261,7 +268,7 @@ class SyncThread(object):
sync_request.orch_job.operation_type), extra=self.log_extra)
handler(sync_request, rsrc)

def set_sync_status(self, sync_status):
def set_sync_status(self, sync_status, alarmable=True):
# Only report sync_status when managed
subcloud_managed = self.is_subcloud_managed()
if not subcloud_managed:
@@ -278,17 +285,23 @@ class SyncThread(object):
subcloud_sync.sync_status_report_time, timeutils.utcnow())
if delta < 3600:
if subcloud_sync.sync_status_reported == sync_status:
LOG.debug("skip set_sync_status sync_status_reported=%s, sync_status=%s " %
(subcloud_sync.sync_status_reported, sync_status, ),
extra=self.log_extra)
LOG.debug(
"skip set_sync_status sync_status_reported={}, "
"sync_status={}".format(
subcloud_sync.sync_status_reported, sync_status
),
extra=self.log_extra,
)
return

LOG.info("{}: set_sync_status {}".format(self.subcloud_name, sync_status),
LOG.info("{}: set_sync_status {}, alarmable: {}".format(
self.subcloud_name, sync_status, alarmable),
extra=self.log_extra)

self.dcmanager_state_rpc_client.update_subcloud_endpoint_status(
self.ctxt, self.subcloud_name,
self.endpoint_type, sync_status)
self.endpoint_type, sync_status,
alarmable=alarmable)

db_api.subcloud_sync_update(
self.ctxt, self.subcloud_name, self.endpoint_type,
@@ -300,151 +313,175 @@ class SyncThread(object):
extra=self.log_extra)
region_name = self.subcloud_name
sync_requests = []
# We want to check for pending work even if subcloud is disabled.

states = [
consts.ORCH_REQUEST_QUEUED,
consts.ORCH_REQUEST_IN_PROGRESS,
consts.ORCH_REQUEST_FAILED,
]
sync_requests = orchrequest.OrchRequestList.get_by_attrs(
self.ctxt, self.endpoint_type,
target_region_name=region_name,
states=states)
if(len(sync_requests) != 0):
LOG.info("Got " + str(len(sync_requests)) + " sync request(s)",
states=self.PENDING_SYNC_REQUEST_STATES)

# Early exit in case there are no pending sync requests
if not sync_requests:
LOG.info("Sync resources done for subcloud - "
"no sync requests",
extra=self.log_extra)
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
return

LOG.info(
"Got {} sync request(s)".format(len(sync_requests)),
extra=self.log_extra,
)

actual_sync_requests = []
alarmable = False
for req in sync_requests:
# Failed orch requests were taken into consideration when reporting
# sync status to the dcmanager. They need to be removed from the
# orch requests list before proceeding.
if req.state != consts.ORCH_REQUEST_STATE_FAILED:
actual_sync_requests.append(req)
else:
# Any failed state should be alarmable
alarmable = True

# Do not raise an alarm if all the sync requests are due to
# a fernet key rotation, as these are expected to occur
# periodically.
if req.orch_job.source_resource_id != FERNET_REPO_MASTER_ID:
alarmable = True

# todo: for each request look up sync handler based on
# resource type (I'm assuming here we're not storing a python
# object in the DB)
# Update dcmanager with the current sync status.
subcloud_enabled = self.is_subcloud_enabled()
if sync_requests:
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
sync_status_start = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC
else:
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
sync_status_start = dccommon_consts.SYNC_STATUS_IN_SYNC
self.set_sync_status(
dccommon_consts.SYNC_STATUS_OUT_OF_SYNC, alarmable=alarmable
)
sync_status_start = dccommon_consts.SYNC_STATUS_OUT_OF_SYNC

# Failed orch requests were taken into consideration when reporting
# sync status to the dcmanager. They need to be removed from the
# orch requests list before proceeding.
actual_sync_requests = \
[r for r in sync_requests if r.state != consts.ORCH_REQUEST_STATE_FAILED]
if not actual_sync_requests:
LOG.info("Sync resources done for subcloud - "
"no valid sync requests",
extra=self.log_extra)
return
elif not self.is_subcloud_enabled():
LOG.info("Sync resources done for subcloud - "
"subcloud is disabled",
extra=self.log_extra)
return

if not actual_sync_requests or not subcloud_enabled:
# Either there are no sync requests, or subcloud is disabled,
# or we timed out trying to talk to it.
# We're not going to process any sync requests, just go
# back to sleep.
if not subcloud_enabled:
LOG.info("subcloud is disabled", extra=self.log_extra)
else:
# Subcloud is enabled and there are pending sync requests, so
# we have work to do.
# Subcloud is enabled and there are pending sync requests, so
# we have work to do.
request_aborted = False
try:
for request in actual_sync_requests:
if not self.is_subcloud_enabled() or \
self.should_exit():
# Oops, someone disabled the endpoint while
# we were processing work for it.
raise exceptions.EndpointNotReachable()
request.state = consts.ORCH_REQUEST_STATE_IN_PROGRESS
try:
request.save() # save to DB
except exceptions.OrchRequestNotFound:
# This case is handled in loop below, but should also be
# handled here as well.
LOG.info(
"Orch request already deleted request uuid=%s state=%s"
% (request.uuid, request.state),
extra=self.log_extra,
)
continue

request_aborted = False
try:
for request in actual_sync_requests:
if not self.is_subcloud_enabled() or \
self.should_exit():
# Oops, someone disabled the endpoint while
# we were processing work for it.
raise exceptions.EndpointNotReachable()
request.state = consts.ORCH_REQUEST_STATE_IN_PROGRESS
retry_count = 0
while retry_count < self.MAX_RETRY:
try:
request.save() # save to DB
self.sync_resource(request)
# Sync succeeded, mark the request as
# completed for tracking/debugging purpose
# and tag it for purge when its deleted
# time exceeds the data retention period.
request.state = \
consts.ORCH_REQUEST_STATE_COMPLETED
request.deleted = 1
request.deleted_at = timeutils.utcnow()
request.save()
break
except exceptions.OrchRequestNotFound:
# This case is handled in loop below, but should also be
# handled here as well.
LOG.info("Orch request already deleted request uuid=%s state=%s" %
(request.uuid, request.state),
extra=self.log_extra)
continue

retry_count = 0
while retry_count < self.MAX_RETRY:
try:
self.sync_resource(request)
# Sync succeeded, mark the request as
# completed for tracking/debugging purpose
# and tag it for purge when its deleted
# time exceeds the data retention period.
request.state = \
consts.ORCH_REQUEST_STATE_COMPLETED
request.deleted = 1
request.deleted_at = timeutils.utcnow()
request.save()
break
except exceptions.OrchRequestNotFound:
LOG.info("Orch request already deleted request uuid=%s state=%s" %
(request.uuid, request.state),
extra=self.log_extra)
break
except exceptions.SyncRequestTimeout:
request.try_count += 1
request.save()
retry_count += 1
if retry_count >= self.MAX_RETRY:
# todo: raise "unable to sync this
# subcloud/endpoint" alarm with fmapi
raise exceptions.EndpointNotReachable()
except exceptions.SyncRequestFailedRetry:
LOG.info(
"Orch request already deleted request uuid=%s state=%s"
% (request.uuid, request.state),
extra=self.log_extra,
)
break
except exceptions.SyncRequestTimeout:
request.try_count += 1
request.save()
retry_count += 1
if retry_count >= self.MAX_RETRY:
# todo: raise "unable to sync this
# subcloud/endpoint" alarm with fmapi
request.try_count += 1
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count += 1
# we'll retry
except exceptions.SyncRequestFailed:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
except exceptions.SyncRequestAbortedBySystem:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
request_aborted = True
raise exceptions.EndpointNotReachable()
except exceptions.SyncRequestFailedRetry:
# todo: raise "unable to sync this
# subcloud/endpoint" alarm with fmapi
request.try_count += 1
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count += 1
# we'll retry
except exceptions.SyncRequestFailed:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
except exceptions.SyncRequestAbortedBySystem:
request.state = \
consts.ORCH_REQUEST_STATE_FAILED
request.save()
retry_count = self.MAX_RETRY
request_aborted = True

# If we fall out of the retry loop we either succeeded
# or failed multiple times and want to move to the next
# request.
# If we fall out of the retry loop we either succeeded
# or failed multiple times and want to move to the next
# request.

except exceptions.EndpointNotReachable:
# Endpoint not reachable, throw away all the sync requests.
LOG.info("EndpointNotReachable, {} sync requests pending"
.format(len(actual_sync_requests)))
# del sync_requests[:] #This fails due to:
# 'OrchRequestList' object does not support item deletion
except exceptions.EndpointNotReachable:
# Endpoint not reachable, throw away all the sync requests.
LOG.info(
"EndpointNotReachable, {} sync requests pending".format(
len(actual_sync_requests)), extra=self.log_extra)
# del sync_requests[:] #This fails due to:
# 'OrchRequestList' object does not support item deletion

sync_requests = orchrequest.OrchRequestList.get_by_attrs(
self.ctxt, self.endpoint_type,
target_region_name=region_name,
states=states)
sync_requests = orchrequest.OrchRequestList.get_by_attrs(
self.ctxt, self.endpoint_type,
target_region_name=region_name,
states=self.PENDING_SYNC_REQUEST_STATES)

if (sync_requests and
sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC):
if (sync_requests and
sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC):
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
LOG.info(
"End of resource sync out-of-sync. {} sync request(s)".format(
len(sync_requests)), extra=self.log_extra)
elif sync_requests and request_aborted:
if sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
LOG.info("End of resource sync out-of-sync. " +
str(len(sync_requests)) + " sync request(s)",
extra=self.log_extra)
elif sync_requests and request_aborted:
if sync_status_start != dccommon_consts.SYNC_STATUS_OUT_OF_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_OUT_OF_SYNC)
LOG.info("End of resource sync out-of-sync. " +
str(len(sync_requests)) + " sync request(s)" +
": request_aborted", extra=self.log_extra)
elif sync_status_start != dccommon_consts.SYNC_STATUS_IN_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
LOG.info("End of resource sync in-sync. " +
str(len(sync_requests)) + " sync request(s)",
extra=self.log_extra)
LOG.info(
"End of resource sync out-of-sync. {} sync request(s): "
"request_aborted".format(len(sync_requests)),
extra=self.log_extra)
elif sync_status_start != dccommon_consts.SYNC_STATUS_IN_SYNC:
self.set_sync_status(dccommon_consts.SYNC_STATUS_IN_SYNC)
LOG.info(
"End of resource sync in-sync. {} sync request(s)".format(
len(sync_requests)), extra=self.log_extra)

LOG.info("Sync resources done for subcloud", extra=self.log_extra)
LOG.info("Sync resources done for subcloud - "
"synced {} request(s)".format(len(actual_sync_requests)),
extra=self.log_extra)

def run_sync_audit(self, engine_id=None):
if self.endpoint_type in cfg.CONF.disable_audit_endpoints: