Integrate dcagent with dcmanager-audit

This commit integrates the newly created dcagent with dcmanager-audit. Currently, dcagent is disabled and will remain disabled until installed and enabled in subclouds. To enable, the return call in the function subcloud_has_dcagent in dccommon/utils needs to be removed, as well as removing the comments that check if dcagent should be called. NOTE: patch and load audits will be deprecated in the next major release, which is why they are not performed by dcagent. Until then, they will remain enabled. Test plan: - PASS: Run dcmanager audit with dcagent. Verify only one call is made to audit the subcloud and the response include the correct sync status. - PASS: Run dcmanager audit without dcagent. Verify the audit works as expected querying each individual endpoint. - PASS: Verify dcmanager can audit a subcloud with dcagent and another one with old audit workflow at the same time and verify both subclouds get to 'in-sync' status. - PASS: Induce a status where the sync status of the subclouds' endpoints change (changing cert-id of kube rootca). Verify that the audit and communication to dcmanager-state happen within the 30s audit interval. Story: 2011106 Task: 50560 Change-Id: Ic330a18daefeb6f12fa7589ae6ba82cafd30f298 Signed-off-by: Victor Romano <victor.gluzromano@windriver.com>
2024-07-02 00:56:00 -03:00
parent 0f15d27b36
commit ac33bc6af1
2 changed files with 164 additions and 5 deletions
--- a/distributedcloud/dccommon/utils.py
+++ b/distributedcloud/dccommon/utils.py
@@ -399,6 +399,13 @@ def send_subcloud_shutdown_signal(subcloud_name):
    rvmc.power_off(subcloud_name, rvmc_config_file, LOG)


+def subcloud_has_dcagent(software_version: str):
+    # TODO(vgluzrom): remove "False" condition and uncomment code
+    # when dcagent service is enabled by default
+    return False
+    # return software_version >= consts.MIN_VERSION_FOR_DCAGENT
+
+
 def log_subcloud_msg(
    log_func: Callable, msg: str, subcloud_name: str = None, avail_status: str = None
 ):
--- a/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py
+++ b/distributedcloud/dcmanager/audit/subcloud_audit_worker_manager.py
@@ -21,10 +21,13 @@ from oslo_config import cfg
 from oslo_log import log as logging

 from dccommon import consts as dccommon_consts
+from dccommon.drivers.openstack.dcagent_v1 import DcagentClient
 from dccommon.drivers.openstack.fm import FmClient
 from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
 from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
 from dccommon import endpoint_cache
+from dccommon.utils import log_subcloud_msg
+from dccommon.utils import subcloud_has_dcagent
 from dcmanager.audit import alarm_aggregation
 from dcmanager.audit import base_audit
 from dcmanager.audit import firmware_audit
@@ -269,6 +272,38 @@ class SubcloudAuditWorkerManager(manager.Manager):
            and first_identity_sync_complete
        )

+    def _build_dcagent_payload(
+        self,
+        subcloud_management_state,
+        subcloud_avail_status,
+        first_identity_sync_complete,
+        firmware_audit_data,
+        kubernetes_audit_data,
+        kube_rootca_update_audit_data,
+        software_audit_data,
+        do_firmware_audit,
+        do_kubernetes_audit,
+        do_kube_rootca_update_audit,
+        do_software_audit,
+    ):
+        audit_payload = {dccommon_consts.BASE_AUDIT: ""}
+        if self._should_perform_additional_audit(
+            subcloud_management_state,
+            subcloud_avail_status,
+            first_identity_sync_complete,
+        ):
+            if do_firmware_audit and firmware_audit_data:
+                audit_payload[dccommon_consts.FIRMWARE_AUDIT] = firmware_audit_data
+            if do_kubernetes_audit and kubernetes_audit_data:
+                audit_payload[dccommon_consts.KUBERNETES_AUDIT] = kubernetes_audit_data
+            if do_kube_rootca_update_audit and kube_rootca_update_audit_data:
+                audit_payload[dccommon_consts.KUBE_ROOTCA_AUDIT] = (
+                    kube_rootca_update_audit_data
+                )
+            if do_software_audit and software_audit_data:
+                audit_payload[dccommon_consts.SOFTWARE_AUDIT] = software_audit_data
+        return audit_payload
+
    def _audit_subcloud(
        self,
        subcloud: models.Subcloud,
@@ -297,14 +332,17 @@ class SubcloudAuditWorkerManager(manager.Manager):
        failures = list()
        availability_data = dict()
        endpoint_data = dict()
+        has_dcagent = subcloud_has_dcagent(subcloud.software_version)

        # Set defaults to None and disabled so we will still set disabled
        # status if we encounter an error.

        keystone_client = None
+        dcagent_client = None
        sysinv_client = None
        fm_client = None
        avail_to_set = dccommon_consts.AVAILABILITY_OFFLINE
+        failmsg = "Audit failure subcloud: %s, endpoint: %s"
        try:
            keystone_client = OpenStackDriver(
                region_name=subcloud_region,
@@ -312,6 +350,14 @@ class SubcloudAuditWorkerManager(manager.Manager):
                fetch_subcloud_ips=utils.fetch_subcloud_mgmt_ips
            ).keystone_client
            admin_session = keystone_client.session
+            if has_dcagent:
+                dcagent_client = DcagentClient(
+                    subcloud_region,
+                    admin_session,
+                    endpoint=endpoint_cache.build_subcloud_endpoint(
+                        subcloud_management_ip, "dcagent"
+                    ),
+                )
            sysinv_client = SysinvClient(
                subcloud_region, admin_session,
                endpoint=keystone_client.endpoint_cache.get_endpoint("sysinv")
@@ -363,8 +409,114 @@ class SubcloudAuditWorkerManager(manager.Manager):
            LOG.exception("Failed to create clients for subcloud: %s"
                          % subcloud_name)

-        # Check availability of the subcloud
-        if sysinv_client:
+        if has_dcagent and dcagent_client:
+            LOG.debug(f"Starting dcagent audit for subcloud: {subcloud_name}")
+            # If we don't have the audit data, we won't send the request to the
+            # dcagent service, so we set the status to "in sync"
+            if do_firmware_audit and not firmware_audit_data:
+                endpoint_data[dccommon_consts.ENDPOINT_TYPE_FIRMWARE] = (
+                    dccommon_consts.SYNC_STATUS_IN_SYNC
+                )
+                audits_done.append(dccommon_consts.ENDPOINT_TYPE_FIRMWARE)
+            if do_kubernetes_audit and not kubernetes_audit_data:
+                endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBERNETES] = (
+                    dccommon_consts.SYNC_STATUS_IN_SYNC
+                )
+                audits_done.append(dccommon_consts.ENDPOINT_TYPE_KUBERNETES)
+            if do_kube_rootca_update_audit and not kube_rootca_update_audit_data:
+                endpoint_data[dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA] = (
+                    dccommon_consts.SYNC_STATUS_IN_SYNC
+                )
+                audits_done.append(dccommon_consts.ENDPOINT_TYPE_KUBE_ROOTCA)
+            if do_software_audit and not software_audit_data:
+                endpoint_data[dccommon_consts.ENDPOINT_TYPE_SOFTWARE] = (
+                    dccommon_consts.SYNC_STATUS_IN_SYNC
+                )
+                audits_done.append(dccommon_consts.ENDPOINT_TYPE_SOFTWARE)
+            LOG.debug(f"Skipping following audits for subcloud {subcloud_name} because "
+                      f"RegionOne audit data is not available: {audits_done}")
+            audit_payload = self._build_dcagent_payload(
+                subcloud.management_state,
+                avail_status_current,
+                subcloud.first_identity_sync_complete,
+                firmware_audit_data,
+                kubernetes_audit_data,
+                kube_rootca_update_audit_data,
+                software_audit_data,
+                do_firmware_audit,
+                do_kubernetes_audit,
+                do_kube_rootca_update_audit,
+                do_software_audit,
+            )
+            audit_results = {}
+            try:
+                audit_results = dcagent_client.audit(audit_payload)
+            except Exception:
+                LOG.exception(failmsg % (subcloud.name, "dcagent"))
+                failures.append("dcagent")
+            LOG.debug(
+                f"Audits results for subcloud {subcloud_name}: "
+                f"{subcloud_name}: {audit_results}"
+            )
+            for audit_type, audit_value in audit_results.items():
+                if audit_type == dccommon_consts.BASE_AUDIT:
+                    avail_to_set = audit_value.get("availability")
+                    if avail_to_set == dccommon_consts.AVAILABILITY_OFFLINE:
+                        inactive_sg = audit_value.get("inactive_sg")
+                        msg = f"Inactive service groups: {inactive_sg}"
+                        log_subcloud_msg(LOG.debug, msg, subcloud_name, avail_to_set)
+                    alarms = audit_value.get("alarms")
+                    if alarms:
+                        self.alarm_aggr.update_alarm_summary(subcloud_name, alarms)
+                elif audit_value:
+                    endpoint_type = dccommon_consts.DCAGENT_ENDPOINT_TYPE_MAP[
+                        audit_type
+                    ]
+                    endpoint_data[endpoint_type] = audit_value
+                    audits_done.append(endpoint_type)
+            # Patch and load audits are not done in dcagent,
+            # so we need to do it separately
+            if self._should_perform_additional_audit(
+                subcloud.management_state,
+                avail_to_set,
+                subcloud.first_identity_sync_complete,
+            ):
+                if do_patch_audit and patch_audit_data:
+                    try:
+                        endpoint_data[dccommon_consts.ENDPOINT_TYPE_PATCHING] = (
+                            self.patch_audit.subcloud_patch_audit(
+                                keystone_client.session,
+                                sysinv_client,
+                                subcloud_management_ip,
+                                subcloud_name,
+                                subcloud_region,
+                                patch_audit_data,
+                            )
+                        )
+                        audits_done.append(dccommon_consts.ENDPOINT_TYPE_PATCHING)
+                    except Exception:
+                        LOG.exception(
+                            failmsg
+                            % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_PATCHING)
+                        )
+                        failures.append(dccommon_consts.ENDPOINT_TYPE_PATCHING)
+                if do_load_audit and patch_audit_data:
+                    try:
+                        endpoint_data[dccommon_consts.ENDPOINT_TYPE_LOAD] = (
+                            self.patch_audit.subcloud_load_audit(
+                                sysinv_client, subcloud_name, patch_audit_data
+                            )
+                        )
+                        audits_done.append(dccommon_consts.ENDPOINT_TYPE_LOAD)
+                    except Exception:
+                        LOG.exception(
+                            failmsg
+                            % (subcloud.name, dccommon_consts.ENDPOINT_TYPE_LOAD)
+                        )
+                        failures.append(dccommon_consts.ENDPOINT_TYPE_LOAD)
+
+        # Check availability for subcloud that doesn't have dcagent
+        if not has_dcagent and sysinv_client:
            # Avoid a network call to sysinv here if possible:
            # If prestaging is active we can assume that the subcloud
            # is online (otherwise prestaging will fail):
@@ -422,9 +574,9 @@ class SubcloudAuditWorkerManager(manager.Manager):
                "audit_fail_count": None
            })

-        # If subcloud is managed and online and the identity was synced once,
-        # audit additional resources
-        if self._should_perform_additional_audit(
+        # If subcloud is managed, online, the identity was synced once
+        # and it doesn't have dcagent, audit additional resources
+        if not has_dcagent and self._should_perform_additional_audit(
            subcloud.management_state,
            avail_to_set,
            subcloud.first_identity_sync_complete,