Files
distcloud/distributedcloud/dccertmon/tests/common/test_notification_audit_queue.py
Hugo Brito de4790f539 Dccertmon cache cleanup
The previously introduced token caching mechanism [1] reduces token
requests to subclouds and also effectively handles Keystone endpoint
caching, as the endpoint catalog is cached alongside tokens.

This commit applies the endpoint_cache logic to DCCertmon.

[1] https://review.opendev.org/c/starlingx/distcloud/+/931830

Test Plan:
- PASS: Deploy a subcloud, manage it and verify that the dc-cert
        status is updated to in-sync.
- PASS: In the systemcontroller, delete the secret for the
        adminep-ca-certificate of a managed subcloud and verify that
        it is updated both on the system controller and the subcloud.
- PASS: Deploy a subcloud and wait for dccertmon to audit. Turn off
        the subcloud and wait an hour after turning it on. Verify it
        is audited correctly and the dc-cert status is updated to
        in-sync.

Story: 2011311
Task: 52382

Change-Id: If2d05a1e3b6ac0466b8a15c70c6b8ada851a009f
Signed-off-by: Hugo Brito <hugo.brito@windriver.com>
2025-06-23 11:14:44 -03:00

230 lines
8.0 KiB
Python

# Copyright (c) 2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import time
import eventlet
eventlet.monkey_patch(os=False)
# pylint: disable=wrong-import-position
from keystoneauth1 import session # noqa: E402
from dccommon.endpoint_cache import EndpointCache # noqa: E402
from dccertmon.common import ( # noqa: E402
certificate_monitor_manager as cert_mon_manager,
)
from dccertmon.common.service import CertificateMonitorService # noqa: E402
from dccertmon.common.subcloud_audit_queue import ( # noqa: E402
NotificationAuditQueue,
SubcloudAuditData,
SubcloudAuditException,
)
from dccertmon.tests.base import DCCertMonTestCase # noqa: E402
# pylint: enable=wrong-import-position
# Module-level state shared between slow_get_cert (the mocked certificate
# retrieval defined below) and the test cases that install it as a side effect.
# Start time (time.time()) of each simulated audit, in the order they began.
audit_order = []
# Event fired by slow_get_cert when the first simulated audit begins.
first_started = eventlet.event.Event()
# Number of seconds each simulated audit sleeps before returning.
AUDIT_SLEEP_TIME = 3
def slow_get_cert(*args, **kwargs):
    """Mock side effect that simulates a slow certificate fetch.

    Appends the current time to the module-level `audit_order` list and, on
    the first recorded call, fires the module-level `first_started` event.
    It then sleeps for AUDIT_SLEEP_TIME seconds so the caller stays busy for
    that long. All arguments are accepted and ignored; the result is None.
    """
    audit_order.append(time.time())

    is_first_audit = len(audit_order) == 1
    if is_first_audit:
        first_started.send()

    # Keep the simulated audit running for a while before returning.
    eventlet.sleep(AUDIT_SLEEP_TIME)
class NotificationAuditQueueTestCase(DCCertMonTestCase):
    """Tests for enqueue, lookup, size and ordering of NotificationAuditQueue."""

    def setUp(self):
        super().setUp()
        # Each test works on its own freshly created queue.
        self.queue = NotificationAuditQueue()

    def tearDown(self):
        # Drop the queue reference before the base class tears down.
        self.queue = None
        super().tearDown()

    def test_enqueue_single(self):
        """One enqueued entry is reflected by qsize() and contains()."""
        name = "subcloud1"
        self.queue.enqueue(SubcloudAuditData(name))

        self.assertEqual(self.queue.qsize(), 1)
        self.assertTrue(self.queue.contains(name))

    def test_enqueue_duplicate_raises(self):
        """Enqueueing the same entry twice raises SubcloudAuditException."""
        entry = SubcloudAuditData("subcloud1")
        self.queue.enqueue(entry)

        with self.assertRaises(SubcloudAuditException):
            self.queue.enqueue(entry)

    def test_enqueue_with_timestamp_ordering(self):
        """Entries come back from get() ordered by ascending timestamp."""
        entries = [SubcloudAuditData(f"subcloud{i}") for i in range(3)]
        base = int(time.time())

        # Insert deliberately out of order: (entry index, timestamp offset).
        for index, offset in ((2, 20), (0, 0), (1, 10)):
            self.queue.enqueue(entries[index], timestamp=base + offset)

        # get() returns a pair; the audit data is its second element.
        dequeued = [self.queue.get()[1] for _ in entries]

        for position, entry in enumerate(dequeued):
            self.assertEqual(entry.name, f"subcloud{position}")

    def test_contains_and_qsize(self):
        """contains() and qsize() track the queue before and after an enqueue."""
        name = "subcloudX"

        self.assertFalse(self.queue.contains(name))
        self.assertEqual(self.queue.qsize(), 0)

        self.queue.enqueue(SubcloudAuditData(name))

        self.assertTrue(self.queue.contains(name))
        self.assertEqual(self.queue.qsize(), 1)
class NotificationAuditBehaviorTestCase(DCCertMonTestCase):
    """Tests for how CertificateMonitorManager enqueues, requeues and locks audits."""

    def setUp(self):
        super().setUp()

        # slow_get_cert records into these module globals; rebind them so each
        # test starts with an empty history and an unfired event.
        global audit_order, first_started
        audit_order = []
        first_started = eventlet.event.Event()

        self.manager = cert_mon_manager.CertificateMonitorManager()
        self.manager.sc_audit_pool = None  # Force serial execution

        self.service = CertificateMonitorService()
        self.service.manager = self.manager

        # Store common mocks as instance attributes
        self.mock_get_subcloud = self._mock_object(
            cert_mon_manager.utils, "get_subcloud"
        )
        self.mock_is_subcloud_online = self._mock_object(
            cert_mon_manager.utils, "is_subcloud_online"
        )
        self._mock_object(EndpointCache, "get_admin_session")
        self.mock_get_token = self._mock_object(session.Session, "get_token")
        # Mock of get_endpoint_certificate; individual tests install either a
        # failure or slow_get_cert as its side effect.
        self.mock_slow_get_cert = self._mock_object(
            cert_mon_manager.utils, "get_endpoint_certificate"
        )

        self.mock_get_subcloud.return_value = {
            "name": "subcloud",
            "deploy-status": "complete",
            "availability-status": "online",
            "management-start-ip": "1.2.3.4",
        }
        self.mock_is_subcloud_online.return_value = True
        self.mock_get_token.return_value = "fake-token"

    def _spawn_audit(self, queue, item):
        """Start do_subcloud_audit in a greenthread that is killed at cleanup.

        If an assertion fails before the test reaches wait(), the greenthread
        would otherwise keep running and, through slow_get_cert, write into
        the module-level audit_order / first_started that a later test has
        already reset. kill() on an already finished greenthread is a no-op.
        """
        thread = eventlet.spawn(self.manager.do_subcloud_audit, queue, item)
        self.addCleanup(thread.kill)
        return thread

    def test_subcloud_added_to_notification_queue(self):
        """Ensure subcloud is enqueued when marked online."""
        subcloud = "subcloud1"
        self.assertFalse(self.manager.sc_notify_audit_queue.contains(subcloud))
        self.manager.audit_subcloud(subcloud, self.manager.sc_notify_audit_queue)
        self.assertTrue(self.manager.sc_notify_audit_queue.contains(subcloud))

    def test_failed_audit_requeues_with_delay(self):
        """Ensure that an audit failure requeues the subcloud with delay."""
        subcloud = "subcloud2"
        audit_data = SubcloudAuditData(subcloud)
        self.manager.sc_notify_audit_queue.enqueue(audit_data)

        # Patch internal utils to simulate failure in cert retrieval
        self.mock_slow_get_cert.side_effect = Exception("fail")

        _, item = self.manager.sc_notify_audit_queue.get()
        self.manager._subcloud_audit(self.manager.sc_notify_audit_queue, item)

        # The item should have been re-enqueued with now+60
        self.assertTrue(self.manager.sc_notify_audit_queue.contains(subcloud))
        next_timestamp, _ = self.manager.sc_notify_audit_queue.queue[0]
        now = int(time.time())
        # One second of slack for the time elapsed since the requeue.
        self.assertGreaterEqual(next_timestamp, now + 59)

    def test_audit_same_subclouds_is_serialized(self):
        """Ensure audits for the same subcloud run sequentially using the lock."""
        subcloud = "subcloud-lock-test"
        item1 = SubcloudAuditData(subcloud)
        item2 = SubcloudAuditData(subcloud)
        self.manager.sc_audit_queue.enqueue(item1)
        self.manager.sc_notify_audit_queue.enqueue(item2)
        self.mock_slow_get_cert.side_effect = slow_get_cert

        # Spawn first audit and wait until it starts (acquires the lock)
        t1 = self._spawn_audit(self.manager.sc_audit_queue, item1)
        first_started.wait(timeout=5)
        # send() carries no value, so wait() returns None both on success and
        # on timeout; check the event itself to tell the two apart.
        self.assertTrue(
            first_started.ready(),
            "First audit did not start within the timeout",
        )

        # Spawn second audit while first is still holding the lock
        t2 = self._spawn_audit(self.manager.sc_notify_audit_queue, item2)
        eventlet.sleep(1)

        # Assert that the second audit hasn't started yet
        self.assertEqual(
            len(audit_order),
            1,
            "Second audit should still be waiting for the lock",
        )

        # Wait for both audits to complete
        t1.wait()
        t2.wait()

        self.assertEqual(len(audit_order), 2, "Both audits should have run")
        self.assertLess(
            audit_order[0] + AUDIT_SLEEP_TIME,
            audit_order[1],
            "Second audit should have started after the first released the lock",
        )

    def test_audit_different_subclouds_run_concurrently(self):
        """Ensure audits for different subclouds are not blocked by the lock."""
        subcloud1 = "subcloud-lock-test1"
        subcloud2 = "subcloud-lock-test2"
        item1 = SubcloudAuditData(subcloud1)
        item2 = SubcloudAuditData(subcloud2)
        self.manager.sc_audit_queue.enqueue(item1)
        self.manager.sc_notify_audit_queue.enqueue(item2)
        self.mock_slow_get_cert.side_effect = slow_get_cert

        # Spawn both audits for different subclouds simultaneously
        t1 = self._spawn_audit(self.manager.sc_audit_queue, item1)
        t2 = self._spawn_audit(self.manager.sc_notify_audit_queue, item2)
        eventlet.sleep(1)

        # Both audits should have started within a short time
        self.assertEqual(len(audit_order), 2, "Both audits should have started")

        t1.wait()
        t2.wait()

        self.assertLess(
            abs(audit_order[0] - audit_order[1]),
            AUDIT_SLEEP_TIME,
            "Audits for different subclouds should run concurrently",
        )