Improve subcloud endpoint update

This commit removes the dcorch endpoint update call during network
reconfiguration. After the dcorch integration with the
OptimizedOpenStackDriver [1], the subcloud endpoints are not cached
anymore and instead are built using the subcloud management IP. So the
OptimizedOpenStackDriver is only used to get the endpoints of the
master region (RegionOne), which doesn't change during network
reconfiguration.

This commit also fixes an issue where the dcmanager-manager service's
own endpoint cache was not updated during network reconfiguration.

It also fixes or change the OptimizedOpenStack driver usage in a few
places that were not modified in previous commits.

Test Plan:
1. PASS - To test platform sync, run dns-modify with and without
          --os-region-name SystemController and verify that when using
          the option, dcorch-proxy trigger a sync request immediatelly
          and that without the option it still gets triggered by dcorch
          audit. Verify that the DNS change correctly propagated to the
          subcloud;
2. PASS - To test identity sync, repeat the previous test but instead
          of changing the DNS, create a new user using the 'openstack
          user create/delete' command;
3. PASS - To test the removal of the endpoint update, modify the DNS
          without '--os-region-name SystemController', run the network
          reconfig and verify that the dcorch sync uses the new IP and
          the DNS changes are correctly synced with the subcloud;
4. PASS - Run a fernet key rotation test and verify that it still works
          as expected.

[1]: https://review.opendev.org/c/starlingx/distcloud/+/919959

Story: 2011106
Task: 50417

Change-Id: Ic06c6f6cfb12b440e36f72eb29bbffaf8988b921
Signed-off-by: Gustavo Herzmann <gustavo.herzmann@windriver.com>
This commit is contained in:
Gustavo Herzmann 2024-06-24 09:40:09 -03:00
parent 9d87796b50
commit 3d342a2a88
13 changed files with 69 additions and 91 deletions

View File

@ -606,9 +606,10 @@ class OptimizedEndpointCache(object):
)
return session.Session(auth=auth)
@classmethod
@lockutils.synchronized(LOCK_NAME)
def update_master_service_endpoint_region(
self, region_name: str, endpoint_values: dict
cls, region_name: str, endpoint_values: dict
) -> None:
"""Update the master endpoint map for a specific region.
@ -622,12 +623,13 @@ class OptimizedEndpointCache(object):
f"{region_name} with endpoints: {endpoint_values}"
)
# Update the current endpoint map
OptimizedEndpointCache.master_service_endpoint_map[region_name] = (
endpoint_values
)
if OptimizedEndpointCache.master_service_endpoint_map:
OptimizedEndpointCache.master_service_endpoint_map[region_name] = (
endpoint_values
)
# Update the cached subcloud endpoit map
if OptimizedEndpointCache.subcloud_endpoints and not self._is_central_cloud(
if OptimizedEndpointCache.subcloud_endpoints and not cls._is_central_cloud(
region_name
):
LOG.debug(

View File

@ -26,7 +26,7 @@ from dccommon.drivers.openstack.sdk_platform import (
OptimizedOpenStackDriver as OpenStackDriver
)
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon.endpoint_cache import build_subcloud_endpoint
from dccommon import endpoint_cache
from dcmanager.audit import alarm_aggregation
from dcmanager.audit import firmware_audit
from dcmanager.audit import kube_rootca_update_audit
@ -150,20 +150,11 @@ class SubcloudAuditWorkerManager(manager.Manager):
do_software_audit)
def update_subcloud_endpoints(self, context, subcloud_name, endpoints):
try:
LOG.info("Updating service endpoints for subcloud %s "
"in endpoint cache" % subcloud_name)
endpoint_cache = OpenStackDriver(
region_name=dccommon_consts.CLOUD_0,
fetch_subcloud_ips=utils.fetch_subcloud_mgmt_ips,
).keystone_client.endpoint_cache
endpoint_cache.update_master_service_endpoint_region(
subcloud_name, endpoints)
except (keystone_exceptions.EndpointNotFound,
keystone_exceptions.ConnectFailure,
IndexError):
LOG.error("Failed to update the service endpoints "
"for subcloud %s." % subcloud_name)
LOG.info(f"Updating service endpoints for subcloud {subcloud_name} "
"in endpoint cache")
endpoint_cache.OptimizedEndpointCache.update_master_service_endpoint_region(
subcloud_name, endpoints
)
def _update_subcloud_audit_fail_count(self, subcloud,
audit_fail_count):
@ -371,9 +362,11 @@ class SubcloudAuditWorkerManager(manager.Manager):
).keystone_client
admin_session = keystone_client.session
sysinv_client = SysinvClient(
subcloud_region, admin_session, endpoint=build_subcloud_endpoint(
subcloud_region,
admin_session,
endpoint=endpoint_cache.build_subcloud_endpoint(
subcloud_management_ip, "sysinv"
)
),
)
fm_client = FmClient(subcloud_region, admin_session)
except keystone_exceptions.ConnectTimeout:

View File

@ -42,10 +42,8 @@ import tsconfig.tsconfig as tsc
import yaml
from dccommon import consts as dccommon_consts
# TODO(gherzman): Replace OpenStackDriver with OptimizedOpenStackDriver
# during dcmanager OptimizedOpenStackDriver integration
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dccommon.drivers.openstack.sdk_platform import OptimizedOpenStackDriver
from dccommon.drivers.openstack.sdk_platform import OptimizedOpenStackDriver as \
OpenStackDriver
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon.drivers.openstack import vim
from dccommon import exceptions as dccommon_exceptions
@ -1048,8 +1046,11 @@ def is_subcloud_healthy(subcloud_region):
system_health = ""
try:
os_client = OpenStackDriver(region_name=subcloud_region,
region_clients=None)
os_client = OpenStackDriver(
region_name=subcloud_region,
region_clients=None,
fetch_subcloud_ips=fetch_subcloud_mgmt_ips,
)
keystone_client = os_client.keystone_client
endpoint = keystone_client.endpoint_cache.get_endpoint('sysinv')
sysinv_client = SysinvClient(subcloud_region,
@ -1078,7 +1079,7 @@ def is_subcloud_healthy(subcloud_region):
def get_systemcontroller_installed_loads():
try:
os_client = OptimizedOpenStackDriver(
os_client = OpenStackDriver(
region_name=dccommon_consts.SYSTEM_CONTROLLER_NAME,
region_clients=None,
fetch_subcloud_ips=fetch_subcloud_mgmt_ips,
@ -1359,7 +1360,7 @@ def decode_and_normalize_passwd(input_passwd):
def get_failure_msg(subcloud_region):
try:
os_client = OptimizedOpenStackDriver(
os_client = OpenStackDriver(
region_name=subcloud_region,
region_clients=None,
fetch_subcloud_ips=fetch_subcloud_mgmt_ips,
@ -1584,7 +1585,9 @@ def validate_name(name, prohibited_name_list=[],
def get_local_system():
m_ks_client = OpenStackDriver(
region_name=dccommon_consts.DEFAULT_REGION_NAME,
region_clients=None).keystone_client
region_clients=None,
fetch_subcloud_ips=fetch_subcloud_mgmt_ips,
).keystone_client
endpoint = m_ks_client.endpoint_cache.get_endpoint('sysinv')
sysinv_client = SysinvClient(dccommon_consts.DEFAULT_REGION_NAME,
m_ks_client.session,

View File

@ -47,6 +47,7 @@ from dccommon.drivers.openstack.sdk_platform import (
OptimizedOpenStackDriver as OpenStackDriver
)
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon.endpoint_cache import OptimizedEndpointCache as EndpointCache
from dccommon.exceptions import PlaybookExecutionFailed
from dccommon.exceptions import SubcloudNotFound
from dccommon import kubeoperator
@ -3435,11 +3436,7 @@ class SubcloudManager(manager.Manager):
# Update service URLs in subcloud endpoint cache
self.audit_rpc_client.trigger_subcloud_endpoints_update(
context, subcloud_region, services_endpoints)
# TODO(gherzm): Remove the update_subcloud_endpoints call once
# the OpenStackDriver is moved to be used only in the master process
self.dcorch_rpc_client.update_subcloud_endpoints(
context, subcloud_region, services_endpoints)
# Update the management ip inside dcorch database
# Update the management ip inside dcorch database (triggers endpoint update)
self.dcorch_rpc_client.update_subcloud_management_ip(
context, subcloud_region, endpoint_ip)
# Update sysinv URL in cert-mon cache
@ -3447,6 +3444,11 @@ class SubcloudManager(manager.Manager):
dc_notification.subcloud_sysinv_endpoint_update(
context, subcloud_region, services_endpoints.get("sysinv"))
# Update dcmanager endpoint cache
EndpointCache.update_master_service_endpoint_region(
subcloud_region, services_endpoints
)
def _create_subcloud_update_overrides_file(
self, payload, subcloud_name, filename_suffix):
update_overrides_file = os.path.join(

View File

@ -2353,12 +2353,6 @@ class TestSubcloudsPatchPrestage(BaseTestSubcloudsPatch):
def test_patch_prestage_fails_with_invalid_release(self):
"""Test patch prestage fails with invalid release"""
# TODO(gherzmann): Remove the following mock when the OpenStackDriver
# is fully replaced by OptimizedOpenStackDriver
mock_patch_object = mock.patch.object(cutils, 'OptimizedOpenStackDriver')
self.mock_openstack_driver = mock_patch_object.start()
self.addCleanup(mock_patch_object.stop)
self.params["release"] = "21.12"
self.mock_sysinv_client().get_loads.return_values = FakeLoad(

View File

@ -30,7 +30,8 @@ import webob.dec
import webob.exc
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack.sdk_platform import OpenStackDriver
from dccommon.drivers.openstack.sdk_platform import OptimizedOpenStackDriver as \
OpenStackDriver
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dcmanager.rpc import client as dcmanager_rpc_client
from dcorch.api.proxy.apps.dispatcher import APIDispatcher

View File

@ -24,7 +24,9 @@ from keystoneauth1 import exceptions as keystone_exceptions
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack import sdk_platform as sdk
from dccommon.drivers.openstack.sdk_platform import (
OptimizedOpenStackDriver as OpenStackDriver
)
from dcorch.common import consts
LOG = logging.getLogger(__name__)
@ -149,8 +151,11 @@ def set_request_forward_environ(req, remote_host, remote_port):
def _get_fernet_keys():
"""Get fernet keys from sysinv."""
os_client = sdk.OpenStackDriver(region_name=dccommon_consts.CLOUD_0,
thread_name='proxy')
os_client = OpenStackDriver(
region_name=dccommon_consts.CLOUD_0,
region_clients=("sysinv",),
thread_name="proxy",
)
try:
key_list = os_client.sysinv_client.get_fernet_keys()
return [str(getattr(key, 'key')) for key in key_list]
@ -158,12 +163,13 @@ def _get_fernet_keys():
keystone_exceptions.ConnectFailure) as e:
LOG.info("get_fernet_keys: cloud {} is not reachable [{}]"
.format(dccommon_consts.CLOUD_0, str(e)))
sdk.OpenStackDriver.delete_region_clients(dccommon_consts.CLOUD_0)
OpenStackDriver.delete_region_clients(dccommon_consts.CLOUD_0)
return None
except (AttributeError, TypeError) as e:
LOG.info("get_fernet_keys error {}".format(e))
sdk.OpenStackDriver.delete_region_clients(dccommon_consts.CLOUD_0,
clear_token=True)
OpenStackDriver.delete_region_clients(
dccommon_consts.CLOUD_0, clear_token=True
)
return None
except Exception as e:
LOG.exception(e)

View File

@ -5,7 +5,6 @@
#
import eventlet
from keystoneauth1 import exceptions as keystone_exceptions
from oslo_log import log as logging
from dccommon import consts as dccommon_consts
@ -13,7 +12,6 @@ from dcorch.common import consts as dco_consts
from dcorch.common import context
from dcorch.common import exceptions
from dcorch.db import api as db_api
from dcorch.drivers.openstack import sdk
from dcorch.engine import scheduler
from dcorch.engine.sync_services.identity import IdentitySyncThread
from dcorch.engine.sync_services.sysinv import SysinvSyncThread
@ -127,7 +125,7 @@ class GenericSyncWorkerManager(object):
db_api.subcloud_sync_create(context, name, endpoint_type,
# pylint: disable-next=no-member
values={'subcloud_id': sc.id})
# Create the sync object for this engine
# Create the sync object for this engine
self.create_sync_objects(name, capabilities, management_ip)
def del_subcloud(self, context, subcloud_name):
@ -290,20 +288,6 @@ class GenericSyncWorkerManager(object):
except KeyError:
raise exceptions.SubcloudNotFound(region_name=subcloud_name)
def update_subcloud_endpoints(self, context, subcloud_name, endpoints):
try:
LOG.info(f"Updating service endpoints for subcloud {subcloud_name} "
f"in endpoint cache")
endpoint_cache = sdk.OpenStackDriver(
region_name=dccommon_consts.CLOUD_0).keystone_client.endpoint_cache
endpoint_cache.update_master_service_endpoint_region(
subcloud_name, endpoints)
except (keystone_exceptions.EndpointNotFound,
keystone_exceptions.ConnectFailure,
IndexError):
LOG.error(f"Failed to update services endpoints for "
f"subcloud: {subcloud_name} in dcorch.")
def update_subcloud_management_ip(self, context, subcloud_name, management_ip):
try:
sc = subcloud.Subcloud.get_by_name(context, subcloud_name)

View File

@ -322,10 +322,6 @@ class EngineWorkerService(service.Service):
def update_subcloud_version(self, ctxt, subcloud_name, sw_version):
self.gswm.update_subcloud_version(ctxt, subcloud_name, sw_version)
@request_context
def update_subcloud_endpoints(self, ctxt, subcloud_name, endpoints):
self.gswm.update_subcloud_endpoints(ctxt, subcloud_name, endpoints)
@request_context
def update_subcloud_management_ip(self, ctxt, subcloud_name, management_ip):
self.gswm.update_subcloud_management_ip(ctxt, subcloud_name, management_ip)
@ -341,12 +337,12 @@ class EngineWorkerService(service.Service):
except Exception as ex:
LOG.error(f"Failed to stop engine-worker service: {six.text_type(ex)}")
def stop(self):
def stop(self, graceful=False):
self._stop_rpc_server()
# Terminate the engine process
LOG.info("All threads were gone, terminating engine-worker")
super(EngineWorkerService, self).stop()
super(EngineWorkerService, self).stop(graceful)
@request_context
# The sync job info has been written to the DB, alert the sync engine

View File

@ -28,7 +28,7 @@ from dcdbsync.dbsyncclient.client import Client
from dcdbsync.dbsyncclient import exceptions as dbsync_exceptions
from dcorch.common import consts
from dcorch.common import exceptions
from dcorch.engine.sync_thread import get_os_client
from dcorch.engine.sync_thread import get_master_os_client
from dcorch.engine.sync_thread import SyncThread
from dcorch.objects import resource
@ -109,11 +109,10 @@ class IdentitySyncThread(SyncThread):
session=self.sc_admin_session)
def get_master_ks_client(self):
return get_os_client(self.master_region_name, ['dbsync']).\
keystone_client.keystone_client
return get_master_os_client().keystone_client.keystone_client
def get_master_dbs_client(self):
return get_os_client(self.master_region_name, ['dbsync']).dbsync_client
return get_master_os_client(["dbsync"]).dbsync_client
def get_sc_ks_client(self):
if self.sc_ks_client is None:

View File

@ -24,7 +24,9 @@ from oslo_serialization import jsonutils
from oslo_utils import timeutils
from dccommon import consts as dccommon_consts
from dccommon.drivers.openstack import sdk_platform as sdk
from dccommon.drivers.openstack.sdk_platform import (
OptimizedOpenStackDriver as OpenStackDriver
)
from dccommon.drivers.openstack.sysinv_v1 import SysinvClient
from dccommon.endpoint_cache import build_subcloud_endpoint
from dccommon import exceptions as dccommon_exceptions
@ -35,7 +37,7 @@ from dcorch.engine.fernet_key_manager import FERNET_REPO_MASTER_ID
from dcorch.engine.fernet_key_manager import FernetKeyManager
from dcorch.engine.sync_thread import AUDIT_RESOURCE_EXTRA
from dcorch.engine.sync_thread import AUDIT_RESOURCE_MISSING
from dcorch.engine.sync_thread import get_os_client
from dcorch.engine.sync_thread import get_master_os_client
from dcorch.engine.sync_thread import SyncThread
LOG = logging.getLogger(__name__)
@ -98,7 +100,7 @@ class SysinvSyncThread(SyncThread):
endpoint=sc_sysinv_url)
def get_master_sysinv_client(self):
return get_os_client(self.master_region_name, ['sysinv']).sysinv_client
return get_master_os_client(['sysinv']).sysinv_client
def get_sc_sysinv_client(self):
if self.sc_sysinv_client is None:
@ -133,7 +135,7 @@ class SysinvSyncThread(SyncThread):
LOG.info("{} {} region_name {} not authorized".format(
request.orch_job.operation_type, rsrc.resource_type,
self.region_name), extra=self.log_extra)
sdk.OpenStackDriver.delete_region_clients(self.region_name)
OpenStackDriver.delete_region_clients(self.region_name)
raise exceptions.SyncRequestFailedRetry
except Exception as e:
LOG.exception(e)
@ -469,7 +471,7 @@ class SysinvSyncThread(SyncThread):
"[{}]".format(resource_type,
self.region_name,
str(e)), extra=self.log_extra)
sdk.OpenStackDriver.delete_region_clients(self.region_name)
OpenStackDriver.delete_region_clients(self.region_name)
return None
except (AttributeError, TypeError) as e:
LOG.info("get subcloud_resources {} error {}".format(
@ -482,9 +484,9 @@ class SysinvSyncThread(SyncThread):
def post_audit(self):
# TODO(lzhu1): This should be revisited once the master cache service
# is implemented.
sdk.OpenStackDriver.delete_region_clients_for_thread(
OpenStackDriver.delete_region_clients_for_thread(
self.region_name, 'audit')
sdk.OpenStackDriver.delete_region_clients_for_thread(
OpenStackDriver.delete_region_clients_for_thread(
dccommon_consts.CLOUD_0, 'audit')
def get_certificates_resources(self, sysinv_client):

View File

@ -64,17 +64,18 @@ AUDIT_RESOURCE_EXTRA = 'extra_resource'
AUDIT_LOCK_NAME = 'dcorch-audit'
def get_os_client(region, region_clients):
def get_master_os_client(region_clients=None):
# Used by the master clients only. The subcloud clients don't need to be
# cached in the openstack driver, because we don't want to hold the admin
# sessions for the subclouds.
try:
os_client = sdk.OptimizedOpenStackDriver(
region_name=region,
region_name=dccommon_consts.CLOUD_0,
region_clients=region_clients)
except Exception as e:
LOG.error(
f"Failed to get os_client for {region}/{region_clients}: {e}.")
"Failed to get os_client for "
f"{dccommon_consts.CLOUD_0}/{region_clients}: {e}.")
raise e
return os_client

View File

@ -171,11 +171,6 @@ class EngineWorkerClient(object):
self.make_msg('update_subcloud_version',
subcloud_name=subcloud_name, sw_version=sw_version))
def update_subcloud_endpoints(self, ctxt, subcloud_name, endpoints):
return self.cast(ctxt, self.make_msg(
'update_subcloud_endpoints', subcloud_name=subcloud_name,
endpoints=endpoints), fanout=True, version=self.BASE_RPC_API_VERSION)
def update_subcloud_management_ip(self, ctxt, subcloud_name, management_ip):
return self.call(
ctxt,