diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py index 945ee1a665..08c30240cf 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py @@ -4542,47 +4542,45 @@ class HostController(rest.RestController): }) elif backend.task == constants.SB_TASK_RESTORE: - ihosts = api.ihost_get_by_personality( - constants.STORAGE - ) + ihosts = api.ihost_get_by_personality(constants.STORAGE) if ihosts: - # TODO (Wei) Need to revisit storage setup. - LOG.info("This is a storage setup. Will need to revisit.") - storage_enabled = 0 - for ihost in ihosts: - if ihost.operational == constants.OPERATIONAL_ENABLED: - storage_enabled = storage_enabled + 1 - - if storage_enabled and storage_enabled == len(ihosts): - LOG.info("All storage hosts are %s. Restore crushmap..." % - constants.OPERATIONAL_ENABLED) - try: - if not pecan.request.rpcapi.restore_ceph_config( - pecan.request.context, after_storage_enabled=True): - raise Exception("restore_ceph_config returned false") - except Exception as e: - raise wsme.exc.ClientSideError( - _("Restore Ceph config failed: %s" % e)) + LOG.info("This is a configuration with dedicated storage nodes. " + "Backend task is RESTORE.") + # Check if ceph quorum is formed. If yes, we can clear the restore + # task, so that when storage nodes are unlocked, ceph crushmap will + # be loaded and osds will be created. + active_mons, required_mons, __ = \ + self._ceph.get_monitors_status(pecan.request.dbapi) + if required_mons > active_mons: + LOG.info("Not enough monitors yet to restore ceph config.") + else: + # By clearing ceph backend task to None osds will be + # created by applying runtime manifests when unlocking + # the storage nodes. + LOG.info("Clear ceph backend task to None as part of " + "storage backend restore.") + api.storage_backend_update(backend.uuid, {'task': None}) elif cutils.is_aio_simplex_system(pecan.request.dbapi): # For AIO-SX, ceph config restore is done in puppet when ceph # manifest is applied on first unlock. The # initial_config_complete flag is set after first unlock. # Once one controller is up, ceph cluster should be fully # operational. - LOG.info("This is AIO-SX... Ceph backend task is RESTORE") + LOG.info("This is an all-in-one simplex configuration. " + "Ceph backend task is RESTORE.") if cutils.is_initial_config_complete(): - LOG.info("This is AIO-SX... clear ceph backend task to None") + LOG.info("Clear ceph backend task to None as part of " + "storage backend restore.") api.storage_backend_update(backend.uuid, {'task': None}) elif cutils.is_aio_duplex_system(pecan.request.dbapi): # For AIO-DX, ceph config restore is done in puppet when ceph # manifest is applied on first unlock. The 2nd osd is created # in puppet when controller-1 is unlocked. Once both # controllers are up, Ceph cluster should be fully operational. - LOG.info("This is AIO-DX... Ceph backend task is RESTORE") - c_hosts = api.ihost_get_by_personality( - constants.CONTROLLER - ) + LOG.info("This is an all-in-one duplex configuration. " + "Ceph backend task is RESTORE.") + c_hosts = api.ihost_get_by_personality(constants.CONTROLLER) ctlr_enabled = 0 for c_host in c_hosts: @@ -4590,13 +4588,15 @@ class HostController(rest.RestController): ctlr_enabled = ctlr_enabled + 1 if ctlr_enabled == len(c_hosts): - LOG.info("This is AIO-DX... 
clear ceph backend task to None") + LOG.info("Clear ceph backend task to None as part of " + "storage backend restore.") api.storage_backend_update(backend.uuid, {'task': None}) else: # This is ceph restore for standard non-storage configuration. # Ceph config restore is done via sysinv after both ceph # monitors are available. - LOG.info("This is 2+2... Ceph backend task is RESTORE") + LOG.info("This is a standard configuration without dedicated " + "storage nodes. Ceph backend task is RESTORE.") active_mons, required_mons, __ = \ self._ceph.get_monitors_status(pecan.request.dbapi) if required_mons > active_mons: @@ -4604,7 +4604,8 @@ class HostController(rest.RestController): else: # By clearing ceph backend task to None osds will be # created thru applying runtime manifests. - LOG.info("This is 2+2... clear ceph backend task to None") + LOG.info("Clear ceph backend task to None as part of " + "storage backend restore.") api.storage_backend_update(backend.uuid, {'task': None}) # Apply runtime manifests to create OSDs on two controller @@ -5389,11 +5390,18 @@ class HostController(rest.RestController): "enabled.") % (num_monitors, required_monitors)) - # Check Ceph configuration, if it is wiped out (in the Backup & Restore - # process) then restore the configuration. try: - if not pecan.request.rpcapi.restore_ceph_config(pecan.request.context): - raise Exception() + # If osdmap is empty which is the restore case, then create osds. + osd_stats = ceph_helper.get_osd_stats() + if int(osd_stats['num_osds']) == 0: + i_host = pecan.request.dbapi.ihost_get(ihost['uuid']) + runtime_manifests = True + for stor in istors: + pecan.request.rpcapi.update_ceph_osd_config( + pecan.request.context, + i_host, + stor.uuid, + runtime_manifests) except Exception: raise wsme.exc.ClientSideError( _("Restore Ceph config failed. Retry unlocking storage node.")) diff --git a/sysinv/sysinv/sysinv/sysinv/common/ceph.py b/sysinv/sysinv/sysinv/sysinv/common/ceph.py index 3242c5f184..c6fedb2c3f 100644 --- a/sysinv/sysinv/sysinv/sysinv/common/ceph.py +++ b/sysinv/sysinv/sysinv/sysinv/common/ceph.py @@ -19,6 +19,7 @@ import pecan import requests from cephclient import wrapper as ceph +from requests.exceptions import ReadTimeout from sysinv.common import constants from sysinv.common import exception @@ -476,6 +477,22 @@ class CephApiOperator(object): return rc + def get_osd_stats(self, timeout=30): + try: + resp, body = self._ceph_api.osd_stat(body='json', + timeout=timeout) + except ReadTimeout as e: + resp = type('Response', (), + dict(ok=False, + reason=('Ceph API osd_stat() timeout ' + 'after {} seconds').format(timeout))) + if not resp.ok: + e = exception.CephGetOsdStatsFailure(reason=resp.reason) + LOG.error(e) + raise e + else: + return body["output"] + def _osd_quorum_names(self, timeout=10): quorum_names = [] try: diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py b/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py index 7c128e8b6e..816ba2a7c7 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/ceph.py @@ -331,68 +331,6 @@ class CephOperator(object): LOG.info("osdmap is rebuilt.") return True - def restore_ceph_config(self, after_storage_enabled=False): - """Restore Ceph configuration during Backup and Restore process. - - :returns: return True if restore is successful or no need to restore - """ - # Check to make sure that the ceph manager has seen a valid Ceph REST - # API response. 
If not, then we don't have a quorum and attempting to - # restore the crushmap is a useless act. On a restore we may have - # powered off yet to be installed storage hosts that have an operational - # enabled state (i.e. a false positive) which gets us to this restore - # function. - - if not self.ceph_manager_sees_cluster_up(): - LOG.info('Aborting crushmap restore.The cluster has yet to be ' - 'recognized as operational.') - return False - - # TODO (Wei): This function is not invoked during AIO system restore. - # It will be revisited in the non-AIO system restore tasks. - try: - backup = os.path.join(constants.SYSINV_CONFIG_PATH, - constants.CEPH_CRUSH_MAP_BACKUP) - if os.path.exists(backup): - out, err = cutils.trycmd( - 'ceph', 'osd', 'setcrushmap', - '-i', backup, - discard_warnings=True) - if err != '': - LOG.warn(_('Failed to restore Ceph crush map. ' - 'Reason: stdout={}, stderr={}').format(out, err)) - return False - else: - os.unlink(backup) - crushmap_flag_file = os.path.join(constants.SYSINV_CONFIG_PATH, - constants.CEPH_CRUSH_MAP_APPLIED) - try: - open(crushmap_flag_file, "w").close() - except IOError as e: - LOG.warn(_('Failed to create flag file: {}. ' - 'Reason: {}').format(crushmap_flag_file, e)) - except OSError as e: - LOG.warn(_('Failed to restore Ceph crush map. ' - 'Reason: {}').format(e)) - return False - - if after_storage_enabled: - StorageBackendConfig.update_backend_states( - self._db_api, - constants.CINDER_BACKEND_CEPH, - task=constants.SB_TASK_NONE - ) - return True - - # check if osdmap is emtpy as an indication for Backup and Restore - # case where ceph config needs to be restored. - osd_stats = self.get_osd_stats() - if int(osd_stats['num_osds']) > 0: - return True - - LOG.info("osdmap is empty, restoring Ceph config...") - return self.rebuild_osdmap() - # TODO(CephPoolsDecouple): remove def _pool_create(self, name, pg_num, pgp_num, ruleset, size, min_size): @@ -949,6 +887,9 @@ class CephOperator(object): else: return body["output"]["pools"] + # TODO(CephPoolsDecouple): remove + # This function is only called from audit_osd_quotas_for_tier() which + # will be removed by CephPoolsDecouple. def get_osd_stats(self, timeout=30): try: resp, body = self._ceph_api.osd_stat(body='json', diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 9b0a4152a8..9efc03a033 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -5146,15 +5146,6 @@ class ConductorManager(service.PeriodicService): # Not sure yet what the proper response is here pass - def restore_ceph_config(self, context, after_storage_enabled=False): - """Restore Ceph configuration during Backup and Restore process. - - :param context: request context. 
- :returns: return True if restore is successful or no need to restore - """ - return self._ceph.restore_ceph_config( - after_storage_enabled=after_storage_enabled) - def get_ceph_pool_replication(self, context, ceph_backend=None): """Get ceph storage backend pool replication parameters diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py index 9c54eba7f0..78f1c92ddc 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py @@ -612,16 +612,6 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy): self.make_msg('unconfigure_osd_istor', istor_obj=istor_obj)) - def restore_ceph_config(self, context, after_storage_enabled=False): - """Restore Ceph configuration during Backup and Restore process. - - :param context: request context. - :returns: return True if restore is successful or no need to restore - """ - return self.call(context, - self.make_msg('restore_ceph_config', - after_storage_enabled=after_storage_enabled)) - def get_ceph_pool_replication(self, context, ceph_backend=None): """Get ceph storage backend pool replication parameters
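
Note (reading aid, not part of the patch): in the host.py hunks above, the restore handling for dedicated-storage and standard configurations now hinges on a single monitor-quorum check before the backend RESTORE task is cleared. The sketch below only illustrates that decision flow under the sysinv runtime shown in the diff; `ceph_op` and `dbapi` stand in for the controller's `self._ceph` (a CephApiOperator) and `pecan.request.dbapi`, and the function name is hypothetical.

    # Minimal sketch of the quorum-gated clearing of the RESTORE task.
    def clear_restore_task_when_quorum_formed(ceph_op, dbapi, backend):
        active_mons, required_mons, __ = ceph_op.get_monitors_status(dbapi)
        if required_mons > active_mons:
            # Quorum not formed yet: keep the RESTORE task so a later
            # unlock repeats this check.
            return False
        # With quorum formed, clearing the task lets the runtime manifests
        # applied on unlock recreate the OSDs.
        dbapi.storage_backend_update(backend.uuid, {'task': None})
        return True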
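
Likewise, the storage-host unlock path no longer calls the removed restore_ceph_config RPC; it inspects the osdmap and, when it is empty (the restore case), asks the conductor to re-apply the OSD configuration per stor through runtime manifests. A sketch of that path under the same assumptions, with `rpcapi` and `context` standing in for `pecan.request.rpcapi` and `pecan.request.context`; the helper name is hypothetical.

    # Sketch of the unlock-time OSD re-creation check for a storage host.
    def recreate_osds_if_osdmap_empty(ceph_op, dbapi, rpcapi, context,
                                      ihost, istors):
        osd_stats = ceph_op.get_osd_stats()
        if int(osd_stats['num_osds']) > 0:
            return  # OSDs already present; nothing to restore.
        host = dbapi.ihost_get(ihost['uuid'])
        runtime_manifests = True
        for stor in istors:
            rpcapi.update_ceph_osd_config(context, host, stor.uuid,
                                          runtime_manifests)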
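
Finally, the new CephApiOperator.get_osd_stats() raises exception.CephGetOsdStatsFailure when the Ceph REST call times out or returns a non-OK response; the unlock hunk above catches this through a broad `except Exception` and reports a client-side error. The sketch below shows that mapping with the specific exception named for clarity; it assumes the sysinv/wsme environment of the diff, the helper name is hypothetical, and the error string mirrors the one in the patch.

    import wsme.exc
    from sysinv.common import exception

    # Sketch: surface a Ceph osd_stat failure as an API client error.
    def osd_count_or_client_error(ceph_op):
        try:
            return int(ceph_op.get_osd_stats()['num_osds'])
        except exception.CephGetOsdStatsFailure:
            raise wsme.exc.ClientSideError(
                "Restore Ceph config failed. Retry unlocking storage node.")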