Platform restore for storage configuration

This commit adds support for platform restore of the storage
configuration using the restore_platform playbook. During restore, after
the storage nodes become available and ceph quorum is formed, the
restored ceph crushmap is loaded through sysinv and OSDs are created by
applying the ceph osd runtime manifests.

Change-Id: I4f3e503acb242556b1f48d7600bcb6358a9d5f8d
Story: 2004761
Task: 36018
Signed-off-by: Wei Zhou <wei.zhou@windriver.com>
Wei Zhou 2019-07-29 12:16:17 -04:00
parent c4578ca90e
commit 7d60b654c8
5 changed files with 61 additions and 114 deletions
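In outline, the change gates crushmap and OSD restore on observed cluster state instead of a dedicated conductor RPC. The sketch below condenses that flow; all function and parameter names are illustrative stand-ins, with sysinv's database and RPC objects replaced by plain callables so it runs on its own.

def clear_restore_task_when_quorum(get_monitors_status, update_backend,
                                   backend_uuid):
    # Clear the ceph backend RESTORE task once monitor quorum is formed,
    # mirroring the host.py hunks below.
    active_mons, required_mons, _names = get_monitors_status()
    if required_mons > active_mons:
        return False  # not enough monitors yet; re-check on a later pass
    update_backend(backend_uuid, {'task': None})
    return True

def create_osds_if_restored(get_osd_stats, apply_osd_manifest, stor_uuids):
    # On storage-host unlock, an empty osdmap marks a restored cluster,
    # so OSDs are created by applying the ceph osd runtime manifests.
    if int(get_osd_stats()['num_osds']) == 0:
        for stor_uuid in stor_uuids:
            apply_osd_manifest(stor_uuid, runtime_manifests=True)

# Example wiring with stubbed dependencies:
clear_restore_task_when_quorum(
    lambda: (3, 2, ['controller-0', 'controller-1', 'storage-0']),
    lambda uuid, patch: print('update backend', uuid, patch),
    'backend-uuid')
create_osds_if_restored(
    lambda: {'num_osds': 0},
    lambda uuid, runtime_manifests: print('apply manifest for', uuid),
    ['stor-1', 'stor-2'])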


@@ -4542,47 +4542,45 @@ class HostController(rest.RestController):
})
elif backend.task == constants.SB_TASK_RESTORE:
ihosts = api.ihost_get_by_personality(
constants.STORAGE
)
ihosts = api.ihost_get_by_personality(constants.STORAGE)
if ihosts:
# TODO (Wei) Need to revisit storage setup.
LOG.info("This is a storage setup. Will need to revisit.")
storage_enabled = 0
for ihost in ihosts:
if ihost.operational == constants.OPERATIONAL_ENABLED:
storage_enabled = storage_enabled + 1
if storage_enabled and storage_enabled == len(ihosts):
LOG.info("All storage hosts are %s. Restore crushmap..." %
constants.OPERATIONAL_ENABLED)
try:
if not pecan.request.rpcapi.restore_ceph_config(
pecan.request.context, after_storage_enabled=True):
raise Exception("restore_ceph_config returned false")
except Exception as e:
raise wsme.exc.ClientSideError(
_("Restore Ceph config failed: %s" % e))
LOG.info("This is a configuration with dedicated storage nodes. "
"Backend task is RESTORE.")
# Check if ceph quorum is formed. If yes, we can clear the restore
# task, so that when storage nodes are unlocked, ceph crushmap will
# be loaded and osds will be created.
active_mons, required_mons, __ = \
self._ceph.get_monitors_status(pecan.request.dbapi)
if required_mons > active_mons:
LOG.info("Not enough monitors yet to restore ceph config.")
else:
# By clearing the ceph backend task to None, osds will be
# created by applying runtime manifests when unlocking
# the storage nodes.
LOG.info("Clear ceph backend task to None as part of "
"storage backend restore.")
api.storage_backend_update(backend.uuid, {'task': None})
elif cutils.is_aio_simplex_system(pecan.request.dbapi):
# For AIO-SX, ceph config restore is done in puppet when ceph
# manifest is applied on first unlock. The
# initial_config_complete flag is set after first unlock.
# Once one controller is up, ceph cluster should be fully
# operational.
LOG.info("This is AIO-SX... Ceph backend task is RESTORE")
LOG.info("This is an all-in-one simplex configuration. "
"Ceph backend task is RESTORE.")
if cutils.is_initial_config_complete():
LOG.info("This is AIO-SX... clear ceph backend task to None")
LOG.info("Clear ceph backend task to None as part of "
"storage backend restore.")
api.storage_backend_update(backend.uuid, {'task': None})
elif cutils.is_aio_duplex_system(pecan.request.dbapi):
# For AIO-DX, ceph config restore is done in puppet when ceph
# manifest is applied on first unlock. The 2nd osd is created
# in puppet when controller-1 is unlocked. Once both
# controllers are up, Ceph cluster should be fully operational.
LOG.info("This is AIO-DX... Ceph backend task is RESTORE")
c_hosts = api.ihost_get_by_personality(
constants.CONTROLLER
)
LOG.info("This is an all-in-one duplex configuration. "
"Ceph backend task is RESTORE.")
c_hosts = api.ihost_get_by_personality(constants.CONTROLLER)
ctlr_enabled = 0
for c_host in c_hosts:
@@ -4590,13 +4588,15 @@ class HostController(rest.RestController):
ctlr_enabled = ctlr_enabled + 1
if ctlr_enabled == len(c_hosts):
LOG.info("This is AIO-DX... clear ceph backend task to None")
LOG.info("Clear ceph backend task to None as part of "
"storage backend restore.")
api.storage_backend_update(backend.uuid, {'task': None})
else:
# This is ceph restore for standard non-storage configuration.
# Ceph config restore is done via sysinv after both ceph
# monitors are available.
LOG.info("This is 2+2... Ceph backend task is RESTORE")
LOG.info("This is a standard configuration without dedicated "
"storage nodes. Ceph backend task is RESTORE.")
active_mons, required_mons, __ = \
self._ceph.get_monitors_status(pecan.request.dbapi)
if required_mons > active_mons:
@@ -4604,7 +4604,8 @@ class HostController(rest.RestController):
else:
# By clearing the ceph backend task to None, osds will be
# created through applying runtime manifests.
LOG.info("This is 2+2... clear ceph backend task to None")
LOG.info("Clear ceph backend task to None as part of "
"storage backend restore.")
api.storage_backend_update(backend.uuid, {'task': None})
# Apply runtime manifests to create OSDs on two controller
@@ -5389,11 +5390,18 @@ class HostController(rest.RestController):
"enabled.") %
(num_monitors, required_monitors))
# Check Ceph configuration; if it was wiped out (in the Backup & Restore
# process), then restore the configuration.
try:
if not pecan.request.rpcapi.restore_ceph_config(pecan.request.context):
raise Exception()
# If the osdmap is empty, which is the restore case, then create osds.
osd_stats = ceph_helper.get_osd_stats()
if int(osd_stats['num_osds']) == 0:
i_host = pecan.request.dbapi.ihost_get(ihost['uuid'])
runtime_manifests = True
for stor in istors:
pecan.request.rpcapi.update_ceph_osd_config(
pecan.request.context,
i_host,
stor.uuid,
runtime_manifests)
except Exception:
raise wsme.exc.ClientSideError(
_("Restore Ceph config failed. Retry unlocking storage node."))


@@ -19,6 +19,7 @@ import pecan
import requests
from cephclient import wrapper as ceph
from requests.exceptions import ReadTimeout
from sysinv.common import constants
from sysinv.common import exception
@@ -476,6 +477,22 @@ class CephApiOperator(object):
return rc
def get_osd_stats(self, timeout=30):
try:
resp, body = self._ceph_api.osd_stat(body='json',
timeout=timeout)
except ReadTimeout as e:
resp = type('Response', (),
dict(ok=False,
reason=('Ceph API osd_stat() timeout '
'after {} seconds').format(timeout)))
if not resp.ok:
e = exception.CephGetOsdStatsFailure(reason=resp.reason)
LOG.error(e)
raise e
else:
return body["output"]
def _osd_quorum_names(self, timeout=10):
quorum_names = []
try:
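A minimal caller of the new get_osd_stats() helper might look as follows. Only num_osds is used by this commit; the other counters are what `ceph osd stat -f json` normally reports and are assumptions here, not part of the diff. The stub class stands in for a constructed CephApiOperator so the example runs without a cluster.

class _FakeOperator(object):
    # Stub returning a canned `ceph osd stat`-style dict.
    def get_osd_stats(self, timeout=30):
        return {'num_osds': 0, 'num_up_osds': 0, 'num_in_osds': 0}

stats = _FakeOperator().get_osd_stats(timeout=30)
print(stats['num_osds'])         # 0 right after a restore, before OSD creation
print(stats.get('num_up_osds'))  # assumed field: OSDs currently up
print(stats.get('num_in_osds'))  # assumed field: OSDs currently in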


@@ -331,68 +331,6 @@ class CephOperator(object):
LOG.info("osdmap is rebuilt.")
return True
def restore_ceph_config(self, after_storage_enabled=False):
"""Restore Ceph configuration during Backup and Restore process.
:returns: return True if restore is successful or no need to restore
"""
# Check to make sure that the ceph manager has seen a valid Ceph REST
# API response. If not, then we don't have a quorum and attempting to
# restore the crushmap is a useless act. On a restore we may have
# powered-off, yet-to-be-installed storage hosts that have an operational
# enabled state (i.e. a false positive), which gets us to this restore
# function.
if not self.ceph_manager_sees_cluster_up():
LOG.info('Aborting crushmap restore. The cluster has yet to be '
'recognized as operational.')
return False
# TODO (Wei): This function is not invoked during AIO system restore.
# It will be revisited in the non-AIO system restore tasks.
try:
backup = os.path.join(constants.SYSINV_CONFIG_PATH,
constants.CEPH_CRUSH_MAP_BACKUP)
if os.path.exists(backup):
out, err = cutils.trycmd(
'ceph', 'osd', 'setcrushmap',
'-i', backup,
discard_warnings=True)
if err != '':
LOG.warn(_('Failed to restore Ceph crush map. '
'Reason: stdout={}, stderr={}').format(out, err))
return False
else:
os.unlink(backup)
crushmap_flag_file = os.path.join(constants.SYSINV_CONFIG_PATH,
constants.CEPH_CRUSH_MAP_APPLIED)
try:
open(crushmap_flag_file, "w").close()
except IOError as e:
LOG.warn(_('Failed to create flag file: {}. '
'Reason: {}').format(crushmap_flag_file, e))
except OSError as e:
LOG.warn(_('Failed to restore Ceph crush map. '
'Reason: {}').format(e))
return False
if after_storage_enabled:
StorageBackendConfig.update_backend_states(
self._db_api,
constants.CINDER_BACKEND_CEPH,
task=constants.SB_TASK_NONE
)
return True
# Check if the osdmap is empty as an indication of the Backup and
# Restore case where the ceph config needs to be restored.
osd_stats = self.get_osd_stats()
if int(osd_stats['num_osds']) > 0:
return True
LOG.info("osdmap is empty, restoring Ceph config...")
return self.rebuild_osdmap()
# TODO(CephPoolsDecouple): remove
def _pool_create(self, name, pg_num, pgp_num, ruleset,
size, min_size):
@@ -949,6 +887,9 @@ class CephOperator(object):
else:
return body["output"]["pools"]
# TODO(CephPoolsDecouple): remove
# This function is only called from audit_osd_quotas_for_tier() which
# will be removed by CephPoolsDecouple.
def get_osd_stats(self, timeout=30):
try:
resp, body = self._ceph_api.osd_stat(body='json',
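The conductor-side restore deleted above ultimately came down to one Ceph CLI step, now expected to happen from puppet on first unlock per the comments in the first file. As a stand-alone illustration, the equivalent of what the removed code ran via cutils.trycmd is sketched below; the backup path is hypothetical (the real one is built from constants.SYSINV_CONFIG_PATH and constants.CEPH_CRUSH_MAP_BACKUP).

import os
import subprocess

# Illustrative only: load a backed-up crushmap into the cluster, as the
# removed restore_ceph_config() did.
backup = '/opt/platform/sysinv/crushmap.bin.backup'  # hypothetical path

if os.path.exists(backup):
    proc = subprocess.run(['ceph', 'osd', 'setcrushmap', '-i', backup],
                          capture_output=True, text=True)
    if proc.returncode == 0:
        os.unlink(backup)  # one-shot: drop the backup once applied
    else:
        print('setcrushmap failed: ' + proc.stderr)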


@@ -5146,15 +5146,6 @@ class ConductorManager(service.PeriodicService):
# Not sure yet what the proper response is here
pass
def restore_ceph_config(self, context, after_storage_enabled=False):
"""Restore Ceph configuration during Backup and Restore process.
:param context: request context.
:returns: return True if restore is successful or no need to restore
"""
return self._ceph.restore_ceph_config(
after_storage_enabled=after_storage_enabled)
def get_ceph_pool_replication(self, context, ceph_backend=None):
"""Get ceph storage backend pool replication parameters


@@ -612,16 +612,6 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
self.make_msg('unconfigure_osd_istor',
istor_obj=istor_obj))
def restore_ceph_config(self, context, after_storage_enabled=False):
"""Restore Ceph configuration during Backup and Restore process.
:param context: request context.
:returns: return True if restore is successful or no need to restore
"""
return self.call(context,
self.make_msg('restore_ceph_config',
after_storage_enabled=after_storage_enabled))
def get_ceph_pool_replication(self, context, ceph_backend=None):
"""Get ceph storage backend pool replication parameters