Restore ceph during system restore

On AIO and standard setups, Ceph OSDs run on the
controller nodes. During restore, once Ceph is operational,
the backed-up crushmap must be set before the OSDs are created.

Change-Id: Id2a7c666fa3670c460f412cfc0184ad0a9f6ccff
Story: 2004761
Task: 30522
Signed-off-by: Wei Zhou <wei.zhou@windriver.com>
This commit is contained in:
Wei Zhou 2019-05-09 19:10:45 -04:00
parent 686d10c9cf
commit a03cbf1b67
5 changed files with 93 additions and 41 deletions

View File

@ -236,13 +236,20 @@ class platform::ceph::monitor
# ensure we load the crushmap at first unlock
if $system_type == 'All-in-one' and str2bool($::is_standalone_controller) {
$software_version = $::platform::params::software_version
if 'duplex' in $system_mode {
$crushmap_txt = '/etc/sysinv/crushmap-controller-model.txt'
} else {
$crushmap_txt = '/etc/sysinv/crushmap-aio-sx.txt'
}
$crushmap_bin = '/etc/sysinv/crushmap.bin'
$crushmap_backup_bin = "/opt/platform/sysinv/${software_version}/crushmap.bin.backup"
Ceph::Mon <| |>
-> exec { 'Copy crushmap if backup exists':
command => "cp -f ${crushmap_backup_bin} ${crushmap_bin} && rm -f ${crushmap_backup_bin}",
onlyif => "test -f ${crushmap_backup_bin}",
}
-> exec { 'Compile crushmap':
command => "crushtool -c ${crushmap_txt} -o ${crushmap_bin}",
onlyif => "test ! -f ${crushmap_bin}",
@ -361,32 +368,38 @@ class platform::ceph::osds(
$osd_config = {},
$journal_config = {},
) inherits ::platform::ceph::params {
$system_type = $::platform::params::system_type
file { '/var/lib/ceph/osd':
ensure => 'directory',
path => '/var/lib/ceph/osd',
owner => 'root',
group => 'root',
mode => '0755',
# When applying controller manifest during restore the backed-up
# controller-0 hieradata contains osd info in osd_config but
# osd shouldn't be created as Ceph is down.
if $system_type == 'All-in-one' or ! str2bool($::is_standalone_controller) {
file { '/var/lib/ceph/osd':
ensure => 'directory',
path => '/var/lib/ceph/osd',
owner => 'root',
group => 'root',
mode => '0755',
}
# Ensure ceph.conf is complete before configuring OSDs
Class['::ceph'] -> Platform_ceph_osd <| |>
# Journal disks need to be prepared before the OSDs are configured
Platform_ceph_journal <| |> -> Platform_ceph_osd <| |>
# Crush locations in ceph.conf need to be set before the OSDs are configured
Osd_crush_location <| |> -> Platform_ceph_osd <| |>
# default configuration for all ceph object resources
Ceph::Osd {
cluster => $cluster_name,
cluster_uuid => $cluster_uuid,
}
create_resources('osd_crush_location', $osd_config)
create_resources('platform_ceph_osd', $osd_config)
create_resources('platform_ceph_journal', $journal_config)
}
# Ensure ceph.conf is complete before configuring OSDs
Class['::ceph'] -> Platform_ceph_osd <| |>
# Journal disks need to be prepared before the OSDs are configured
Platform_ceph_journal <| |> -> Platform_ceph_osd <| |>
# Crush locations in ceph.conf need to be set before the OSDs are configured
Osd_crush_location <| |> -> Platform_ceph_osd <| |>
# default configuration for all ceph object resources
Ceph::Osd {
cluster => $cluster_name,
cluster_uuid => $cluster_uuid,
}
create_resources('osd_crush_location', $osd_config)
create_resources('platform_ceph_osd', $osd_config)
create_resources('platform_ceph_journal', $journal_config)
}
class platform::ceph::haproxy

View File

@ -4510,8 +4510,7 @@ class HostController(rest.RestController):
else:
return False
@staticmethod
def _update_add_ceph_state():
def _update_add_ceph_state(self):
api = pecan.request.dbapi
backend = StorageBackendConfig.get_configuring_backend(api)
@ -4619,19 +4618,49 @@ class HostController(rest.RestController):
raise wsme.exc.ClientSideError(
_("Restore Ceph config failed: %s" % e))
elif utils.is_aio_system(pecan.request.dbapi):
# TODO(wz): Need more work to restore ceph for AIO
LOG.info("For an AIO system, Restore crushmap...")
try:
if not pecan.request.rpcapi.restore_ceph_config(
pecan.request.context, after_storage_enabled=True):
raise Exception("restore_ceph_config returned false")
except Exception as e:
raise wsme.exc.ClientSideError(
_("Restore Ceph config failed: %s" % e))
# For AIO, Ceph restore is done in ceph puppet
if not os.path.isfile(tsc.RESTORE_IN_PROGRESS_FLAG):
api.storage_backend_update(backend.uuid, {'task': None})
else:
# TODO(wz): Need more work to restore ceph for 2+2
pass
# This is ceph restore for 2+2.
# If config_controller restore is still in progress, we wait.
if os.path.isfile(tsc.RESTORE_IN_PROGRESS_FLAG):
LOG.info("Restore flag is still on. Do nothing now. ")
return
active_mons, required_mons, __ = \
self._ceph.get_monitors_status(pecan.request.dbapi)
if required_mons > active_mons:
LOG.info("Not enough monitors yet available to fix crushmap.")
else:
LOG.info("Restore Ceph config ...")
# First restore ceph config
try:
if not pecan.request.rpcapi.restore_ceph_config(
pecan.request.context):
raise Exception("restore_ceph_config returned false")
except Exception as e:
raise wsme.exc.ClientSideError(
_("Restore Ceph config failed: %s" % e))
# Set Ceph backend task to None
api.storage_backend_update(backend.uuid, {'task': None})
# Apply runtime manifests for OSDs on two controller nodes.
c_hosts = api.ihost_get_by_personality(
constants.CONTROLLER
)
runtime_manifests = True
for c_host in c_hosts:
istors = pecan.request.dbapi.istor_get_by_ihost(c_host.uuid)
for stor in istors:
pecan.request.rpcapi.update_ceph_osd_config(
pecan.request.context,
c_host,
stor.uuid,
runtime_manifests)
@staticmethod
def update_ihost_action(action, hostupdate):

View File

@ -725,11 +725,12 @@ def fix_crushmap(dbapi=None):
LOG.info("Not enough monitors yet available to fix crushmap.")
return False
# Crushmap may be already loaded thorough puppet, avoid doing it twice.
# Crushmap may be already loaded through puppet, avoid doing it twice.
default_ceph_tier_name = constants.SB_TIER_DEFAULT_NAMES[
constants.SB_TIER_TYPE_CEPH] + constants.CEPH_CRUSH_TIER_SUFFIX
rule_is_present, __, __ = _operator._crush_rule_status(default_ceph_tier_name)
if rule_is_present:
LOG.info("Crushmap is already loaded through puppet.")
_create_crushmap_flag_file()
return False

View File

@ -371,6 +371,7 @@ class CephOperator(object):
except IOError as e:
LOG.warn(_('Failed to create flag file: {}. '
'Reason: {}').format(crushmap_flag_file, e))
LOG.info("Ceph crushmap is set.")
except OSError as e:
LOG.warn(_('Failed to restore Ceph crush map. '
'Reason: {}').format(e))
@ -390,7 +391,7 @@ class CephOperator(object):
if int(osd_stats['num_osds']) > 0:
return True
LOG.info("osdmap is empty, restoring Ceph config...")
LOG.info("osdmap is empty, creating osds...")
return self.rebuild_osdmap()
# TODO(CephPoolsDecouple): remove

View File

@ -157,7 +157,15 @@ class CephPuppet(openstack.OpenstackBasePuppet):
def get_host_config(self, host):
config = {}
if host.personality in [constants.CONTROLLER, constants.STORAGE]:
backend = StorageBackendConfig.get_configured_backend(
self.dbapi,
constants.CINDER_BACKEND_CEPH)
# Do not write osd_config in controller hieradata
# during restore
if host.personality == constants.STORAGE:
config.update(self._get_ceph_osd_config(host))
elif (host.personality == constants.CONTROLLER and
backend.task != constants.SB_TASK_RESTORE):
config.update(self._get_ceph_osd_config(host))
config.update(self._get_ceph_mon_config(host))