Files
distcloud/distributedcloud/dcmanager/orchestrator/states/upgrade/pre_check.py
Jessica Castelino eb97f4c8b6 Move dcmanager orchestration to a separate process
1) Remove DC manager orchestration from dcmanager-manager process
2) Create dcmanager-orchestrator process and associated files
3) Add new RPC calls for dcmanager-orchestrator process to notify
dcmanager
4) Create/update unit tests, to verify the implementation
changes

Story: 2007267
Task: 40734
Change-Id: Ibbbae77558a8a8fd95b636fa6c3aebb1dfefb514
Signed-off-by: Jessica Castelino <jessica.castelino@windriver.com>
2020-09-14 11:17:06 -04:00

166 lines
7.1 KiB
Python

#
# Copyright (c) 2020 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
import re
from dccommon.drivers.openstack.sysinv_v1 import HOST_FS_NAME_SCRATCH
from dcmanager.common import consts
from dcmanager.common.exceptions import ManualRecoveryRequiredException
from dcmanager.common.exceptions import PreCheckFailedException
from dcmanager.db import api as db_api
from dcmanager.orchestrator.states.base import BaseState
# Deploy states from which the subcloud should move to the 'upgrading' state
VALID_UPGRADE_STATES = [
    consts.DEPLOY_STATE_PRE_INSTALL_FAILED,
    consts.DEPLOY_STATE_INSTALL_FAILED,
    consts.DEPLOY_STATE_DATA_MIGRATION_FAILED,
]

# Deploy states from which the subcloud should move to the
# 'migrating_data' state
VALID_MIGRATE_DATA_STATES = [
    consts.DEPLOY_STATE_INSTALLED,
]

# Deploy states from which the subcloud should move to the
# 'activating_upgrade' state
VALID_ACTIVATION_STATES = [
    consts.DEPLOY_STATE_MIGRATED,
]

# Smallest acceptable size of the controller-0 scratch filesystem; the
# pre-check fails when the reported size is below this value.
MIN_SCRATCH_SIZE_REQUIRED_GB = 16
class PreCheckState(BaseState):
    """Upgrade state that performs entry checks and picks the next state.

    The default next state is 'installing license'; depending on the
    subcloud's availability and deploy_status the state may instead skip
    ahead to upgrading, migrating data or activating the upgrade.
    """

    def __init__(self, region_name):
        super(PreCheckState, self).__init__(
            next_state=consts.STRATEGY_STATE_INSTALLING_LICENSE,
            region_name=region_name)

    def _perform_subcloud_online_checks(self, strategy_step, subcloud):
        """Run the checks that require querying an online subcloud.

        Verifies the subcloud's system health and the size of the scratch
        filesystem on controller-0.

        :param strategy_step: the strategy step being processed
        :param subcloud: the subcloud database record; its deploy_status
            is reported if manual recovery is required
        :raises ManualRecoveryRequiredException: if the subcloud sysinv
            client cannot be obtained
        :raises PreCheckFailedException: if the health check fails or the
            scratch filesystem is smaller than
            MIN_SCRATCH_SIZE_REQUIRED_GB
        """
        # obtain necessary clients
        try:
            subcloud_sysinv_client = self.get_sysinv_client(
                strategy_step.subcloud.name)
        except Exception:
            # If getting the token times out, the orchestrator may have
            # restarted and the subcloud may be offline. The online checks
            # cannot be performed, so log the failure and request manual
            # recovery, reporting the persisted deploy_status.
            message = ("_perform_subcloud_online_checks subcloud %s "
                       "failed to get subcloud client" %
                       strategy_step.subcloud.name)
            self.error_log(strategy_step, message)
            raise ManualRecoveryRequiredException(
                subcloud=strategy_step.subcloud.name,
                deploy_status=subcloud.deploy_status)

        # check system health
        #
        # Sample output #1
        # ================
        # Some non-management affecting alarms, all other checks passed
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [OK]
        # No alarms: [Fail]
        # [1] alarms found, [0] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        #
        # Sample output #2
        # ================
        # Multiple failed checks, management affecting alarms
        #
        # System Health:
        # All hosts are provisioned: [OK]
        # All hosts are unlocked/enabled: [OK]
        # All hosts have current configurations: [OK]
        # All hosts are patch current: [OK]
        # Ceph Storage Healthy: [Fail]
        # No alarms: [Fail]
        # [7] alarms found, [2] of which are management affecting
        # All kubernetes nodes are ready: [OK]
        # All kubernetes control plane pods are ready: [OK]
        system_health = subcloud_sysinv_client.get_system_health()

        # Raw strings keep the regex escapes for the literal brackets
        # intact without relying on invalid string escape sequences.
        fails = re.findall(r"\[Fail\]", system_health)
        failed_alarm_check = re.findall(r"No alarms: \[Fail\]",
                                        system_health)
        no_mgmt_alarms = re.findall(
            r"\[0\] of which are management affecting", system_health)

        # The only 2 health conditions acceptable for upgrade are:
        # a) subcloud is completely healthy (i.e. no failed checks)
        # b) subcloud only fails alarm check and it only has non-management
        #    affecting alarm(s)
        if (not fails or
                (len(fails) == 1 and failed_alarm_check and no_mgmt_alarms)):
            self.info_log(strategy_step, "health check passed.")
        else:
            details = "System health check failed. Please run 'system health-query' " \
                      "command on the subcloud for more details."
            # Log the full health report so the failing checks are visible
            self.error_log(strategy_step, "\n" + system_health)
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
            )

        # check scratch
        host = subcloud_sysinv_client.get_host("controller-0")
        scratch_fs = subcloud_sysinv_client.get_host_filesystem(
            host.uuid, HOST_FS_NAME_SCRATCH)
        if scratch_fs.size < MIN_SCRATCH_SIZE_REQUIRED_GB:
            details = ("Scratch filesystem size of %s does not meet "
                       "minimum required %s" %
                       (scratch_fs.size, MIN_SCRATCH_SIZE_REQUIRED_GB))
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details,
            )

    def perform_state_action(self, strategy_step):
        """Check the subcloud and select the state to transition to.

        If the subcloud is online, the online checks are run and the next
        state is 'activating upgrade' when data migration has already
        completed, otherwise the default next state is used.

        If the subcloud is offline, the deploy_status selects the next
        state. An unsupported deploy_status fails the upgrade.

        :param strategy_step: the strategy step being processed
        :returns: the next state to transition to
        :raises PreCheckFailedException: if data_install values are
            missing or an online check fails
        :raises ManualRecoveryRequiredException: if the subcloud client
            cannot be obtained, or the subcloud is offline with an
            un-handled deploy_status
        """
        subcloud = db_api.subcloud_get(self.context, strategy_step.subcloud.id)

        # check presence of data_install values. These are managed
        # semantically on subcloud add or update
        if not subcloud.data_install:
            details = ("Data install values are missing and must be updated "
                       "via dcmanager subcloud update")
            raise PreCheckFailedException(
                subcloud=strategy_step.subcloud.name,
                details=details)

        if subcloud.availability_status == consts.AVAILABILITY_ONLINE:
            self._perform_subcloud_online_checks(strategy_step, subcloud)
            # If the subcloud has completed data migration and is online,
            # advance directly to activating upgrade step. Otherwise, start
            # from installing license step.
            if subcloud.deploy_status == consts.DEPLOY_STATE_MIGRATED:
                self.override_next_state(
                    consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            return self.next_state

        # it is offline.
        if subcloud.deploy_status in VALID_UPGRADE_STATES:
            self.override_next_state(consts.STRATEGY_STATE_UPGRADING_SIMPLEX)
            return self.next_state
        elif subcloud.deploy_status in VALID_MIGRATE_DATA_STATES:
            self.override_next_state(consts.STRATEGY_STATE_MIGRATING_DATA)
            return self.next_state
        elif subcloud.deploy_status in VALID_ACTIVATION_STATES:
            self.override_next_state(consts.STRATEGY_STATE_ACTIVATING_UPGRADE)
            return self.next_state

        # FAIL: We are offline and encountered an un-recoverable deploy status
        self.info_log(strategy_step,
                      "Un-handled deploy_status: %s" % subcloud.deploy_status)
        raise ManualRecoveryRequiredException(
            subcloud=strategy_step.subcloud.name,
            deploy_status=subcloud.deploy_status)