Fix lifecycle pre apply checks
When one host has its availability changed, the application lifecycle
framework will call a hook with an 'evaluate-reapply' operation and
with the 'relative_timing' field missing.
This hook should block the 'apply' and 'evaluate-reapply' operations
if Ceph is unresponsive. Not blocking these operations will lead to a
scenario where the ceph-csi configuration gets broken and the PVCs
fail to attach, mount and umount.
Example of hook_info dict when the 'evaluate-reapply' operation is
started:
{'mode': 'auto', 'lifecycle_type': 'check', 'operation': \
'evaluate-reapply', 'extra': {'trigger': {'type': \
'host-availability-updated', 'availability': 'available'}}}
Example of hook_info dict when the 'apply' operation is started:
{'mode': 'auto', 'lifecycle_type': 'check', 'relative_timing': 'pre',\
'operation': 'apply', 'extra': {}}
This change fixes the semantic check for 'evaluate-reapply' and
'apply' operations, considering the 'relative_timing' field.
Test-Plan:
PASS: On AIO-DX restart the standby controller and when it is back
'available' verify if the auto reapply is blocked if Ceph
status is not HEALTH_OK
PASS: Verify application update is working
PASS: Verify application auto apply is working
PASS: Add a new storage tier and verify the application is
re-applied correctly.
PASS: Manual and auto application apply is blocked when Ceph status
is not HEALTH_OK
Closes-bug: 2097570
Change-Id: I09bc9b0bfe53ddae3008ae164a177f3ea5a6ae63
Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@@ -153,6 +153,8 @@ class CephFSProvisionerHelm(base.FluxCDBaseHelm):
|
||||
# Get tier info.
|
||||
tiers = self.dbapi.storage_tier_get_list()
|
||||
cluster_id = cutils.get_ceph_fsid()
|
||||
if not cluster_id:
|
||||
raise Exception("Could not identify Ceph cluster fsid. Try again when ceph cli is responsive.")
|
||||
storage_classes = []
|
||||
|
||||
for bk in ceph_bks:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2020-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2020-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@@ -71,6 +71,8 @@ class RbdProvisionerHelm(base.FluxCDBaseHelm):
|
||||
# Get tier info.
|
||||
tiers = self.dbapi.storage_tier_get_list()
|
||||
cluster_id = cutils.get_ceph_fsid()
|
||||
if not cluster_id:
|
||||
raise Exception("Could not identify Ceph cluster fsid. Try again when ceph cli is responsive.")
|
||||
storage_classes = []
|
||||
|
||||
for bk in ceph_bks:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2021-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2021-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@@ -12,6 +12,7 @@
|
||||
# pylint: disable=no-member
|
||||
# pylint: disable=no-name-in-module
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
from oslo_log import log as logging
|
||||
from sysinv.common import constants
|
||||
@@ -38,11 +39,15 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
"""
|
||||
# Semantic checks
|
||||
if hook_info.lifecycle_type == LifecycleConstants.APP_LIFECYCLE_TYPE_SEMANTIC_CHECK:
|
||||
if hook_info.mode == LifecycleConstants.APP_LIFECYCLE_MODE_AUTO and \
|
||||
((hook_info.operation == constants.APP_APPLY_OP and
|
||||
hook_info.relative_timing == LifecycleConstants.APP_LIFECYCLE_TIMING_PRE) or
|
||||
hook_info.mode == constants.APP_EVALUATE_REAPPLY_OP):
|
||||
return self.pre_auto_apply_check(conductor_obj)
|
||||
# The kube_app logic does not send the hook_info.relative_timing value
|
||||
# when this is an APP_EVALUATE_REAPPLY_OP operation.
|
||||
# Therefore, check the hook_info.operation first and validate if the
|
||||
# relative_timing is provided. If it is not, run the pre-apply checks.
|
||||
if hook_info.operation in [constants.APP_APPLY_OP,
|
||||
constants.APP_EVALUATE_REAPPLY_OP]:
|
||||
if "relative_timing" not in hook_info or \
|
||||
hook_info.relative_timing == LifecycleConstants.APP_LIFECYCLE_TIMING_PRE:
|
||||
return self.pre_apply_check(conductor_obj)
|
||||
|
||||
# Rbd
|
||||
elif hook_info.lifecycle_type == LifecycleConstants.APP_LIFECYCLE_TYPE_RBD:
|
||||
@@ -67,14 +72,15 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
# Use the default behaviour for other hooks
|
||||
super(PlatformAppLifecycleOperator, self).app_lifecycle_actions(context, conductor_obj, app_op, app, hook_info)
|
||||
|
||||
def pre_auto_apply_check(self, conductor_obj):
|
||||
""" Semantic check for auto-apply
|
||||
def pre_apply_check(self, conductor_obj):
|
||||
""" Semantic check for apply
|
||||
|
||||
Check:
|
||||
- ceph access
|
||||
- ceph health
|
||||
- crushmap applied
|
||||
- replica count is non-zero so that manifest apply will not timeout
|
||||
- ceph cli is responsive as it will be used by the application during the apply
|
||||
|
||||
:param conductor_obj: conductor object
|
||||
|
||||
@@ -96,7 +102,7 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
"CephOperator is not initialized yet")
|
||||
if not conductor_obj._ceph.have_ceph_monitor_access():
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Monitor access error")
|
||||
"Ceph monitor is unreacheable")
|
||||
if not conductor_obj._ceph.ceph_status_ok():
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Ceph status is not HEALTH_OK")
|
||||
@@ -110,6 +116,13 @@ class PlatformAppLifecycleOperator(base.AppLifecycleOperator):
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Not enough hosts in desired state")
|
||||
|
||||
# Check if ceph cli is responsive.
|
||||
ceph_fsid_cmd = ["timeout", "10", "ceph", "fsid"]
|
||||
result = subprocess.run(ceph_fsid_cmd, check=False)
|
||||
if (result.returncode != 0):
|
||||
raise exception.LifecycleSemanticCheckException(
|
||||
"Ceph CLI is not responsive")
|
||||
|
||||
def pre_apply(self, app_op, app, hook_info):
|
||||
"""Pre Apply actions
|
||||
|
||||
|
||||
Reference in New Issue
Block a user