Preserving the current states of Ceph SM services
The ceph-osd and ceph-mon SM services have been presenting a state
mismatch in some network recovery scenarios, where they are expected
to be disabled by the SM, but are reported as enabled-active based on Ceph
availability.
In this change, the proposal is to save the current SM state in the
/var/run/ceph/.sm-ceph-mon-state and /var/run/ceph/.sm-ceph-osd-state
files based on Start and Stop actions, and return disabled only if
the saved state is Stopped. In other cases, the ceph-init-wrapper script
will process the status as before.
In the ceph-init-wrapper script, we are limiting the current state
update to executions called by the SM based on the parent process.
This prevents external uses of the script, such as manual interventions and
PMON calls from affecting the behavior of SM, which is only used
in AIO-DX setups.
In addition, the flag used by the ceph-storage-network script has been
renamed to maintain a pattern across created flags.
This change is part of a solution to avoid the scenario where
there is no active controller after a network recovery. From a storage
perspective, all services are responding accordingly to SM requests.
Other solutions for SM, mtcClient, or other services need to be
addressed in future investigations.
Test Plan:
PASS: Fresh install for AIO-SX and AIO-DX.
PASS: In virtual environments, simulate switch failure shutting down
all interfaces on both controllers at the same time and check
that the Ceph service states are not mismatched.
PASS: On AIO-DX, execute host-swact operations successfully.
PASS: On AIO-DX, simulate BMC shutdown for standby controller and
check that the Ceph service states are correct after booting.
PASS: On AIO-DX, simulate BMC shutdown for active controller and
check that the Uncontrolled Swact happens successfully and
that the Ceph services are correct after booting.
PASS: On AIO-DX, simulate DOR scenario shutting down both controllers
at the same time. Check that the Ceph services are correct
after booting.
Closes-bug: 2122117
Change-Id: Iafc0e30a441b1975ccfb98c16c4b30a53383d83e
Signed-off-by: Hediberto C Silva <Hediberto.CavalcantedaSilva@windriver.com>
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2019-2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2019-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@@ -76,6 +76,14 @@ LOCK_CEPH_OSD_STATUS_FILE="$VOLATILE_PATH/.ceph_osd_service"
|
||||
MONITOR_STATUS_TIMEOUT=30
|
||||
MAX_STATUS_TIMEOUT=120
|
||||
|
||||
STATE_RUNNING="Running"
|
||||
STATE_STOPPED="Stopped"
|
||||
|
||||
SM_CEPH_MON_STATE_FILE="$VOLATILE_PATH/ceph/.sm-ceph-mon-state"
|
||||
SM_CEPH_OSD_STATE_FILE="$VOLATILE_PATH/ceph/.sm-ceph-osd-state"
|
||||
SM_CEPH_MON_CURRENT_STATE=$(cat ${SM_CEPH_MON_STATE_FILE} 2>/dev/null)
|
||||
SM_CEPH_OSD_CURRENT_STATE=$(cat ${SM_CEPH_OSD_STATE_FILE} 2>/dev/null)
|
||||
|
||||
RC=0
|
||||
|
||||
# SM can only pass arguments through environment variable
|
||||
@@ -91,6 +99,63 @@ else
|
||||
IFS=" " read -r -a args <<< "$@"
|
||||
fi
|
||||
|
||||
# Check whether this script was launched directly by the "sm" process.
#
# Reads the command name of the parent process from /proc/${PPID}/comm
# and compares it with "sm".
#
# Returns:
#   0 if the parent process command name is exactly "sm"
#   1 otherwise, including when the /proc entry cannot be read
is_ppid_sm()
{
    local ppid_name=""

    # Use the read builtin instead of forking cat. The /proc entry can be
    # unreadable (e.g. the parent has already exited); that case is
    # deliberately silenced and treated as "not sm".
    { IFS= read -r ppid_name < "/proc/${PPID}/comm"; } 2>/dev/null

    [[ "${ppid_name}" == "sm" ]]
}
|
||||
|
||||
# Save service current state
#
# Records the given state in the given state file and mirrors it in the
# matching SM_CEPH_*_CURRENT_STATE variable. The update is only performed
# when the script was called by SM (see is_ppid_sm); any other caller only
# gets a warning logged.
#
# Globals:
#   SM_CEPH_MON_STATE_FILE, SM_CEPH_OSD_STATE_FILE        (read)
#   SM_CEPH_MON_CURRENT_STATE, SM_CEPH_OSD_CURRENT_STATE  (read, written)
# Arguments:
#   $1 - path of the state file to update
#   $2 - new state to record
# Returns:
#   0 if the state was already recorded
#   1 if the state file could not be written
#   otherwise the status of the final log call
save_state()
{
    local state_file="$1"
    local new_state="$2"
    local ppid_name=""

    if ! is_ppid_sm; then
        # Only used to make the warning informative; an unreadable /proc
        # entry simply leaves the name empty.
        { IFS= read -r ppid_name < "/proc/${PPID}/comm"; } 2>/dev/null
        log WARN "Cannot save state to ${state_file}. Expected 'sm' ppid, found '${ppid_name}'."
        return
    fi

    # Skip the write when the cached state already matches the request.
    if [[ ${state_file} == "${SM_CEPH_MON_STATE_FILE}" ]]; then
        [[ ${new_state} == "${SM_CEPH_MON_CURRENT_STATE}" ]] && return 0
    elif [[ ${state_file} == "${SM_CEPH_OSD_STATE_FILE}" ]]; then
        [[ ${new_state} == "${SM_CEPH_OSD_CURRENT_STATE}" ]] && return 0
    fi

    # Persist first: the cached value and the "Updating" log entry must not
    # claim a state that never reached the file.
    if ! printf '%s\n' "${new_state}" > "${state_file}"; then
        log WARN "Failed to write state ${new_state} to ${state_file}."
        return 1
    fi

    if [[ ${state_file} == "${SM_CEPH_MON_STATE_FILE}" ]]; then
        SM_CEPH_MON_CURRENT_STATE=${new_state}
    elif [[ ${state_file} == "${SM_CEPH_OSD_STATE_FILE}" ]]; then
        SM_CEPH_OSD_CURRENT_STATE=${new_state}
    fi

    log INFO "Updating ${state_file} state to ${new_state}."
}
|
||||
|
||||
# Sanity check for the SM_CEPH_MON_CURRENT_STATE variable
# When called by SM and the recorded ceph-mon state is anything other than
# Running, ask SM for the service state and record Running if it reports
# enabled-active, Stopped otherwise (including when the query fails or
# exceeds the 5 second timeout).
# NOTE(review): save_state calls `log`, whose definition appears later in
# this file than these checks -- confirm `log` is available at this point.
if is_ppid_sm && [[ "${SM_CEPH_MON_CURRENT_STATE}" != "${STATE_RUNNING}" ]]; then
    if timeout 5 sm-query service ceph-mon 2>/dev/null | grep -q enabled-active; then
        save_state "${SM_CEPH_MON_STATE_FILE}" "${STATE_RUNNING}"
    else
        save_state "${SM_CEPH_MON_STATE_FILE}" "${STATE_STOPPED}"
    fi
fi

# Sanity check for the SM_CEPH_OSD_CURRENT_STATE variable
# Same reconciliation as above, applied to the ceph-osd service.
if is_ppid_sm && [[ "${SM_CEPH_OSD_CURRENT_STATE}" != "${STATE_RUNNING}" ]]; then
    if timeout 5 sm-query service ceph-osd 2>/dev/null | grep -q enabled-active; then
        save_state "${SM_CEPH_OSD_STATE_FILE}" "${STATE_RUNNING}"
    else
        save_state "${SM_CEPH_OSD_STATE_FILE}" "${STATE_STOPPED}"
    fi
fi
|
||||
|
||||
# Log Management
|
||||
# Adding PID and PPID information
|
||||
log () {
|
||||
@@ -346,6 +411,13 @@ start ()
|
||||
|
||||
# Start the service
|
||||
with_service_lock "${service}" ${CEPH_SCRIPT} start ${service}
|
||||
|
||||
if [ "$RC" -eq 0 ] && [[ "${service}" == *"mon"* ]]; then
|
||||
save_state ${SM_CEPH_MON_STATE_FILE} ${STATE_RUNNING}
|
||||
elif [ "$RC" -eq 0 ] && [[ "${service}" == *"osd"* ]]; then
|
||||
save_state ${SM_CEPH_OSD_STATE_FILE} ${STATE_RUNNING}
|
||||
fi
|
||||
|
||||
log INFO "Ceph START ${service} command finished."
|
||||
}
|
||||
|
||||
@@ -368,6 +440,13 @@ stop ()
|
||||
fi
|
||||
|
||||
with_service_lock "${service}" ${CEPH_SCRIPT} ${cmd} ${service}
|
||||
|
||||
if [ "$RC" -eq 0 ] && [[ "${service}" == *"mon"* ]]; then
|
||||
save_state ${SM_CEPH_MON_STATE_FILE} ${STATE_STOPPED}
|
||||
elif [ "$RC" -eq 0 ] && [[ "${service}" == *"osd"* ]]; then
|
||||
save_state ${SM_CEPH_OSD_STATE_FILE} ${STATE_STOPPED}
|
||||
fi
|
||||
|
||||
log INFO "Ceph ${cmd^^} ${service} command finished."
|
||||
}
|
||||
|
||||
@@ -448,12 +527,26 @@ status ()
|
||||
eval target="$target"
|
||||
[ -z "${target}" ] && target="mon osd"
|
||||
|
||||
log INFO "status ${target}"
|
||||
|
||||
if [ ! -f ${CEPH_FILE} ]; then
|
||||
# Ceph is not running on this node, return success
|
||||
log INFO "Ceph is not running on this node, returning success."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
log INFO "status ${target}";
|
||||
if is_ppid_sm; then
|
||||
if [[ "${target}" == *"mon"* ]] && \
|
||||
[[ "${SM_CEPH_MON_CURRENT_STATE}" != "${STATE_RUNNING}" ]]; then
|
||||
log INFO "Ceph Mon is masked as ${SM_CEPH_MON_CURRENT_STATE} state, returning exit 1."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ "${target}" == *"osd"* ]] && \
|
||||
[[ "${SM_CEPH_OSD_CURRENT_STATE}" != "${STATE_RUNNING}" ]]; then
|
||||
log INFO "Ceph OSD is masked as ${SM_CEPH_OSD_CURRENT_STATE} state, returning exit 1."
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$system_type" == "All-in-one" ]] && [[ "$system_mode" != "simplex" ]] && [[ "$target" == "osd" ]]; then
|
||||
has_ceph_network_carrier
|
||||
|
||||
@@ -46,7 +46,7 @@ save_state()
|
||||
}
|
||||
|
||||
RETVAL=0
|
||||
STATE_FILE="/var/run/ceph/.storage-networking-state"
|
||||
STATE_FILE="/var/run/ceph/.sm-storage-networking-state"
|
||||
STATE_RUNNING="Running"
|
||||
STATE_STOPPED="Stopped"
|
||||
CURRENT_STATE=$(cat ${STATE_FILE} 2>/dev/null)
|
||||
|
||||
Reference in New Issue
Block a user