Preserving ceph-storage-network current state

The storage-networking SM service has been presenting a state mismatch
in some network recovery scenarios, where it is sometimes expected
to be disabled by the SM, but reports enabled-active based on Ceph's
network availability.

This change saves the current SM state in the
/var/run/ceph/.storage-networking-state file and returns enabled-active
only if the SM service is running and the Ceph network interface
has carrier. In the other cases, it can return disabled without
stopping the Ceph services.

Other code improvements were made, including a reinforcement that the
script should only run on AIO-DX systems.

Test Plan (AIO-DX):
  PASS: Fresh install
  PASS: MTC operations: swact and lock-unlock.
  PASS: On AIO-DX, reproduce a split-brain by shutting down the
        interfaces and powering them back on after 6 min, then confirm
        that the storage-networking service audit succeeds.

Closes-bug: 2111609

Change-Id: Ic799be831bab57a6d45f5306bd337c858b2889f8
Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
This commit is contained in:
Hediberto C Silva
2025-05-19 10:15:05 -03:00
parent e25eb24d6c
commit 08368db377

View File

@ -1,6 +1,6 @@
#!/bin/bash
#
# Copyright (c) 2024 Wind River Systems, Inc.
# Copyright (c) 2024-2025 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
@ -10,7 +10,6 @@
source /etc/platform/platform.conf
CEPH_FILE="/var/run/.ceph_started"
CEPH_SCRIPT="/etc/init.d/ceph-init-wrapper"
source /usr/lib/ceph/ceph_common.sh
@ -40,40 +39,60 @@ log () {
return 0
}
identify_ceph_network_interface() {
if [ "${ceph_network}" == "mgmt" ]; then
ceph_network_interface="${management_interface}"
return 0
fi
if [ "${ceph_network}" == "cluster-host" ]; then
ceph_network_interface="${cluster_host_interface}"
return 0
fi
return 1
################################################################################
# Persist the service state
#   $1 - optional new state; when non-empty it replaces CURRENT_STATE before
#        the value is written.
# Globals: CURRENT_STATE (read, written when $1 is given), STATE_FILE (read)
# Returns: exit status of the write to STATE_FILE
################################################################################
save_state()
{
    if [ -n "$1" ]; then
        CURRENT_STATE="$1"
    fi
    # printf with a quoted argument writes the value verbatim. An unquoted
    # echo would word-split/glob the value and treat values such as "-n" as
    # options. The redirect target is quoted so a path containing whitespace
    # is not rejected as an ambiguous redirect.
    printf '%s\n' "${CURRENT_STATE}" > "${STATE_FILE}"
}
# Exit status the script finishes with; the handlers below may override it.
RETVAL=0

# The two values the persisted state may hold, and the file that holds it.
STATE_RUNNING="Running"
STATE_STOPPED="Stopped"
STATE_FILE="/var/run/ceph/.storage-networking-state"

# Load the last saved state; a missing or unreadable file yields an empty value.
CURRENT_STATE="$(cat "${STATE_FILE}" 2>/dev/null)"

# Sanity check: anything other than "Running" (empty, unexpected content or
# already "Stopped") is normalized to "Stopped" and written back.
[[ "${CURRENT_STATE}" == "${STATE_RUNNING}" ]] || save_state "${STATE_STOPPED}"
################################################################################
# Stop Ceph Services
# Start Service
################################################################################
start()
{
    # Marks the service as running when status() succeeds, otherwise records
    # it as stopped.
    # Globals: CURRENT_STATE, STATE_RUNNING, STATE_STOPPED (read);
    #          STATUS_RETURN, RETVAL (written)
    log INFO "Start ceph-storage-network service"

    # Nothing to do when the saved state already says the service is running.
    if [ "${CURRENT_STATE}" == "${STATE_RUNNING}" ]; then
        return
    fi

    # The exit code of status() decides which state gets recorded.
    status
    STATUS_RETURN=$?

    case "${STATUS_RETURN}" in
        0)
            save_state "${STATE_RUNNING}"
            RETVAL=0
            ;;
        *)
            # RETVAL is deliberately left as status() set it.
            save_state "${STATE_STOPPED}"
            ;;
    esac
}
################################################################################
# Stop Service
################################################################################
stop()
{
    # Force-stops the Ceph services handled by this script and, when the saved
    # state is "Running", records the service as stopped.
    # Globals: CEPH_SCRIPT, CURRENT_STATE, STATE_RUNNING, STATE_STOPPED (read)
    # Returns: always 0, including when the state was already stopped.
    local services="osd mds mon.controller"
    local service

    log INFO "Stop ceph-storage-network service"

    # sequentially stopping ceph-osd, ceph-mds, then the float monitor
    # (${services} is intentionally unquoted so it splits into one word per
    # service)
    for service in ${services}; do
        log INFO "Force stopping ceph services"
        "${CEPH_SCRIPT}" forcestop "${service}"
    done

    if [ "${CURRENT_STATE}" == "${STATE_RUNNING}" ]; then
        save_state "${STATE_STOPPED}"
    fi

    # Explicit status: a bare "return" would propagate the failed state
    # comparison above as a non-zero exit code.
    return 0
}
@ -85,15 +104,15 @@ has_ceph_network_carrier()
{
# Checks the carrier (cable connected) for Ceph network interface
# If no-carrier is detected, then the interface has no physical link
eval local interface=\$${ceph_network}_interface
if [ -z ${interface} ]; then
eval local INTERFACE=\$${ceph_network}_interface
if [ -z "${INTERFACE}" ]; then
log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
return 0
fi
ip link show "${interface}" | grep NO-CARRIER
ip link show "${INTERFACE}" | grep NO-CARRIER
if [ $? -eq 0 ]; then
log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
log INFO "Ceph network '${INTERFACE}' has NO-CARRIER, cannot start ceph-mon"
return 1
fi
return 0
@ -101,34 +120,39 @@ has_ceph_network_carrier()
status()
{
    # Evaluates the service health and stores the result in RETVAL:
    #   RETVAL=0 only when the saved state is "Running" and the Ceph network
    #   interface has carrier; RETVAL=1 otherwise.
    # Ceph services are force-stopped only when carrier is missing.
    # Globals: CEPH_FILE, CURRENT_STATE, STATE_RUNNING (read);
    #          HAS_CARRIER, RETVAL (written)
    # Returns: 0 when CEPH_FILE does not exist, otherwise the result of
    #          has_ceph_network_carrier (0 = carrier present).
    if [ ! -f "${CEPH_FILE}" ]; then
        # Ceph is not running on this node, return success
        return 0
    fi

    # Capture the carrier result immediately, before any other command can
    # overwrite $?.
    has_ceph_network_carrier
    HAS_CARRIER=$?

    if [ "${CURRENT_STATE}" == "${STATE_RUNNING}" ] && [ "${HAS_CARRIER}" -eq 0 ]; then
        # Service is "running" and has carrier.
        RETVAL=0
    else
        # Force stop services only if carrier is not detected.
        if [ "${HAS_CARRIER}" -ne 0 ]; then
            stop
        fi
        RETVAL=1
    fi

    # NOTE: The Status return is only used in the Start method to validate that
    # there is a carrier on the Ceph network before stating that the SM service
    # is Running.
    return "${HAS_CARRIER}"
}
################################################################################
# Main Entry
################################################################################
# Only an All-in-one system that is not in simplex mode (AIO-DX, where sm calls
# this script) may continue; anything else logs a warning and exits with 0.
if ! { [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; }; then
    log WARN "This script must be called only from All-in-one duplex."
    exit 0
fi
case "$1" in
start)
status
start
;;
stop)
RETVAL=0
stop
;;
status)
status
@ -139,5 +163,4 @@ case "$1" in
;;
esac
exit $RETVAL
exit ${RETVAL}