Preserving ceph-storage-network current state
The storage-networking SM service has been reporting a mismatched state in some network recovery scenarios: the SM sometimes expects it to be disabled, but it reports enabled-active based only on the availability of the Ceph network.

This change saves the current SM state in the /var/run/ceph/.storage-networking-state file and returns enabled-active only if the SM service is running and the Ceph network interface has carrier. In other cases, it is possible to return disabled without stopping the Ceph services.

Other code improvements were made, including reinforcing that the script should only run on AIO-DX systems.

Test Plan (AIO-DX):
PASS: Fresh install.
PASS: MTC operations: swact and lock-unlock.
PASS: On AIO-DX, reproduce a split-brain by shutting down the interfaces and bringing them back up after 6 min, and confirm that the storage-networking service audit was successful.

Closes-bug: 2111609
Change-Id: Ic799be831bab57a6d45f5306bd337c858b2889f8
Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
This commit is contained in:
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Copyright (c) 2024 Wind River Systems, Inc.
|
||||
# Copyright (c) 2024-2025 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
@ -10,7 +10,6 @@
|
||||
|
||||
source /etc/platform/platform.conf
|
||||
|
||||
CEPH_FILE="/var/run/.ceph_started"
|
||||
CEPH_SCRIPT="/etc/init.d/ceph-init-wrapper"
|
||||
|
||||
source /usr/lib/ceph/ceph_common.sh
|
||||
@ -40,40 +39,60 @@ log () {
|
||||
return 0
|
||||
}
|
||||
|
||||
identify_ceph_network_interface() {
|
||||
if [ "${ceph_network}" == "mgmt" ]; then
|
||||
ceph_network_interface="${management_interface}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
if [ "${ceph_network}" == "cluster-host" ]; then
|
||||
ceph_network_interface="${cluster_host_interface}"
|
||||
return 0
|
||||
fi
|
||||
|
||||
return 1
|
||||
#######################################
# Persist the storage-networking SM service state to the state file.
# Globals:   CURRENT_STATE (overwritten when $1 is non-empty, then read)
#            STATE_FILE    (read; path of the file that is written)
# Arguments: $1 - optional new state; when empty or missing, the existing
#                 CURRENT_STATE value is written back unchanged
# Returns:   exit status of the write to STATE_FILE
#######################################
save_state()
{
    if [ -n "$1" ]; then
        CURRENT_STATE="$1"
    fi

    # Both expansions are quoted so the state and the path are used verbatim
    # (no word splitting or pathname expansion of the stored value).
    printf '%s\n' "${CURRENT_STATE}" > "${STATE_FILE}"
}
|
||||
|
||||
# Exit code reported by this script; the service functions update it.
RETVAL=0
# File holding the state recorded by the previous invocation of this script.
STATE_FILE="/var/run/ceph/.storage-networking-state"
STATE_RUNNING="Running"
STATE_STOPPED="Stopped"
# Empty when the state file is missing or unreadable (cat errors are discarded).
CURRENT_STATE=$(cat "${STATE_FILE}" 2>/dev/null)

# Sanity check for the CURRENT_STATE variable: any value other than
# STATE_RUNNING (including an empty one) is replaced by STATE_STOPPED
# through save_state.
if [ "${CURRENT_STATE}" != "${STATE_RUNNING}" ]; then
    save_state "${STATE_STOPPED}"
fi
|
||||
|
||||
################################################################################
|
||||
# Stop Ceph Services
|
||||
# Start Service
|
||||
################################################################################
|
||||
|
||||
#######################################
# Mark the storage-networking SM service as running, but only when status()
# reports success (it returns the Ceph network carrier check result).
# Globals:   CURRENT_STATE, STATE_RUNNING, STATE_STOPPED (read)
#            RETVAL (set to 0 on success and 1 on failure; left untouched
#                    when the service is already recorded as running)
# Arguments: none
# Returns:   0 when already running; otherwise the exit status of the
#            final save_state call
#######################################
start()
{
    local status_rc

    log INFO "Start ceph-storage-network service"

    # Already recorded as running: nothing to do.
    if [ "${CURRENT_STATE}" == "${STATE_RUNNING}" ]; then
        return 0
    fi

    status
    status_rc=$?

    if [ "${status_rc}" -eq 0 ]; then
        save_state "${STATE_RUNNING}"
        RETVAL=0
    else
        # Set the failure code explicitly instead of relying on status()
        # having left RETVAL non-zero as a side effect.
        RETVAL=1
        save_state "${STATE_STOPPED}"
    fi
}
|
||||
|
||||
################################################################################
|
||||
# Stop Service
|
||||
################################################################################
|
||||
|
||||
#######################################
# Force-stop the Ceph services handled by this SM service and record the
# service as stopped.
# Globals:   CEPH_SCRIPT   (read; invoked as "<script> forcestop <service>")
#            CURRENT_STATE, STATE_RUNNING, STATE_STOPPED (read)
# Arguments: none
# Returns:   0
#######################################
stop()
{
    local services="osd mds mon.controller"
    local service

    log INFO "Stop ceph-storage-network service"

    # sequentially stopping ceph-osd, ceph-mds, then the float monitor
    # (${services} is left unquoted on purpose: it is a space-separated list)
    for service in ${services}; do
        log INFO "Force stopping ceph services"
        "${CEPH_SCRIPT}" forcestop "${service}"
    done

    # Rewrite the state file only on a Running -> Stopped transition.
    if [ "${CURRENT_STATE}" == "${STATE_RUNNING}" ]; then
        save_state "${STATE_STOPPED}"
    fi

    # Explicit status: a bare "return" would propagate the result of the
    # state comparison above and report failure when already stopped.
    return 0
}
|
||||
|
||||
@ -85,15 +104,15 @@ has_ceph_network_carrier()
|
||||
{
|
||||
# Checks the carrier (cable connected) for Ceph network interface
|
||||
# If no-carrier is detected, then the interface has no physical link
|
||||
eval local interface=\$${ceph_network}_interface
|
||||
if [ -z ${interface} ]; then
|
||||
eval local INTERFACE=\$${ceph_network}_interface
|
||||
if [ -z "${INTERFACE}" ]; then
|
||||
log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
|
||||
return 0
|
||||
fi
|
||||
|
||||
ip link show "${interface}" | grep NO-CARRIER
|
||||
ip link show "${INTERFACE}" | grep NO-CARRIER
|
||||
if [ $? -eq 0 ]; then
|
||||
log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
|
||||
log INFO "Ceph network '${INTERFACE}' has NO-CARRIER, cannot start ceph-mon"
|
||||
return 1
|
||||
fi
|
||||
return 0
|
||||
@ -101,34 +120,39 @@ has_ceph_network_carrier()
|
||||
|
||||
#######################################
# Evaluate the storage-networking SM service state.
# The service is reported as healthy only when it is recorded as running
# AND the Ceph network carrier check succeeds. When the carrier check fails,
# stop is called to force-stop the Ceph services.
# Globals:   CURRENT_STATE, STATE_RUNNING (read)
#            HAS_CARRIER (written; exit status of has_ceph_network_carrier)
#            RETVAL      (written; 0 when running with carrier, 1 otherwise)
# Arguments: none
# Returns:   the exit status of has_ceph_network_carrier
#######################################
status()
{
    has_ceph_network_carrier
    HAS_CARRIER=$?

    if [ "${CURRENT_STATE}" == "${STATE_RUNNING}" ] && [ "${HAS_CARRIER}" -eq 0 ]; then
        # Service is "running" and has carrier.
        RETVAL=0
    else
        # Force stop services only if carrier is not detected.
        if [ "${HAS_CARRIER}" -ne 0 ]; then
            stop
        fi
        RETVAL=1
    fi

    # NOTE: The Status return is only used in the Start method to validate that there
    # is a carrier on the Ceph network before stating that the SM service is Running.
    return "${HAS_CARRIER}"
}
|
||||
|
||||
################################################################################
|
||||
|
||||
# Main Entry
|
||||
|
||||
################################################################################
|
||||
|
||||
# This script should run only in AIO-DX called by sm
|
||||
if [ "${system_type}" != "All-in-one" ] || [ "${system_mode}" == "simplex" ]; then
|
||||
log WARN "This script must be called only from All-in-one duplex."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
start)
|
||||
status
|
||||
start
|
||||
;;
|
||||
stop)
|
||||
RETVAL=0
|
||||
stop
|
||||
;;
|
||||
status)
|
||||
status
|
||||
@ -139,5 +163,4 @@ case "$1" in
|
||||
;;
|
||||
esac
|
||||
|
||||
exit $RETVAL
|
||||
|
||||
exit ${RETVAL}
|
||||
|
Reference in New Issue
Block a user