
Adding two fixed Ceph monitors for controllers when deploying an AIO-DX to improve HA. Includes: - Creates a new shell script to manage the fixed monitors. - Creates a new patch to include the mon_data parameter in the mon.pp puppet manifest. - Creates the ceph-storage-network script that is used by the storage-networking SM service to stop Ceph services in case of a network outage. - Alters the script used by the ceph-mds pmon file to ceph-init-wrapper. - Adjusts the ceph-init-wrapper to accept commands from pmon service. - Adjusts the ceph-init-wrapper to accept the forcestop command. - Stopping Ceph services using ceph-init-wrapper, it is checked if the pid exists before trying. - Stopping ceph-mon service using ceph-init-wrapper, the ceph-mds is stopped right before to force a re-peering. - Starting ceph-mon service using ceph-init-wrapper, the ceph-mds is stopped right before to force a re-peering. - Starting ceph-mds, it is checked if the ceph-mon is operational. - The forcestop command uses a TERM signal first before attempting a KILL signal after 5 seconds. Test Plan: PASS: Fresh install AIO-DX and check 3 Ceph monitors are running. PASS: Fresh install all other setups and check if Ceph is working as expected. PASS: Reboot the standby controller and check if Ceph is still running. PASS: Reboot the active controller. Ceph will stop responding, but it will recover after both controllers are running. PASS: Verify Ceph is working after a DOR test with PODs writing to the cephfs and rbd pools. PASS: Verify Ceph is resilient to switch reboots. Story: 2011122 Task: 50129 Change-Id: I18d7ab9da3303265da34bc13c8be4baa23c2a7be Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com> Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
144 lines
3.5 KiB
Bash
Executable File
144 lines
3.5 KiB
Bash
Executable File
#!/bin/bash
|
|
#
|
|
# Copyright (c) 2024 Wind River Systems, Inc.
|
|
#
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# This script monitors the Ceph network for carrier on an AIO-DX system.
|
|
# To prevent data corruption, when there is no carrier from the Ceph network,
|
|
# the floating monitor, the osds and the mds processes will be stopped.
|
|
|
|
# Platform settings. The functions below read system_type, system_mode,
# ceph_network, management_interface and cluster_host_interface, none of
# which are assigned in this script (presumably they come from this file).
. /etc/platform/platform.conf

# Marker file: status() skips the carrier check when it does not exist.
CEPH_FILE=/var/run/.ceph_started
# Wrapper invoked by stop() with the "forcestop" command.
CEPH_SCRIPT=/etc/init.d/ceph-init-wrapper

# Shared Ceph helpers (presumably the provider of wlog, which log() calls).
. /usr/lib/ceph/ceph_common.sh

# Logging settings, assigned after ceph_common.sh has been sourced so that
# these values are the ones in effect.
LOG_PATH="/var/log/ceph"
LOG_FILE="${LOG_PATH}/ceph-process-states.log"
LOG_LEVEL="NORMAL" # DEBUG
|
|
|
|
# Log Management
# Emits one log record through wlog, adding PID and PPID information.
#
# Usage:
#   log <level> <message...>
#   log <name> <level> <message...>
#
# Arguments:
#   name    - optional free-form tag, rendered as " (<name>)" in the prefix
#   level   - one of INFO, DEBUG, WARN, ERROR
#   message - remaining arguments, joined with single spaces
# Returns:
#   always 0, so that a logging problem never changes the caller's flow
log () {
    local name=""
    local log_level="$1"

    # Decide whether the first parameter is a log level or a name. The
    # comparison is an exact match: a substring/regex test would mistake
    # names such as "OR" or "IN" for a level and would break on names that
    # contain spaces. An empty first parameter is kept as the level.
    case "${log_level}" in
        INFO|DEBUG|WARN|ERROR|"")
            ;;
        *)
            name=" ($1)"
            log_level="$2"
            shift
            ;;
    esac

    # Drop the level; what is left is the message.
    shift

    local message="$*"

    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
    local prefix="${BASHPID} $(cat "/proc/${PPID}/comm")[${PPID}]${name}"

    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-storage-network <prefix> <log_level>: <message>
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}
|
|
|
|
# Resolves the interface used by the Ceph network.
#
# Globals:
#   ceph_network            (read)    - "mgmt" or "cluster-host"
#   management_interface    (read)    - used when ceph_network is "mgmt"
#   cluster_host_interface  (read)    - used when ceph_network is "cluster-host"
#   ceph_network_interface  (written) - receives the selected interface
# Returns:
#   0 when ceph_network is one of the two known values, 1 otherwise (in
#   which case ceph_network_interface is left untouched)
identify_ceph_network_interface() {
    case "${ceph_network}" in
        mgmt)
            ceph_network_interface="${management_interface}"
            ;;
        cluster-host)
            ceph_network_interface="${cluster_host_interface}"
            ;;
        *)
            return 1
            ;;
    esac

    return 0
}
|
|
|
|
# Exit status of the script; status() sets it to 1 when it stops Ceph.
RETVAL=0

################################################################################
# Stop Ceph Services
################################################################################

# Force-stops the Ceph services on this node through ${CEPH_SCRIPT}.
#
# Globals:
#   system_type, system_mode (read) - select which monitor is stopped
#   CEPH_SCRIPT              (read) - wrapper invoked as "<script> forcestop <svc>"
# Returns:
#   0 when every forcestop succeeded, otherwise the status of the last
#   forcestop that failed. Every service is attempted even after a failure.
stop()
{
    local services=""
    local service=""
    local rc=0

    # This script should run only in AIO-DX called by sm. Double check it:
    # on an All-in-one system that is not simplex only "mon.controller" is
    # stopped; any other configuration stops "mon".
    if [ "${system_type}" == "All-in-one" ] && [ "${system_mode}" != "simplex" ]; then
        services="osd mds mon.controller"
    else
        services="osd mds mon"
    fi

    # sequentially stopping ceph-osd, ceph-mds, then ceph-mon
    # (${services} is intentionally unquoted: it is a space-separated list)
    for service in ${services}; do
        # Remember a failure but keep going so the remaining services are
        # still stopped.
        "${CEPH_SCRIPT}" forcestop "${service}" || rc=$?
    done

    return "${rc}"
}
|
|
|
|
################################################################################
|
|
# Status Action
|
|
################################################################################
|
|
|
|
# Checks the carrier (cable connected) for the Ceph network interface.
# If NO-CARRIER is reported, the interface has no physical link.
#
# Globals:
#   ceph_network           (read) - name of the network used by Ceph
#   ceph_network_interface (read) - set by identify_ceph_network_interface
# Returns:
#   1 when "ip link show" reports NO-CARRIER for the interface
#   0 otherwise, including when the interface cannot be determined (the
#     check is skipped rather than reporting a false outage)
has_ceph_network_carrier()
{
    local interface=""
    local legacy_var=""

    # Use the same network-to-interface mapping as the rest of the script.
    # Building the variable name directly from ${ceph_network} is wrong for
    # both supported values: "cluster-host" contains a dash, which is not
    # valid in a variable name, and "mgmt" maps to management_interface.
    if identify_ceph_network_interface; then
        interface="${ceph_network_interface}"
    fi

    # Fall back to a "<network>_interface" variable for any other network
    # name, using indirect expansion instead of eval.
    if [ -z "${interface}" ]; then
        legacy_var="${ceph_network//-/_}_interface"
        if [[ "${legacy_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
            interface="${!legacy_var}"
        fi
    fi

    if [ -z "${interface}" ]; then
        log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
        return 0
    fi

    # -q: only the exit status matters; do not echo the link line.
    if ip link show "${interface}" | grep -q NO-CARRIER; then
        log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
        return 1
    fi

    return 0
}
|
|
|
|
# Verifies the Ceph network and stops the Ceph services when it is down.
#
# Globals:
#   CEPH_FILE (read)    - marker file; when it does not exist the check is
#                         skipped (Ceph is not running on this node)
#   RETVAL    (written) - set to 1 when a carrier loss was detected
# Returns:
#   always 0; the outcome is reported through RETVAL
status()
{
    # Quoted on purpose: with an empty CEPH_FILE an unquoted test would
    # evaluate to false and the guard would be skipped.
    if [ ! -f "${CEPH_FILE}" ]; then
        # Ceph is not running on this node, return success
        return 0
    fi

    if ! has_ceph_network_carrier; then
        # communication failure detected
        # stopping ceph services to avoid data corruption
        stop
        RETVAL=1
    fi

    return 0
}
|
|
|
|
################################################################################
|
|
|
|
# Main Entry
|
|
|
|
################################################################################
|
|
|
|
# Dispatch on the requested action:
#   start, status - run the carrier check; status() sets RETVAL to 1 when
#                   it had to stop the Ceph services
#   stop          - nothing to do, always succeeds
#   anything else - print the usage line and exit with 1
case "$1" in
    start|status)
        status
        ;;
    stop)
        RETVAL=0
        ;;
    *)
        echo "usage: $0 { start | stop | status }"
        exit 1
        ;;
esac

exit ${RETVAL}
|
|
|