integ/ceph/ceph/files/ceph-storage-network.sh
Hediberto C Silva d5a84a1dbc Adds support for 3 monitors on AIO-DX
Adding two fixed Ceph monitors for controllers when deploying an
AIO-DX to improve HA.

Includes:
 - Creates a new shell script to manage the fixed monitors
 - Creates a new patch to include the mon_data parameter in
   the mon.pp puppet manifest.
 - Creates the ceph-storage-network script that is used by the
   storage-networking SM service to stop Ceph services in case
   of a network outage.
 - Alters the script used by the ceph-mds pmon file to
   ceph-init-wrapper.
 - Adjusts the ceph-init-wrapper to accept commands from pmon service.
 - Adjusts the ceph-init-wrapper to accept the forcestop command.
 - When stopping Ceph services using ceph-init-wrapper, the script
   first checks that the pid exists before trying.
 - Stopping ceph-mon service using ceph-init-wrapper,
   the ceph-mds is stopped right before to force a re-peering.
 - Starting ceph-mon service using ceph-init-wrapper,
   the ceph-mds is stopped right before to force a re-peering.
 - Starting ceph-mds, it is checked if the ceph-mon is operational.
 - The forcestop command uses a TERM signal first before
   attempting a KILL signal after 5 seconds.

Test Plan:
  PASS: Fresh install AIO-DX and check 3 Ceph monitors are running.
  PASS: Fresh install all other setups and check if Ceph is working as
expected.
  PASS: Reboot the standby controller and check if Ceph is still
running.
  PASS: Reboot the active controller. Ceph will stop responding, but
it will recover after both controllers are running.
  PASS: Verify Ceph is working after a DOR test with PODs writing
to the cephfs and rbd pools.
  PASS: Verify Ceph is resilient to switch reboots

Story: 2011122
Task: 50129

Change-Id: I18d7ab9da3303265da34bc13c8be4baa23c2a7be
Signed-off-by: Hediberto C Silva <hediberto.cavalcantedasilva@windriver.com>
Signed-off-by: Felipe Sanches Zanoni <Felipe.SanchesZanoni@windriver.com>
2024-07-29 17:42:29 -03:00

144 lines
3.5 KiB
Bash
Executable File

#!/bin/bash
#
# Copyright (c) 2024 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
# This script monitors the Ceph network for carrier on an AIO-DX system.
# To prevent data corruption, when there is no carrier from the Ceph network,
# the floating monitor, the osds and the mds processes will be stopped.
# Platform settings. The variables system_type, system_mode, ceph_network,
# management_interface and cluster_host_interface are used below without being
# defined in this script -- presumably they come from this file (TODO confirm).
source /etc/platform/platform.conf
# Marker file: status() only checks the network when this file exists.
CEPH_FILE="/var/run/.ceph_started"
# Wrapper script invoked by stop() with the "forcestop" command.
CEPH_SCRIPT="/etc/init.d/ceph-init-wrapper"
# Shared Ceph helpers; presumably provides wlog, which log() calls (TODO confirm).
source /usr/lib/ceph/ceph_common.sh
# Logging settings. They are not referenced again in this script; presumably
# they are consumed by the helpers sourced above (TODO confirm).
LOG_PATH=/var/log/ceph
LOG_FILE=$LOG_PATH/ceph-process-states.log
LOG_LEVEL=NORMAL # DEBUG
# Log management.
# Adds PID and PPID information to every message and forwards it to wlog.
#
# Usage:
#   log <LEVEL> <message...>
#   log <name> <LEVEL> <message...>
#
# LEVEL must be exactly one of INFO, DEBUG, WARN or ERROR. When the first
# argument is not one of those words it is taken as an optional caller name
# and the level is read from the second argument.
#
# Always returns 0.
log () {
    local name=""
    local log_level="$1"

    # Exact match on the level keyword. A substring or regex match would
    # misread names such as "IN" or "O" as a log level.
    case "${log_level}" in
        INFO|DEBUG|WARN|ERROR)
            ;;
        *)
            name=" ($1)"
            log_level="$2"
            shift
            ;;
    esac
    shift

    # Join the remaining arguments into a single message string.
    local message="$*"

    # prefix = <pid_subshell> <ppid_name>[<ppid>] <name|optional>
    local parent_comm
    parent_comm="$(cat "/proc/${PPID}/comm")"
    local prefix="${BASHPID} ${parent_comm}[${PPID}]${name}"

    # yyyy-MM-dd HH:mm:ss.SSSSSS /etc/init.d/ceph-storage-network <prefix> <log_level>: <message>
    wlog "${prefix}" "${log_level}" "${message}"
    return 0
}
# Maps the configured Ceph network name to its interface.
# Reads:   ceph_network, management_interface, cluster_host_interface
# Writes:  ceph_network_interface (only when the network name is recognised)
# Returns: 0 when ceph_network is "mgmt" or "cluster-host", 1 otherwise
identify_ceph_network_interface() {
    case "${ceph_network}" in
        mgmt)
            ceph_network_interface="${management_interface}"
            ;;
        cluster-host)
            ceph_network_interface="${cluster_host_interface}"
            ;;
        *)
            # Unknown network name: leave ceph_network_interface untouched
            return 1
            ;;
    esac
    return 0
}
# Exit status of the script; status() sets it to 1 after a carrier loss.
RETVAL=0

################################################################################
# Stop Ceph Services
################################################################################
# Force-stops the Ceph daemons through the init wrapper, one service at a time.
# Reads:   system_type, system_mode, CEPH_SCRIPT
# Returns: the exit status of the last forcestop invocation
stop()
{
    # This script should run only in AIO-DX called by sm, but the system type
    # is checked anyway: on AIO duplex the monitor is addressed as
    # "mon.controller", everywhere else as plain "mon".
    services="osd mds mon"
    if [[ "${system_type}" == "All-in-one" && "${system_mode}" != "simplex" ]]; then
        services="osd mds mon.controller"
    fi

    # Order matters: ceph-osd first, then ceph-mds, then ceph-mon
    for service in ${services}; do
        "${CEPH_SCRIPT}" forcestop "${service}"
    done
    return
}
################################################################################
# Status Action
################################################################################
# Checks the carrier (cable connected) for the Ceph network interface.
# If NO-CARRIER is reported, the interface has no physical link.
#
# The interface is resolved with identify_ceph_network_interface, which maps
# the network names "mgmt" and "cluster-host" to their interface variables.
# A name such as "cluster-host" cannot be turned into a variable name directly
# (the dash is not valid in an identifier). For any other network name the
# "<ceph_network>_interface" variable is looked up, when that is a valid
# variable name.
#
# Reads:   ceph_network and the interface variables described above
# Writes:  ceph_network_interface (through identify_ceph_network_interface)
# Returns: 1 when the interface reports NO-CARRIER,
#          0 otherwise (including when the interface cannot be determined)
has_ceph_network_carrier()
{
    local interface=""
    local fallback_var="${ceph_network}_interface"

    if identify_ceph_network_interface; then
        interface="${ceph_network_interface}"
    elif [[ "${fallback_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
        # Indirect expansion instead of eval: no code execution on odd names
        interface="${!fallback_var}"
    fi

    if [ -z "${interface}" ]; then
        log ERROR "Cannot detect Ceph network. Skipping network carrier detection"
        return 0
    fi

    # -q: only the exit status is needed, do not leak the matched line to stdout
    if ip link show "${interface}" | grep -q NO-CARRIER; then
        log INFO "Ceph network '${interface}' has NO-CARRIER, cannot start ceph-mon"
        return 1
    fi
    return 0
}
# Checks the Ceph network and stops the Ceph services when the link is gone.
# Reads:   CEPH_FILE
# Writes:  RETVAL (set to 1 after a carrier loss was handled)
# Returns: always 0; the outcome is reported through RETVAL
status()
{
    # Without the marker file Ceph is not running on this node: report success
    [ -f "${CEPH_FILE}" ] || return 0

    if ! has_ceph_network_carrier; then
        # Communication failure detected:
        # stop the Ceph services to avoid data corruption
        stop
        RETVAL=1
    fi
    return 0
}
################################################################################
# Main Entry
################################################################################
# Dispatch on the requested action.
#   start, status: run the carrier check (status)
#   stop:          only reports success; it does not call the stop() function
#   anything else: print usage and exit 1
case "$1" in
    start|status)
        status
        ;;
    stop)
        RETVAL=0
        ;;
    *)
        echo "usage: $0 { start | stop | status }"
        exit 1
        ;;
esac

exit "${RETVAL}"