Merge "AIO-DX swact task affinity robustness"
This commit is contained in:
commit
931887828b
13
utilities/worker-utils/worker-utils/affine-tasks.sh
Normal file → Executable file
13
utilities/worker-utils/worker-utils/affine-tasks.sh
Normal file → Executable file
@ -68,6 +68,8 @@ LNAME=$(readlink -n -f $0)
|
||||
NAME=$(basename $LNAME)
|
||||
PIDFILE=/var/run/${NAME}.pid
|
||||
|
||||
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
|
||||
|
||||
# Define number of logical cpus
|
||||
LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN)
|
||||
|
||||
@ -89,6 +91,11 @@ else
|
||||
NONISOL_CPUS=${ONLINE_CPUS}
|
||||
NONISOL_MASK=${ONLINE_MASK}
|
||||
fi
|
||||
# NONISOL_CPULIST is a space separated list, consumed by SM so that
|
||||
# it knows about extra available cores
|
||||
NONISOL_CPULIST=$(echo ${NONISOL_CPUS} | \
|
||||
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||
sed 's/,/ /g')
|
||||
|
||||
# Define platform memory nodeset and cpuset
|
||||
PLATFORM_NODES=$(cat /sys/devices/system/node/online)
|
||||
@ -404,7 +411,8 @@ function affine_drbd_tasks {
|
||||
}
|
||||
|
||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||
# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm cpuset.
|
||||
# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker,
|
||||
# and machine.slice (i.e., qemu-kvm).
|
||||
function reaffineable_pids {
|
||||
local pids_excl
|
||||
local pidlist
|
||||
@ -433,6 +441,8 @@ function affine_tasks_to_all_cores {
|
||||
${NONISOL_CPUS} ${pid} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
|
||||
echo ${NONISOL_CPULIST} > ${TASK_AFFINING_INCOMPLETE}
|
||||
LOG "Affined ${count} processes to all cores."
|
||||
}
|
||||
|
||||
@ -472,6 +482,7 @@ function affine_tasks_to_platform_cores {
|
||||
taskset --pid --cpu-list 0 ${pid} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
rm -v -f ${TASK_AFFINING_INCOMPLETE}
|
||||
LOG "Affined ${count} processes to platform cores."
|
||||
}
|
||||
|
||||
|
@ -19,19 +19,16 @@
|
||||
PATH=/bin:/usr/bin:/usr/local/bin
|
||||
|
||||
. /etc/platform/platform.conf
|
||||
LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"}
|
||||
CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
|
||||
[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS}
|
||||
[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}
|
||||
|
||||
# Enable debug logs and tag them
|
||||
LOG_DEBUG=1
|
||||
TAG="TASKAFFINITY:"
|
||||
|
||||
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
|
||||
N_CPUS=$(getconf _NPROCESSORS_ONLN)
|
||||
FULLSET_CPUS="0-"$((N_CPUS-1))
|
||||
FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS})
|
||||
|
||||
# The following CPULISTs are space separated lists of logical cpus,
|
||||
# and are used by helper functions.
|
||||
ISOL_CPULIST=$(/bin/cat /sys/devices/system/cpu/isolated | \
|
||||
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||
sed 's/,/ /g')
|
||||
PLATFORM_CPUS=$(platform_expanded_cpu_list)
|
||||
PLATFORM_CPULIST=$(platform_expanded_cpu_list| \
|
||||
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||
@ -43,8 +40,26 @@ if [[ $vswitch_type =~ none ]]; then
|
||||
VSWITCH_CPULIST=""
|
||||
fi
|
||||
|
||||
IDLE_MARK=95.0
|
||||
KERNEL=$(uname -a)
|
||||
PIDFILE=/var/run/affine-tasks.sh.pid
|
||||
|
||||
# Idle cpu occupancy threshold; logical cpus with greater idle occupancy
|
||||
# than this will be included.
|
||||
IDLEOCC_THRESHOLD=95.0
|
||||
|
||||
# Watch timeout to monitor removal of flag file; this is engineered as
|
||||
# 2x the typical duration of a swact.
|
||||
WATCH_TIMEOUT_SECONDS=90
|
||||
|
||||
# Log info message to /var/log/daemon.log
|
||||
NAME="task-affine-functions"
|
||||
LOG_FILE=/tmp/task-affine-functions.log
|
||||
################################################################################
# Log an info message to syslog (daemon.info facility) and, when LOG_FILE is
# set to a non-empty path, append a timestamped copy of it to that file.
#
# Globals:   NAME (read)     - tag used for the syslog entry
#            LOG_FILE (read) - optional file that receives a copy
#            HOSTNAME (read) - host name written into the file entry
# Arguments: $@ - message words; they are joined with single spaces
# Returns:   status of the last command executed (the file append when
#            LOG_FILE is set, otherwise 0)
################################################################################
function LOG {
    local tstamp_H

    logger -p daemon.info -t "${NAME}($$): " "$@"

    if [[ -n "${LOG_FILE}" ]]; then
        # Declared separately from the assignment so that 'local' does not
        # mask the exit status of date.
        tstamp_H=$(date +"%Y-%0m-%0eT%H:%M:%S")
        # "$*" joins the message words into one string; the redirect target is
        # quoted so that a path containing whitespace is not word-split.
        echo -e "${tstamp_H} ${HOSTNAME} $0($$): info $*" >> "${LOG_FILE}"
    fi
}
|
||||
|
||||
################################################################################
|
||||
# Check if a given core is one of the platform cores
|
||||
@ -72,8 +87,22 @@ function is_vswitch_core {
|
||||
return 0
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# Check if a given core is one of the isolcpus cores
|
||||
################################################################################
|
||||
function is_isolcpus_core {
    # Arguments: $1 - logical cpu number to test
    # Globals:   ISOL_CPULIST (read) - space separated list of isolated cpus
    # Returns:   1 when the cpu IS in ISOL_CPULIST, 0 otherwise. Note the
    #            convention is inverted with respect to the usual shell
    #            success/failure meaning; callers test for '$? -eq 1'.
    local core=$1
    local isol_cpu

    # A missing or non-numeric argument can never match an isolated cpu;
    # bail out early instead of letting '[' print an error for every entry.
    if ! [[ "${core}" =~ ^[0-9]+$ ]]; then
        return 0
    fi

    # ISOL_CPULIST is deliberately unquoted: it is a space separated list
    # that must be word-split into individual cpu numbers.
    for isol_cpu in ${ISOL_CPULIST}; do
        if [ "${core}" -eq "${isol_cpu}" ]; then
            return 1
        fi
    done
    return 0
}
|
||||
|
||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||
# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
|
||||
# kernel threads, vSwitch, and anything in the cgroup cpusets: k8s-infra, docker,
|
||||
# and machine.slice (i.e., qemu-kvm).
|
||||
function reaffineable_pids {
|
||||
local pids_excl
|
||||
local pidlist
|
||||
@ -83,7 +112,7 @@ function reaffineable_pids {
|
||||
sed 's/,$/\n/')
|
||||
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
|
||||
-o pid=,cgroup= | \
|
||||
awk '!/k8s-infra|machine.slice/ {print $1; }')
|
||||
awk '!/k8s-infra|docker|machine.slice/ {print $1; }')
|
||||
echo "${pidlist[@]}"
|
||||
}
|
||||
|
||||
@ -93,7 +122,7 @@ function reaffineable_pids {
|
||||
# critical and cpu intensive operation in AIO. For instance, sm can leverage
|
||||
# the idle cores to speed up swact activity.
|
||||
#
|
||||
# At the end of the operation, regarless of the result, the service must be
|
||||
# At the end of the operation, regardless of the result, the service must be
|
||||
# calling function affine_tasks_to_platform_cores to re-affine platform tasks
|
||||
# back to their assigned core(s).
|
||||
#
|
||||
@ -101,61 +130,92 @@ function reaffineable_pids {
|
||||
################################################################################
|
||||
function affine_tasks_to_idle_cores {
|
||||
local cpulist
|
||||
local cpuocc_list
|
||||
local vswitch_pid
|
||||
local pidlist
|
||||
local idle_cpulist
|
||||
local platform_cpus
|
||||
local count=0
|
||||
local rc=0
|
||||
local cpu=0
|
||||
|
||||
# Keep the last invocation of affining, truncate when we use idle cores
|
||||
:> ${LOG_FILE}
|
||||
|
||||
# Ensure this only runs on AIO
|
||||
if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; }
|
||||
then
|
||||
LOG "Not AIO, nothing to do."
|
||||
return $rc
|
||||
fi
|
||||
|
||||
if [ -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
||||
read cpulist < ${TASK_AFFINING_INCOMPLETE}
|
||||
log_debug "${TAG} Tasks have already been affined to CPU ($cpulist)."
|
||||
return 0
|
||||
LOG "Tasks have already been affined to CPU ($cpulist)."
|
||||
return $rc
|
||||
fi
|
||||
|
||||
if [[ "${KERNEL}" == *" RT "* ]]; then
|
||||
return 0
|
||||
fi
|
||||
# Get idle cpu occupancy of all logical cores in the last 5 seconds.
|
||||
declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') )
|
||||
|
||||
# Compile a list of cpus with idle percentage greater than 95% in the last
|
||||
# 5 seconds.
|
||||
cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}'))
|
||||
|
||||
for idle_value in ${cpuocc_list[@]}; do
|
||||
# Determine logical cpus that are considered platform, or application
|
||||
# cores with idle percentage greater than 95%.
|
||||
declare -a idle_cpus=()
|
||||
for cpu in ${!cpuocc_list[@]}; do
|
||||
idleocc=${cpuocc_list[$cpu]}
|
||||
is_vswitch_core $cpu
|
||||
if [ $? -eq 1 ]; then
|
||||
cpu=$(($cpu+1))
|
||||
continue
|
||||
fi
|
||||
|
||||
is_isolcpus_core $cpu
|
||||
if [ $? -eq 1 ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
is_platform_core $cpu
|
||||
if [ $? -eq 1 ]; then
|
||||
# Platform core is added to the idle list by default
|
||||
idle_cpulist=$idle_cpulist$cpu","
|
||||
idle_cpus+=( ${cpu} )
|
||||
else
|
||||
# Non platform core is added to the idle list if it is more
|
||||
# than 95% idle
|
||||
if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then
|
||||
idle_cpulist=$idle_cpulist$cpu","
|
||||
if [[ $(echo "${idleocc} > ${IDLEOCC_THRESHOLD}" | bc) -eq 1 ]]; then
|
||||
idle_cpus+=( ${cpu} )
|
||||
fi
|
||||
fi
|
||||
cpu=$(($cpu+1))
|
||||
done
|
||||
|
||||
idle_cpulist=$(echo $idle_cpulist|sed 's/.$//')
|
||||
# comma separated list of idle cpus
|
||||
idle_cpulist=$(printf '%s,' "${idle_cpus[@]}")
|
||||
idle_cpulist=${idle_cpulist%,}
|
||||
|
||||
log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)"
|
||||
LOG "Affining all tasks to idle CPU ($idle_cpulist)"
|
||||
pidlist=( $(reaffineable_pids) )
|
||||
for pid in ${pidlist[@]}; do
|
||||
count=$((${count} + 1))
|
||||
taskset --all-tasks --pid --cpu-list \
|
||||
${idle_cpulist} ${pid} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
# Save the cpu list to the temp file which will be read and removed when
|
||||
# tasks are reaffined to the platform cores later on.
|
||||
# This list is consumed by SM so it knows about extra cores.
|
||||
echo $idle_cpulist > ${TASK_AFFINING_INCOMPLETE}
|
||||
LOG "Affined ${count} processes to idle cores."
|
||||
|
||||
# Wait for affining flag file to disappear. If the timeout period is reached,
|
||||
# affine tasks back to platform cores.
|
||||
watch_start_seconds=${SECONDS}
|
||||
while [ -f ${TASK_AFFINING_INCOMPLETE} ]; do
|
||||
elapsed_seconds=$(( ${SECONDS} - ${watch_start_seconds} ))
|
||||
LOG "Waiting for swact to complete: ${elapsed_seconds} seconds."
|
||||
if [ ${elapsed_seconds} -ge ${WATCH_TIMEOUT_SECONDS} ]; then
|
||||
LOG "Exceeded watch timeout: ${WATCH_TIMEOUT_SECONDS} seconds," \
|
||||
"affining tasks to platform cores."
|
||||
affine_tasks_to_platform_cores
|
||||
LOG "Idle cores watch completed," \
|
||||
"tasks reaffined to platform cores."
|
||||
break
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
|
||||
return $rc
|
||||
}
|
||||
|
||||
@ -164,24 +224,36 @@ function affine_tasks_to_idle_cores {
|
||||
# to re-affine management tasks back to the platform cores.
|
||||
################################################################################
|
||||
function affine_tasks_to_platform_cores {
|
||||
local cpulist
|
||||
local pidlist
|
||||
local rc=0
|
||||
local count=0
|
||||
|
||||
if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
||||
dbg_str="${TAG} Either tasks have never been affined to all/idle"
|
||||
dbg_str="${TAG} cores or they have already been reaffined to"
|
||||
dbg_str="${TAG} platform cores."
|
||||
log_debug "$dbg_str"
|
||||
return 0
|
||||
# Ensure this only runs on AIO
|
||||
if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; }
|
||||
then
|
||||
LOG "Not AIO, nothing to do."
|
||||
return $rc
|
||||
fi
|
||||
|
||||
read cpulist < ${TASK_AFFINING_INCOMPLETE}
|
||||
# Abort if affine-tasks.sh is running
|
||||
if [ -e ${PIDFILE} ]; then
|
||||
pid=$(cat ${PIDFILE})
|
||||
if [ -n "${pid}" -a -e /proc/${pid} ]; then
|
||||
LOG "Aborting, ${pid} already running: ${PIDFILE}."
|
||||
return $rc
|
||||
fi
|
||||
fi
|
||||
|
||||
log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
|
||||
if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
||||
LOG "Either tasks have never been affined to all/idle cores" \
|
||||
"or they have already been reaffined to platform cores."
|
||||
return $rc
|
||||
fi
|
||||
|
||||
LOG "Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
|
||||
pidlist=( $(reaffineable_pids) )
|
||||
for pid in ${pidlist[@]}; do
|
||||
count=$((${count} + 1))
|
||||
taskset --all-tasks --pid --cpu-list \
|
||||
${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
|
||||
done
|
||||
@ -195,39 +267,42 @@ function affine_tasks_to_platform_cores {
|
||||
taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
rm -rf ${TASK_AFFINING_INCOMPLETE}
|
||||
rm -v -f ${TASK_AFFINING_INCOMPLETE}
|
||||
LOG "Affined ${count} processes to platform cores."
|
||||
|
||||
return $rc
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# The following function can be leveraged by cron tasks
|
||||
# The following function returns a single logical cpu with greatest idle
|
||||
# occupancy. This can be leveraged by cron tasks or other processes.
|
||||
# (e.g., python-keystone)
|
||||
################################################################################
|
||||
function get_most_idle_core {
|
||||
local cpuocc_list
|
||||
local cpu=0
|
||||
local most_idle_value=${IDLE_MARK}
|
||||
local most_idle_value=${IDLEOCC_THRESHOLD}
|
||||
local most_idle_cpu=0
|
||||
|
||||
if [[ "${KERNEL}" == *" RT "* ]]; then
|
||||
echo $cpu
|
||||
return
|
||||
fi
|
||||
declare -a cpuocc_list=( $(sar -P ALL 1 5 | grep Average | awk '{if(NR>2)print $8}') )
|
||||
|
||||
cpuocc_list=($(sar -P ALL 1 5|grep Average|awk '{if(NR>2)print $8}'))
|
||||
|
||||
for idle_value in ${cpuocc_list[@]}; do
|
||||
for cpu in ${!cpuocc_list[@]}; do
|
||||
idle_value=${cpuocc_list[$cpu]}
|
||||
is_vswitch_core $cpu
|
||||
if [ $? -eq 1 ]; then
|
||||
cpu=$(($cpu+1))
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ $(echo "$idle_value > $most_idle_value"|bc) -eq 1 ]; then
|
||||
most_idle_value=$idle_value
|
||||
most_idle_cpu=$cpu
|
||||
is_isolcpus_core $cpu
|
||||
if [ $? -eq 1 ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
if [ $(echo "${idle_value} > ${most_idle_value}" | bc) -eq 1 ]; then
|
||||
most_idle_value=${idle_value}
|
||||
most_idle_cpu=${cpu}
|
||||
fi
|
||||
cpu=$(($cpu+1))
|
||||
done
|
||||
|
||||
echo $most_idle_cpu
|
||||
LOG "get_most_idle_core: cpu=$most_idle_cpu, idleocc=$most_idle_value"
|
||||
echo ${most_idle_cpu}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user