Merge "AIO reaffine tasks and k8s-infra during startup"
This commit is contained in:
commit
4fe2b6bed5
@ -16,11 +16,13 @@ class platform::compute::config
|
||||
content => template('platform/worker_reserved.conf.erb')
|
||||
}
|
||||
|
||||
if $::platform::params::system_type != 'All-in-one' {
|
||||
file { '/etc/systemd/system.conf.d/platform-cpuaffinity.conf':
|
||||
ensure => 'present',
|
||||
replace => true,
|
||||
content => template('platform/systemd-system-cpuaffinity.conf.erb')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class platform::compute::config::runtime {
|
||||
|
@ -13,3 +13,4 @@ restarts = 3 ; restarts before error assertion
|
||||
startuptime = 5 ; seconds to wait after process start
|
||||
interval = 5 ; number of seconds to wait between restarts
|
||||
debounce = 20 ; number of seconds to wait before degrade clear
|
||||
subfunction = last-config ; run monitor only after last config is run
|
||||
|
@ -189,8 +189,7 @@ class KubernetesPuppet(base.BasePuppet):
|
||||
# TODO(jgauld): Commented out for now, using host_cpuset instead.
|
||||
# nonplatform_cpuset = host_cpuset - platform_cpuset
|
||||
|
||||
if constants.WORKER in utils.get_personalities(host) \
|
||||
and constants.CONTROLLER not in utils.get_personalities(host):
|
||||
if constants.WORKER in utils.get_personalities(host):
|
||||
if self.is_openstack_compute(host):
|
||||
k8s_cpuset = utils.format_range_set(platform_cpuset)
|
||||
k8s_nodeset = utils.format_range_set(platform_nodeset)
|
||||
|
@ -41,19 +41,6 @@ function affine_tasks {
|
||||
local PIDLIST
|
||||
local RET=0
|
||||
|
||||
# Affine non-kernel-thread tasks (excluded [kthreadd] and its children) to all available
|
||||
# cores. They will be reaffined to platform cores later on as part of nova-compute
|
||||
# launch.
|
||||
##log_debug "Affining all tasks to all available CPUs..."
|
||||
# TODO: Should revisit this since this leaves a few lingering floating
|
||||
# tasks and does not really work with cgroup cpusets.
|
||||
# Comment out for now. Cleanup required.
|
||||
##affine_tasks_to_all_cores
|
||||
##RET=$?
|
||||
##if [ $RET -ne 0 ]; then
|
||||
## log_error "Some tasks failed to be affined to all cores."
|
||||
##fi
|
||||
|
||||
# Get number of logical cpus
|
||||
N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \
|
||||
awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}')
|
||||
|
@ -1,10 +1,10 @@
|
||||
[Unit]
|
||||
Description=StarlingX Affine Tasks
|
||||
After=syslog.service network.service dbus.service sw-patch.service affine-platform.sh.service
|
||||
Before=kubelet.service
|
||||
Before=workerconfig.service
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
Type=simple
|
||||
ExecStart=/etc/init.d/affine-tasks.sh start
|
||||
|
||||
[Install]
|
||||
|
@ -1,62 +1,441 @@
|
||||
#!/bin/bash
|
||||
###############################################################################
|
||||
#
|
||||
# Copyright (c) 2019 Wind River Systems, Inc.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
###############################################################################
|
||||
|
||||
#
|
||||
# chkconfig: 2345 80 80
|
||||
#
|
||||
|
||||
### BEGIN INIT INFO
|
||||
# Provides: affine-tasks
|
||||
# Required-Start:
|
||||
# Required-Stop:
|
||||
# Default-Start: 2 3 4 5
|
||||
# Default-Stop: 0 1 6
|
||||
# Short-Description: affine tasks
|
||||
# Description: This script will affine tasks to the platform cores of the
|
||||
# host. This ensures that system processes are constrained to platform
|
||||
# cores and will not run on cores with VMs/containers.
|
||||
# Short-Description: reaffine tasks on AIO
|
||||
# Description: This script will dynamically reaffine tasks
|
||||
# and k8s-infra cgroup cpuset on AIO nodes only. This accommodates
|
||||
# CPU intensive phases of work. Tasks are initially allowed to float
|
||||
# across all cores. Once system is at steady-state, this will ensure
|
||||
# that K8S pods are constrained to platform cores and do not run on
|
||||
# cores with VMs/containers.
|
||||
### END INIT INFO
|
||||
#
|
||||
# Background:
|
||||
# There is significant parallel CPU intensive activity:
|
||||
# - during stx-application apply before critical openstack pods are running,
|
||||
# e.g., to download docker images, and start all pods.
|
||||
# - during init and pod recovery after reboot or DOR.
|
||||
#
|
||||
# This enables use of all cpus during CPU intensive phase, otherwise the
|
||||
# startup processing time is considerably longer and we easily hit timeout.
|
||||
#
|
||||
# This script waits forever for sufficient platform readiness criteria
|
||||
# (e.g., system critical pods are recovered, nova-compute is running,
|
||||
# cinder-volume is running, openstack pods are running), and we have waited
|
||||
# a short stabilization period before reaffining to the platform cpus.
|
||||
#
|
||||
# NOTE: child cgroup cpuset and nodeset must be a subset of the parent
|
||||
# cgroup's attributes. This requires traversing the tree hierarchy in
|
||||
# specific order when dynamically modifying these attributes.
|
||||
#
|
||||
################################################################################
|
||||
# Define minimal path
|
||||
PATH=/bin:/usr/bin:/usr/local/bin
|
||||
|
||||
CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
|
||||
[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}
|
||||
|
||||
. /usr/bin/tsconfig
|
||||
. /etc/init.d/task_affinity_functions.sh
|
||||
# Bring in platform definitions
|
||||
. /etc/platform/platform.conf
|
||||
|
||||
log ()
|
||||
{
|
||||
logger -p local1.info -t affine_tasks $@
|
||||
echo affine_tasks: "$@"
|
||||
# kubectl reads its cluster credentials from this kubeconfig.
export KUBECONFIG=/etc/kubernetes/admin.conf

# Tunables: the K8S cpuset cgroup directory, the polling periods used while
# waiting for kubelet (INIT) and for steady state (CHECK), the minimum gap
# between progress log lines (PRINT), and the stabilization wait; all
# periods are in seconds.
CGDIR_K8S=/sys/fs/cgroup/cpuset/k8s-infra
INIT_INTERVAL_SECONDS=10
CHECK_INTERVAL_SECONDS=30
PRINT_INTERVAL_SECONDS=300
STABILIZATION_SECONDS=150

# The pidfile is named after the fully resolved script name.
LNAME=$(readlink -n -f $0)
NAME=$(basename $LNAME)
PIDFILE=/var/run/${NAME}.pid

# Number of logical cpus currently online.
LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN)

# Memory nodeset and cpuset (list and lower-case mask) spanning everything
# that is online.
ONLINE_NODES=$(/bin/cat /sys/devices/system/node/online)
ONLINE_CPUS=$(/bin/cat /sys/devices/system/cpu/online)
ONLINE_MASK=$(cpulist_to_cpumap ${ONLINE_CPUS} ${LOGICAL_CPUS} | \
    awk '{print tolower($0)}')

# Non-isolated cpus: identical to the online set when nothing is isolated,
# otherwise the inverse of the isolated cpu map.
ISOL_CPUS=$(/bin/cat /sys/devices/system/cpu/isolated)
if [ -z "${ISOL_CPUS}" ]; then
    ISOL_CPUMAP='0'
    NONISOL_CPUS=${ONLINE_CPUS}
    NONISOL_MASK=${ONLINE_MASK}
else
    ISOL_CPUMAP=$(cpulist_to_cpumap ${ISOL_CPUS} ${LOGICAL_CPUS})
    NONISOL_CPUMAP=$(invert_cpumap ${ISOL_CPUMAP} ${LOGICAL_CPUS})
    NONISOL_CPUS=$(cpumap_to_cpulist ${NONISOL_CPUMAP} ${LOGICAL_CPUS})
    NONISOL_MASK=$(cpulist_to_cpumap ${NONISOL_CPUS} ${LOGICAL_CPUS} | \
        awk '{print tolower($0)}')
fi

# Platform memory nodeset (all online nodes) and platform cpuset.
PLATFORM_NODES=$(cat /sys/devices/system/node/online)
PLATFORM_CPUS=$(platform_expanded_cpu_list)

# Mutable state shared between the readiness checks and start():
# the last not-ready reason, and the SECONDS value at which the
# steady-state criteria first passed (0 = not yet passing).
NOT_READY_REASON=""
STABLE=0
|
||||
|
||||
# Log info message to /var/log/daemon.log
|
||||
function LOG {
    # Send all arguments to syslog at daemon.info, tagged "<script>(<pid>): ".
    local tag="${NAME}($$): "
    logger -p daemon.info -t "${tag}" "$@"
}
|
||||
|
||||
start ()
|
||||
{
|
||||
log "Starting affine_tasks. Reaffining tasks to platform cores..."
|
||||
if [ ! -f ${INITIAL_CONFIG_COMPLETE_FLAG} ]; then
|
||||
log "Initial Configuration incomplete. Skipping affining tasks."
|
||||
exit 0
|
||||
# Log error message to /var/log/daemon.log
|
||||
function ERROR {
    # Send all arguments to syslog at daemon.error, tagged
    # "<script>(<pid>): "; -s makes logger echo the message to stderr too.
    local tag="${NAME}($$): "
    logger -s -p daemon.error -t "${tag}" "$@"
}
|
||||
|
||||
# Update cgroup k8s-infra cpuset and nodeset to span all non-isolated cpus.
|
||||
function update_cgroup_cpuset_k8s_infra_all {
    # Widen every cpuset cgroup under ${CGDIR_K8S} to ONLINE_NODES and
    # NONISOL_CPUS.
    #
    # find lists each directory before its children, so every parent is
    # widened before the children that must remain a subset of it. This
    # works no matter what the previous cpuset state was. Write failures
    # reported by echo are deliberately discarded (best effort).
    local cgdir

    # IFS= and -r keep directory names intact even when they contain
    # backslashes or leading/trailing whitespace; the quoted redirect
    # targets keep names with embedded whitespace as a single path.
    find "${CGDIR_K8S}" -type d | \
    while IFS= read -r cgdir; do
        /bin/echo "${ONLINE_NODES}" > "${cgdir}/cpuset.mems" 2>/dev/null
        /bin/echo "${NONISOL_CPUS}" > "${cgdir}/cpuset.cpus" 2>/dev/null
    done
    LOG "Update ${CGDIR_K8S}," \
        "ONLINE_NODES=${ONLINE_NODES}, NONISOL_CPUS=${NONISOL_CPUS}"
}
|
||||
|
||||
# Update cgroup k8s-infra to span platform cpuset and nodeset.
|
||||
function update_cgroup_cpuset_k8s_infra_platform {
    # Shrink every cpuset cgroup under ${CGDIR_K8S} to PLATFORM_NODES and
    # PLATFORM_CPUS.
    local cgdir

    # Clear any existing cpuset settings. This ensures that the
    # subsequent shrink to platform cpuset will always work.
    update_cgroup_cpuset_k8s_infra_all

    # -depth lists children before their parent, which is the order needed
    # when shrinking: a parent can only shrink once no child exceeds it.
    # NOTE: this only works if we are shrinking the cpuset.
    #
    # IFS= and -r keep directory names intact even when they contain
    # backslashes or leading/trailing whitespace; the quoted redirect
    # targets keep names with embedded whitespace as a single path.
    find "${CGDIR_K8S}" -depth -type d | \
    while IFS= read -r cgdir; do
        /bin/echo "${PLATFORM_NODES}" > "${cgdir}/cpuset.mems" 2>/dev/null
        /bin/echo "${PLATFORM_CPUS}" > "${cgdir}/cpuset.cpus" 2>/dev/null
    done
    LOG "Update ${CGDIR_K8S}," \
        "PLATFORM_NODES=${PLATFORM_NODES}, PLATFORM_CPUS=${PLATFORM_CPUS}"
}
|
||||
|
||||
# Check criteria for K8s platform ready on this node.
|
||||
# i.e., k8s-infra is configured, kubelet is running
|
||||
function is_k8s_platform_ready {
    # Returns 0 when the k8s-infra cpuset cgroup exists and kubelet has
    # been running for at least 30 seconds; otherwise returns 1 and stores
    # the reason in the global NOT_READY_REASON.
    local PASS=0
    local FAIL=1
    local kubelet_pid
    local uptime_secs

    # Global variable
    NOT_READY_REASON=""

    # Check that cgroup cpuset k8s-infra has been configured
    if [ ! -e "${CGDIR_K8S}" ]; then
        NOT_READY_REASON="k8s-infra not configured"
        return ${FAIL}
    fi

    # Check that kubelet is running and stable
    if ! systemctl is-active kubelet --quiet; then
        NOT_READY_REASON="kubelet not running"
        return ${FAIL}
    fi

    kubelet_pid=$(systemctl show kubelet.service -p MainPID | \
        awk -vFS='=' '{print $2}')
    # Validate before comparing numerically: an empty or non-numeric value
    # would otherwise make the test builtin fail with a syntax error.
    if [[ ! "${kubelet_pid}" =~ ^[0-9]+$ ]] || [ "${kubelet_pid}" -eq 0 ]; then
        NOT_READY_REASON="kubelet not running"
        return ${FAIL}
    fi

    # etimes is the elapsed time since process start, in seconds. It is
    # empty if the process disappeared between the two queries.
    uptime_secs=$(ps -p "${kubelet_pid}" -o etimes= 2>/dev/null | \
        awk '{print $1}')
    if [[ ! "${uptime_secs}" =~ ^[0-9]+$ ]] || [ "${uptime_secs}" -lt 30 ]; then
        NOT_READY_REASON="kubelet not yet stable"
        return ${FAIL}
    fi

    LOG "kubelet is ready"
    return ${PASS}
}
|
||||
|
||||
stop ()
|
||||
{
|
||||
log "Stopping affine_tasks..."
|
||||
# Determine whether this node has 'static' cpu manager policy.
|
||||
# NOTE: This check assumes that kubelet is already running locally.
|
||||
function is_static_cpu_manager_policy {
    # Returns 0 when kubelet's saved cpu manager state names the "static"
    # policy, 1 otherwise (including when the state file cannot be read).
    local PASS=0
    local FAIL=1

    state=$(cat /var/lib/kubelet/cpu_manager_state 2>/dev/null)
    [[ $state =~ \"policyName\":.?\"static\" ]] && return ${PASS}
    return ${FAIL}
}
|
||||
|
||||
status()
|
||||
{
|
||||
# Check criteria for K8s platform steady-state ready on this node.
|
||||
# i.e., kube-system pods have recovered, kube application apply
|
||||
# has completed, nova-compute is running, cinder-volume is running.
|
||||
# NOTE: This function depends on kubectl commands, so is only
|
||||
# usable on controllers.
|
||||
function is_k8s_platform_steady_state_ready {
    # Returns 0 once every check below has passed continuously for
    # STABILIZATION_SECONDS; otherwise returns 1 with the reason in the
    # global NOT_READY_REASON. Any failed check resets the global STABLE
    # marker, restarting the stabilization period. A kubectl query that
    # itself fails is treated as "not ready" rather than as "zero pods
    # outstanding".
    local PASS=0
    local FAIL=1
    local this_node=${HOSTNAME}
    local npods
    local node_info
    local labels
    local dt
    local -a pods

    # Global variable
    NOT_READY_REASON=""

    # Check that kube-system pods have recovered on this node
    if ! npods=$(_count_unrecovered_pods kube-system "${this_node}"); then
        NOT_READY_REASON="kube-system pods query failed"
        STABLE=0
        return ${FAIL}
    fi
    if [ "${npods}" -gt 0 ]; then
        NOT_READY_REASON="${npods} kube-system pods not recovered"
        STABLE=0
        return ${FAIL}
    fi

    # Wait for a few critical openstack pods to be running if this is
    # an openstack-compute-node. This is not an exhaustive list.
    # Make sure that all openstack pods on this node are running.
    if ! node_info=$(kubectl get node "${this_node}" \
            --no-headers --show-labels 2>/dev/null); then
        NOT_READY_REASON="node labels query failed"
        STABLE=0
        return ${FAIL}
    fi
    # The labels are the last column of the node listing.
    labels=$(printf '%s\n' "${node_info}" | awk '{print $NF}')
    if [[ $labels =~ openstack-compute-node=enabled ]]; then
        # nova-compute is one of the last charts to recover after reboot
        pods=( $(kubectl get pods --namespace openstack --no-headers \
            --selector application=nova,component=compute \
            --field-selector \
            spec.nodeName=${this_node},status.phase=Running 2>/dev/null) )
        if [ ${#pods[@]} -eq 0 ]; then
            NOT_READY_REASON="nova-compute pod not running"
            STABLE=0
            return ${FAIL}
        fi

        # cinder-volume is one of the last charts to recover after reboot
        pods=( $(kubectl get pods --namespace openstack --no-headers \
            --selector application=cinder,component=volume \
            --field-selector \
            spec.nodeName=${this_node},status.phase=Running 2>/dev/null) )
        if [ ${#pods[@]} -eq 0 ]; then
            NOT_READY_REASON="cinder-volume pod not running"
            STABLE=0
            return ${FAIL}
        fi

        # Check that all openstack pods on this node have recovered
        if ! npods=$(_count_unrecovered_pods openstack "${this_node}"); then
            NOT_READY_REASON="openstack pods query failed"
            STABLE=0
            return ${FAIL}
        fi
        if [ "${npods}" -gt 0 ]; then
            NOT_READY_REASON="${npods} openstack pods not recovered"
            STABLE=0
            return ${FAIL}
        fi
    fi

    # Evaluate elapsed time since check criteria pass
    if [ ${STABLE} -eq 0 ]; then
        STABLE=${SECONDS}
    fi
    dt=$(( ${SECONDS} - ${STABLE} ))
    if [ ${dt} -lt ${STABILIZATION_SECONDS} ]; then
        NOT_READY_REASON="stabilization wait"
        return ${FAIL}
    fi

    LOG "K8S is ready"
    return ${PASS}
}

# Print the number of pods in namespace $1 scheduled on node $2 whose
# kubectl listing line shows neither "Running" nor "Completed".
# Returns non-zero, printing nothing, when the kubectl query itself fails.
function _count_unrecovered_pods {
    local namespace=$1
    local node=$2
    local listing

    listing=$(kubectl get pods --namespace "${namespace}" --no-headers \
        --field-selector "spec.nodeName=${node}" 2>/dev/null) || return 1

    # printf '%s' (rather than a here-string) so that an empty listing
    # feeds awk no lines at all and is counted as zero pods.
    printf '%s' "${listing}" | awk '
        BEGIN { n=0; }
        !/Completed|Running/ { n+=1 }
        END { printf "%d\n", n; }
    '
}
|
||||
|
||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||
# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
|
||||
function reaffineable_pids {
    # Prints, one per line, the pids that may be reaffined.
    local pids_excl
    local pidlist

    # Comma-separated pids of every thread whose ps line matches
    # eal-intr-thread or kthreadd; the separator is emitted before each
    # entry but the first, so the list has no trailing comma.
    pids_excl=$(ps -eL -o pid=,comm= | \
        awk '/eal-intr-thread|kthreadd/ { printf "%s%s", sep, $1; sep="," }')

    # --deselect inverts the selection: everything that is neither one of
    # the excluded pids nor a child of one. Of those, drop any whose cgroup
    # column mentions k8s-infra or machine.slice.
    pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
        -o pid=,cgroup= | \
        awk '!/k8s-infra|machine.slice/ {print $1; }')
    echo "${pidlist}"
}
|
||||
|
||||
function affine_tasks_to_all_cores {
    # Set the affinity of every reaffineable process, including all of its
    # threads, to the non-isolated cpus. taskset failures are ignored.
    local pidlist
    local count=0

    LOG "Affine all tasks, CPUS: ${NONISOL_CPUS};" \
        "online=${ONLINE_CPUS} (0x${ONLINE_MASK})," \
        "isol=${ISOL_CPUS}, nonisol=${NONISOL_CPUS} (0x${NONISOL_MASK})"

    pidlist=( $(reaffineable_pids) )
    for pid in "${pidlist[@]}"; do
        taskset --all-tasks --pid --cpu-list \
            ${NONISOL_CPUS} ${pid} > /dev/null 2>&1
    done

    # Every listed pid is attempted exactly once, so the number of
    # processes handled equals the length of the list.
    count=${#pidlist[@]}

    LOG "Affined ${count} processes to all cores."
}
|
||||
|
||||
function affine_tasks_to_platform_cores {
    # Move floating tasks back onto the platform cpus:
    #  1. every reaffineable process whose current affinity mask equals
    #     NONISOL_MASK (i.e. it is still floating on all non-isolated cpus);
    #  2. threads of eal-intr-thread (vSwitch) processes whose allowed cpu
    #     list spans more than one cpu.
    # taskset failures are ignored (best effort).
    local pidlist
    local vswitch_pids
    local pid
    local pid_mask
    local count=0

    LOG "Affine all tasks, PLATFORM_CPUS=${PLATFORM_CPUS}"

    pidlist=( $(reaffineable_pids) )
    for pid in "${pidlist[@]}"; do
        # Field 6 of "pid N's current affinity mask: MASK" is the mask.
        pid_mask=$(taskset -p $pid 2> /dev/null | awk '{print $6}')
        if [ "${pid_mask}" == "${NONISOL_MASK}" ]; then
            count=$((${count} + 1))
            taskset --all-tasks --pid --cpu-list \
                ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
        fi
    done

    # Reaffine vSwitch tasks that span multiple cpus to platform cpus.
    # A separate array is used on purpose: assigning a plain string to
    # pidlist would only overwrite element 0 and leave the remaining
    # ordinary pids in the list, so they would be walked here as well.
    vswitch_pids=( $(ps -eL -o pid=,comm= | \
        awk '/eal-intr-thread/ {print $1}') )
    for pid in "${vswitch_pids[@]}"; do
        count=$((${count} + 1))
        # After turning "/" into spaces, field 4 of each grep hit is the
        # thread id; only lists containing "," or "-" span several cpus.
        grep Cpus_allowed_list /proc/${pid}/task/*/status 2>/dev/null | \
            sed 's#/# #g' | awk '/,|-/ {print $4}' | \
            xargs --no-run-if-empty -i{} \
            taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
    done

    LOG "Affined ${count} processes to platform cores."
}
|
||||
|
||||
function start {
    # Let tasks and K8S pods float on all non-isolated cpus while the node
    # recovers, then pin them back to the platform cpus once steady state
    # is reached. A pidfile guards against concurrent instances.

    # Ensure this only runs on AIO
    if [[ "$nodetype" != "controller" ]] || [[ $subfunction != *worker* ]]; then
        LOG "Not AIO, nothing to do."
        return
    fi

    # Abort if another instantiation is already running; a pidfile whose
    # process no longer exists is stale and is removed.
    if [ -e ${PIDFILE} ]; then
        PID=$(cat ${PIDFILE})
        if [ -z "${PID}" ] || [ ! -e /proc/${PID} ]; then
            OUT=$(rm -v -f ${PIDFILE})
            LOG "${OUT}"
        else
            ERROR "Aborting, ${PID} already running: ${PIDFILE}."
            exit 1
        fi
    fi

    LOG "Starting."

    # Create pidfile to indicate the script is running
    echo $$ > ${PIDFILE}

    # Affine all tasks to float on all cores
    affine_tasks_to_all_cores

    # Wait for kubelet to be running
    _wait_until is_k8s_platform_ready ${INIT_INTERVAL_SECONDS}

    # Update K8S cpuset so that pods float on all cpus
    # NOTE: dynamic cpuset changes incompatible with static policy
    is_static_cpu_manager_policy || update_cgroup_cpuset_k8s_infra_all

    # Wait until K8s pods have recovered and nova-compute is running
    _wait_until is_k8s_platform_steady_state_ready ${CHECK_INTERVAL_SECONDS}

    # Update K8S cpuset to platform cores
    is_static_cpu_manager_policy || update_cgroup_cpuset_k8s_infra_platform

    # Affine all floating tasks back to platform cores
    affine_tasks_to_platform_cores

    # Remove pidfile after successful completion
    rm -f ${PIDFILE}

    LOG "Complete."
}

# Poll readiness check $1 every $2 seconds until it succeeds. While
# waiting, log the current NOT_READY_REASON at most once per
# PRINT_INTERVAL_SECONDS.
function _wait_until {
    local predicate=$1
    local poll_seconds=$2
    local last_print=${SECONDS}

    while ! ${predicate}; do
        if (( SECONDS - last_print >= PRINT_INTERVAL_SECONDS )); then
            last_print=${SECONDS}
            LOG "Recovery wait, elapsed ${SECONDS} seconds." \
                "Reason: ${NOT_READY_REASON}"
        fi
        sleep ${poll_seconds}
    done
}
|
||||
|
||||
function stop {
    # Kill the instance recorded in the pidfile, if it is still alive,
    # and remove the pidfile.
    LOG "Stopping."

    # Without a pidfile there is nothing recorded to stop.
    if [ ! -e ${PIDFILE} ]; then
        return 0
    fi

    # Forcibly stop any running instantiation
    PID=$(cat ${PIDFILE})
    if [ -n "${PID}" ] && [ -e /proc/${PID} ]; then
        LOG "Stopping ${PID}: ${PIDFILE}."
        kill -9 ${PID}
        # tail --pid exits when the process is gone; give up after 20s.
        timeout 20 tail --pid=${PID} -f /dev/null
    fi
    OUT=$(rm -v -f ${PIDFILE})
    LOG "${OUT}"
}
|
||||
|
||||
function status {
    # No-op placeholder: performs no work and reports success.
    return 0
}
|
||||
|
||||
reset()
|
||||
{
|
||||
function reset {
    # No-op placeholder: performs no work and reports success.
    return 0
}
|
||||
|
||||
if [ ${UID} -ne 0 ]; then
|
||||
ERROR "Need sudo/root permission."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
case "$1" in
|
||||
start)
|
||||
start
|
||||
|
@ -29,19 +29,22 @@ LOG_DEBUG=1
|
||||
TAG="TASKAFFINITY:"
|
||||
|
||||
TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
|
||||
N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \
|
||||
awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}')
|
||||
N_CPUS=$(getconf _NPROCESSORS_ONLN)
|
||||
FULLSET_CPUS="0-"$((N_CPUS-1))
|
||||
FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS})
|
||||
PLATFORM_CPUS=$(get_platform_cpu_list)
|
||||
PLATFORM_CPULIST=$(get_platform_cpu_list| \
|
||||
PLATFORM_CPUS=$(platform_expanded_cpu_list)
|
||||
PLATFORM_CPULIST=$(platform_expanded_cpu_list| \
|
||||
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||
sed 's/,/ /g')
|
||||
VSWITCH_CPULIST=$(get_vswitch_cpu_list| \
|
||||
perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
|
||||
sed 's/,/ /g')
|
||||
if [[ $vswitch_type =~ none ]]; then
|
||||
VSWITCH_CPULIST=""
|
||||
fi
|
||||
|
||||
IDLE_MARK=95.0
|
||||
KERNEL=`uname -a`
|
||||
KERNEL=$(uname -a)
|
||||
|
||||
################################################################################
|
||||
# Check if a given core is one of the platform cores
|
||||
@ -69,98 +72,19 @@ function is_vswitch_core {
|
||||
return 0
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# An audit and corrective action following a swact
|
||||
################################################################################
|
||||
function audit_and_reaffine {
|
||||
local mask=$1
|
||||
local cmd_str=""
|
||||
local tasklist
|
||||
|
||||
cmd_str="ps-sched.sh|awk '(\$9==\"$mask\") {print \$2}'"
|
||||
|
||||
tasklist=($(eval $cmd_str))
|
||||
# log_debug "cmd str = $cmd_str"
|
||||
log_debug "${TAG} There are ${#tasklist[@]} tasks to reaffine."
|
||||
|
||||
for task in ${tasklist[@]}; do
|
||||
taskset -acp ${PLATFORM_CPUS} $task &> /dev/null
|
||||
rc=$?
|
||||
[[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
|
||||
done
|
||||
tasklist=($(eval $cmd_str))
|
||||
[[ ${#tasklist[@]} -eq 0 ]] && return 0 || return 1
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# The following function is used to verify that any sleeping management tasks
|
||||
# that are on non-platform cores can be migrated to platform cores as soon as
|
||||
# they are scheduled. It can be invoked either manually or from goenableCompute
|
||||
# script as a scheduled job (with a few minute delay) if desired.
|
||||
# The induced tasks migration should be done after all VMs have been restored
|
||||
# following a host reboot in AIO, hence the delay.
|
||||
################################################################################
|
||||
function move_inactive_threads_to_platform_cores {
|
||||
local tasklist
|
||||
local cmd_str=""
|
||||
|
||||
# Compile a list of non-kernel & non-vswitch/VM related threads that are not
|
||||
# on platform cores.
|
||||
# e.g. if the platform cpulist value is "0 8", the resulting command to be
|
||||
# evaluated should look like this:
|
||||
# ps-sched.sh|grep -v vswitch|awk '($10!=0 && $10!=8 && $3!=2) {if(NR>1)print $2}'
|
||||
cmd_str="ps-sched.sh|grep -v vswitch|awk '("
|
||||
for cpu_num in ${PLATFORM_CPULIST}; do
|
||||
cmd_str=$cmd_str"\$10!="${cpu_num}" && "
|
||||
done
|
||||
cmd_str=$cmd_str"\$3!=2) {if(NR>1)print \$2}'"
|
||||
echo "selection string = $cmd_str"
|
||||
tasklist=($(eval $cmd_str))
|
||||
log_debug "${TAG} There are ${#tasklist[@]} number of tasks to be moved."
|
||||
|
||||
# These sleep tasks are stuck on the wrong core(s). They need to be woken up
|
||||
# so they can be migrated to the right ones. Attaching and detaching strace
|
||||
# momentarily to the task does the trick.
|
||||
for task in ${tasklist[@]}; do
|
||||
strace -p $task 2>/dev/null &
|
||||
pid=$!
|
||||
sleep 0.1
|
||||
kill -SIGINT $pid
|
||||
done
|
||||
tasklist=($(eval $cmd_str))
|
||||
[[ ${#tasklist[@]} -eq 0 ]] && return 0 || return 1
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# The following function is called by affine-platform.sh to affine tasks to
|
||||
# all available cores during initial startup and subsequent host reboots.
|
||||
################################################################################
|
||||
function affine_tasks_to_all_cores {
|
||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||
# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
|
||||
function reaffineable_pids {
|
||||
local pids_excl
|
||||
local pidlist
|
||||
local rc=0
|
||||
|
||||
if [[ "${KERNEL}" == *" RT "* ]]; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
log_debug "${TAG} Affining all tasks to CPU (${FULLSET_CPUS})"
|
||||
|
||||
pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }')
|
||||
for pid in ${pidlist[@]}; do
|
||||
ppid=$(ps -o ppid= -p $pid |tr -d '[:space:]')
|
||||
if [ -z $ppid ] || [ $ppid -eq 2 ]; then
|
||||
continue
|
||||
fi
|
||||
log_debug "Affining pid $pid, parent pid = $ppid"
|
||||
taskset --all-tasks --pid --cpu-list ${FULLSET_CPUS} $pid &> /dev/null
|
||||
rc=$?
|
||||
[[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
|
||||
done
|
||||
# Write the cpu list to a temp file which will be read and removed when
|
||||
# the tasks are reaffined back to platform cores later on.
|
||||
echo ${FULLSET_CPUS} > ${TASK_AFFINING_INCOMPLETE}
|
||||
|
||||
return $rc
|
||||
pids_excl=$(ps -eL -o pid=,comm= | \
|
||||
awk -vORS=',' '/eal-intr-thread|kthreadd/ {print $1}' | \
|
||||
sed 's/,$/\n/')
|
||||
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
|
||||
-o pid=,cgroup= | \
|
||||
awk '!/k8s-infra|machine.slice/ {print $1; }')
|
||||
echo "${pidlist[@]}"
|
||||
}
|
||||
|
||||
################################################################################
|
||||
@ -211,32 +135,22 @@ function affine_tasks_to_idle_cores {
|
||||
# Platform core is added to the idle list by default
|
||||
idle_cpulist=$idle_cpulist$cpu","
|
||||
else
|
||||
# Non platform core is added to the idle list if it is more than 95% idle
|
||||
[[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]] && idle_cpulist=$idle_cpulist$cpu","
|
||||
# Non platform core is added to the idle list if it is more
|
||||
# than 95% idle
|
||||
if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then
|
||||
idle_cpulist=$idle_cpulist$cpu","
|
||||
fi
|
||||
fi
|
||||
cpu=$(($cpu+1))
|
||||
done
|
||||
|
||||
idle_cpulist=$(echo $idle_cpulist|sed 's/.$//')
|
||||
platform_affinity_mask=$(cpulist_to_cpumap ${PLATFORM_CPUS} ${N_CPUS} \
|
||||
|awk '{print tolower($0)}')
|
||||
|
||||
log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)"
|
||||
|
||||
vswitch_pid=$(pgrep vswitch)
|
||||
pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }')
|
||||
pidlist=( $(reaffineable_pids) )
|
||||
for pid in ${pidlist[@]}; do
|
||||
ppid=$(ps -o ppid= -p $pid |tr -d '[:space:]')
|
||||
if [ -z $ppid ] || [ $ppid -eq 2 ] || [ "$pid" = "$vswitch_pid" ]; then
|
||||
continue
|
||||
fi
|
||||
pid_affinity_mask=$(taskset -p $pid | awk '{print $6}')
|
||||
if [ "${pid_affinity_mask}" == "${platform_affinity_mask}" ]; then
|
||||
# log_debug "Affining pid $pid to idle cores..."
|
||||
taskset --all-tasks --pid --cpu-list $idle_cpulist $pid &> /dev/null
|
||||
rc=$?
|
||||
[[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
|
||||
fi
|
||||
taskset --all-tasks --pid --cpu-list \
|
||||
${idle_cpulist} ${pid} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
# Save the cpu list to the temp file which will be read and removed when
|
||||
@ -246,10 +160,7 @@ function affine_tasks_to_idle_cores {
|
||||
}
|
||||
|
||||
################################################################################
|
||||
# The following function is called by either:
|
||||
# a) nova-compute wrapper script during AIO system initial bringup or reboot
|
||||
# or
|
||||
# b) sm at the end of swact sequence
|
||||
# The following function is called by sm at the end of swact sequence
|
||||
# to re-affine management tasks back to the platform cores.
|
||||
################################################################################
|
||||
function affine_tasks_to_platform_cores {
|
||||
@ -259,42 +170,32 @@ function affine_tasks_to_platform_cores {
|
||||
local count=0
|
||||
|
||||
if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
|
||||
dbg_str="${TAG} Either tasks have never been affined to all/idle cores or"
|
||||
dbg_str=$dbg_str" they have already been reaffined to platform cores."
|
||||
# Build the message in pieces; each continuation appends to the previous
# text instead of replacing it, so the full sentence is logged.
dbg_str="${TAG} Either tasks have never been affined to all/idle"
dbg_str="${dbg_str} cores or they have already been reaffined to"
dbg_str="${dbg_str} platform cores."
|
||||
log_debug "$dbg_str"
|
||||
return 0
|
||||
fi
|
||||
|
||||
read cpulist < ${TASK_AFFINING_INCOMPLETE}
|
||||
affinity_mask=$(cpulist_to_cpumap $cpulist ${N_CPUS}|awk '{print tolower($0)}')
|
||||
|
||||
log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
|
||||
pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }')
|
||||
pidlist=( $(reaffineable_pids) )
|
||||
for pid in ${pidlist[@]}; do
|
||||
# log_debug "Processing pid $pid..."
|
||||
pid_affinity_mask=$(taskset -p $pid | awk '{print $6}')
|
||||
# Only management tasks need to be reaffined. Kernel, vswitch and VM related
|
||||
# tasks were not affined previously so they should have different affinity
|
||||
# mask(s).
|
||||
if [ "${pid_affinity_mask}" == "${affinity_mask}" ]; then
|
||||
count=$(($count+1))
|
||||
# log_debug "Affining pid $pid to platform cores..."
|
||||
taskset --all-tasks --pid --cpu-list ${PLATFORM_CPUS} $pid &> /dev/null
|
||||
rc=$?
|
||||
[[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
|
||||
fi
|
||||
taskset --all-tasks --pid --cpu-list \
|
||||
${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
# A workaround for lack of "end of swact" state
|
||||
fullmask=$(echo ${FULLSET_MASK} | awk '{print tolower($0)}')
|
||||
if [ "${affinity_mask}" != "${fullmask}" ]; then
|
||||
log_debug "${TAG} Schedule an audit and cleanup"
|
||||
(sleep 60; audit_and_reaffine "0x"$affinity_mask) &
|
||||
fi
|
||||
# Reaffine vSwitch tasks that span multiple cpus to platform cpus.
# Array assignment replaces the whole list; a plain string assignment to
# an array variable would only overwrite element 0 and keep the remaining
# previously listed pids.
pidlist=( $(ps -eL -o pid=,comm= | awk '/eal-intr-thread/ {print $1}') )
|
||||
for pid in ${pidlist[@]}; do
|
||||
grep Cpus_allowed_list /proc/${pid}/task/*/status 2>/dev/null | \
|
||||
sed 's#/# #g' | awk '/,|-/ {print $4}' | \
|
||||
xargs --no-run-if-empty -i{} \
|
||||
taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
|
||||
done
|
||||
|
||||
rm -rf ${TASK_AFFINING_INCOMPLETE}
|
||||
log_debug "${TAG} $count tasks were reaffined to platform cores."
|
||||
|
||||
return $rc
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user