diff --git a/utilities/worker-utils/worker-utils/affine-platform.sh b/utilities/worker-utils/worker-utils/affine-platform.sh index 354970dc..1d0d847c 100755 --- a/utilities/worker-utils/worker-utils/affine-platform.sh +++ b/utilities/worker-utils/worker-utils/affine-platform.sh @@ -41,19 +41,6 @@ function affine_tasks { local PIDLIST local RET=0 - # Affine non-kernel-thread tasks (excluded [kthreadd] and its children) to all available - # cores. They will be reaffined to platform cores later on as part of nova-compute - # launch. - ##log_debug "Affining all tasks to all available CPUs..." - # TODO: Should revisit this since this leaves a few lingering floating - # tasks and does not really work with cgroup cpusets. - # Comment out for now. Cleanup required. - ##affine_tasks_to_all_cores - ##RET=$? - ##if [ $RET -ne 0 ]; then - ## log_error "Some tasks failed to be affined to all cores." - ##fi - # Get number of logical cpus N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \ awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? 
n : 1}') diff --git a/utilities/worker-utils/worker-utils/affine-tasks.service b/utilities/worker-utils/worker-utils/affine-tasks.service index 2248c931..f2e65655 100644 --- a/utilities/worker-utils/worker-utils/affine-tasks.service +++ b/utilities/worker-utils/worker-utils/affine-tasks.service @@ -1,11 +1,11 @@ [Unit] Description=StarlingX Affine Tasks After=syslog.service network.service dbus.service sw-patch.service affine-platform.sh.service -Before=kubelet.service +Before=workerconfig.service [Service] -Type=oneshot +Type=simple ExecStart=/etc/init.d/affine-tasks.sh start [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/utilities/worker-utils/worker-utils/affine-tasks.sh b/utilities/worker-utils/worker-utils/affine-tasks.sh index dc213a0e..303865eb 100644 --- a/utilities/worker-utils/worker-utils/affine-tasks.sh +++ b/utilities/worker-utils/worker-utils/affine-tasks.sh @@ -1,62 +1,441 @@ #!/bin/bash -############################################################################### +# # Copyright (c) 2019 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 # -############################################################################### + # +# chkconfig: 2345 80 80 +# + ### BEGIN INIT INFO # Provides: affine-tasks # Required-Start: # Required-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 -# Short-Description: affine tasks -# Description: This script will affine tasks to the platform cores of the -# host. This ensures that system processes are constrained to platform -# cores and will not run on cores with VMs/containers. +# Short-Description: reaffine tasks on AIO +# Description: This script will dynamically reaffine tasks +# and k8s-infra cgroup cpuset on AIO nodes only. This accommodates +# CPU intensive phases of work. Tasks are initially allowed to float +# across all cores.
Once system is at steady-state, this will ensure +# that K8S pods are constrained to platform cores and do not run on +# cores with VMs/containers. ### END INIT INFO +# +# Background: +# There is significant parallel CPU intensive activity: +# - during stx-application apply before critical openstack pods are running, +# e.g., to download docker images, and start all pods. +# - during init and pod recovery after reboot or DOR. +# +# This enables use of all cpus during CPU intensive phase, otherwise the +# startup processing time is considerably longer and we easily hit timeout. +# +# This script waits forever for sufficient platform readiness criteria +# (e.g., system critical pods are recovered, nova-compute is running, +# cinder-volume is running, openstack pods are running), and then waits +# a short stabilization period before reaffining to the platform cpus. +# +# NOTE: child cgroup cpuset and nodeset must be a subset of the parent +# cgroup's attributes. This requires traversing the tree hierarchy in +# specific order when dynamically modifying these attributes. +# +################################################################################ +# Define minimal path +PATH=/bin:/usr/bin:/usr/local/bin +CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"} +[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS} -. /usr/bin/tsconfig -. /etc/init.d/task_affinity_functions.sh +# Bring in platform definitions +.
/etc/platform/platform.conf -log () -{ - logger -p local1.info -t affine_tasks $@ - echo affine_tasks: "$@" +# Environment for kubectl +export KUBECONFIG=/etc/kubernetes/admin.conf + +# Global parameters +CGDIR_K8S=/sys/fs/cgroup/cpuset/k8s-infra +INIT_INTERVAL_SECONDS=10 +CHECK_INTERVAL_SECONDS=30 +PRINT_INTERVAL_SECONDS=300 +STABILIZATION_SECONDS=150 + +# Define pidfile, named after the resolved script file name +LNAME=$(readlink -n -f "$0") +NAME=$(basename "${LNAME}") +PIDFILE=/var/run/${NAME}.pid + +# Define number of logical cpus +LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN) + +# Define the memory nodeset and cpuset that span all online cpus and nodes +ONLINE_NODES=$(/bin/cat /sys/devices/system/node/online) +ONLINE_CPUS=$(/bin/cat /sys/devices/system/cpu/online) +ONLINE_MASK=$(cpulist_to_cpumap ${ONLINE_CPUS} ${LOGICAL_CPUS} | \ + awk '{print tolower($0)}') + +ISOL_CPUS=$(/bin/cat /sys/devices/system/cpu/isolated) +if [ -n "${ISOL_CPUS}" ]; then + ISOL_CPUMAP=$(cpulist_to_cpumap ${ISOL_CPUS} ${LOGICAL_CPUS}) + NONISOL_CPUMAP=$(invert_cpumap ${ISOL_CPUMAP} ${LOGICAL_CPUS}) + NONISOL_CPUS=$(cpumap_to_cpulist ${NONISOL_CPUMAP} ${LOGICAL_CPUS}) + NONISOL_MASK=$(cpulist_to_cpumap ${NONISOL_CPUS} ${LOGICAL_CPUS} | \ + awk '{print tolower($0)}') +else + ISOL_CPUMAP='0' + NONISOL_CPUS=${ONLINE_CPUS} + NONISOL_MASK=${ONLINE_MASK} +fi + +# Define platform memory nodeset (all online nodes) and platform cpuset +PLATFORM_NODES=${ONLINE_NODES} +PLATFORM_CPUS=$(platform_expanded_cpu_list) + +# Global variables +NOT_READY_REASON="" +STABLE=0 + +# Log info message to /var/log/daemon.log +function LOG { + logger -p daemon.info -t "${NAME}($$): " "$@" } -start () -{ - log "Starting affine_tasks. Reaffining tasks to platform cores..." - if [ ! -f ${INITIAL_CONFIG_COMPLETE_FLAG} ]; then - log "Initial Configuration incomplete. Skipping affining tasks."
- exit 0 +# Log error message to /var/log/daemon.log +function ERROR { + logger -s -p daemon.error -t "${NAME}($$): " "$@" +} + +# Update cgroup k8s-infra cpuset and nodeset to span all non-isolated cpus. +function update_cgroup_cpuset_k8s_infra_all { + # Set all cgroup cpuset and nodeset in tree hierarchy order. + # This will always work, no matter the previous cpuset state. + find ${CGDIR_K8S} -type d | \ + while IFS= read -r d; do + /bin/echo ${ONLINE_NODES} > "${d}/cpuset.mems" 2>/dev/null + /bin/echo ${NONISOL_CPUS} > "${d}/cpuset.cpus" 2>/dev/null + done + LOG "Update ${CGDIR_K8S}," \ + "ONLINE_NODES=${ONLINE_NODES}, NONISOL_CPUS=${NONISOL_CPUS}" +} + +# Update cgroup k8s-infra to span platform cpuset and nodeset. +function update_cgroup_cpuset_k8s_infra_platform { + # Clear any existing cpuset settings. This ensures that the + # subsequent shrink to platform cpuset will always work. + update_cgroup_cpuset_k8s_infra_all + + # Set all cgroup cpuset and nodeset in depth-first order. + # NOTE: this only works if we are shrinking the cpuset. + find ${CGDIR_K8S} -depth -type d | \ + while IFS= read -r d; do + /bin/echo ${PLATFORM_NODES} > "${d}/cpuset.mems" 2>/dev/null + /bin/echo ${PLATFORM_CPUS} > "${d}/cpuset.cpus" 2>/dev/null + done + LOG "Update ${CGDIR_K8S}," \ + "PLATFORM_NODES=${PLATFORM_NODES}, PLATFORM_CPUS=${PLATFORM_CPUS}" +} + +# Check criteria for K8s platform ready on this node. +# i.e., k8s-infra is configured, kubelet is running +function is_k8s_platform_ready { + local PASS=0 + local FAIL=1 + + # Global variable + NOT_READY_REASON="" + + # Check that cgroup cpuset k8s-infra has been configured + if [ ! -e ${CGDIR_K8S} ]; then + NOT_READY_REASON="k8s-infra not configured" + return ${FAIL} fi - # TODO: Should revisit this since this leaves a few lingering floating - # tasks and does not really work with cgroup cpusets. - # Comment out for now. Cleanup required. - ##affine_tasks_to_platform_cores - ##[[ $? -eq 0 ]] && log "Tasks re-affining done."
|| log "Tasks re-affining failed." + + # Check that kubelet is running and stable (an empty MainPID counts as 0) + if systemctl is-active kubelet --quiet; then + PID=$(systemctl show kubelet.service -p MainPID | \ + awk -vFS='=' '{print $2}') + if [ "${PID:-0}" -eq 0 ]; then + NOT_READY_REASON="kubelet not running" + return ${FAIL} + fi + up=$(ps -p ${PID} -o etimes= 2>/dev/null | awk '{print $1}') + if ! { [[ -n "${up}" && ${up} -ge 30 ]]; } + then + NOT_READY_REASON="kubelet not yet stable" + return ${FAIL} + fi + else + NOT_READY_REASON="kubelet not running" + return ${FAIL} + fi + + LOG "kubelet is ready" + return ${PASS} } -stop () -{ - log "Stopping affine_tasks..." +# Determine whether this node has 'static' cpu manager policy. +# NOTE: This check assumes that kubelet is already running locally. +function is_static_cpu_manager_policy { + local PASS=0 + local FAIL=1 + + state=$(cat /var/lib/kubelet/cpu_manager_state 2>/dev/null) + if [[ $state =~ \"policyName\":.?\"static\" ]]; then + return ${PASS} + else + return ${FAIL} + fi } -status() -{ +# Check criteria for K8s platform steady-state ready on this node. +# i.e., kube-system pods have recovered, kube application apply +# has completed, nova-compute is running, cinder-volume is running. +# NOTE: This function depends on kubectl commands, so is only +# usable on controllers. +function is_k8s_platform_steady_state_ready { + local PASS=0 + local FAIL=1 + local this_node=${HOSTNAME} + + # Global variable + NOT_READY_REASON="" + + # Check that kube-system pods have recovered on this node (no kubectl output, e.g. a failed query, counts as not recovered) + npods=$(kubectl get pods --namespace kube-system --no-headers \ + --field-selector spec.nodeName=${this_node} 2>/dev/null | \ + awk ' +BEGIN { n=0; } +!/Completed|Running/ { n+=1 } +END { printf "%d\n", (NR > 0) ? n : 1; } +') + if [ ${npods} -gt 0 ]; then + NOT_READY_REASON="${npods} kube-system pods not recovered" + STABLE=0 + return ${FAIL} + fi + + # Wait for a few critical openstack pods to be running if this is + # an openstack-compute-node.
This is not an exhaustive list. + # Make sure that all openstack pods on this node are running. + labels=$(kubectl get node ${this_node} \ + --no-headers --show-labels 2>/dev/null | awk '{print $NF}') + if [[ $labels =~ openstack-compute-node=enabled ]]; then + # nova-compute is one of the last charts to recover after reboot + PODS=( $(kubectl get pods --namespace openstack --no-headers \ + --selector application=nova,component=compute \ + --field-selector \ + spec.nodeName=${this_node},status.phase=Running 2>/dev/null) ) + if [ ${#PODS[@]} -eq 0 ]; then + NOT_READY_REASON="nova-compute pod not running" + STABLE=0 + return ${FAIL} + fi + + # cinder-volume is one of the last charts to recover after reboot + PODS=( $(kubectl get pods --namespace openstack --no-headers \ + --selector application=cinder,component=volume \ + --field-selector \ + spec.nodeName=${this_node},status.phase=Running 2>/dev/null) ) + if [ ${#PODS[@]} -eq 0 ]; then + NOT_READY_REASON="cinder-volume pod not running" + STABLE=0 + return ${FAIL} + fi + + # Check that all openstack pods on this node have recovered (no kubectl output, e.g. a failed query, counts as not recovered) + npods=$(kubectl get pods --namespace openstack --no-headers \ + --field-selector spec.nodeName=${this_node} 2>/dev/null | \ + awk ' +BEGIN { n=0; } +!/Completed|Running/ { n+=1 } +END { printf "%d\n", (NR > 0) ? n : 1; } +') + if [ ${npods} -gt 0 ]; then + NOT_READY_REASON="${npods} openstack pods not recovered" + STABLE=0 + return ${FAIL} + fi + fi + + # Evaluate elapsed time since check criteria pass + if [ ${STABLE} -eq 0 ]; then + STABLE=${SECONDS} + fi + dt=$(( ${SECONDS} - ${STABLE} )) + if [ ${dt} -lt ${STABILIZATION_SECONDS} ]; then + NOT_READY_REASON="stabilization wait" + return ${FAIL} + fi + + LOG "K8S is ready" + return ${PASS} +} + +# Return list of reaffineable pids. This includes all processes, but excludes +# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
+function reaffineable_pids { + local pids_excl + local pidlist + + pids_excl=$(ps -eL -o pid=,comm= | \ + awk -vORS=',' '/eal-intr-thread|kthreadd/ {print $1}' | \ + sed 's/,$/\n/') + pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \ + -o pid=,cgroup= | \ + awk '!/k8s-infra|machine.slice/ {print $1; }') + echo "${pidlist[@]}" +} + +function affine_tasks_to_all_cores { + local pidlist + local count=0 + + LOG "Affine all tasks, CPUS: ${NONISOL_CPUS};" \ + "online=${ONLINE_CPUS} (0x${ONLINE_MASK})," \ + "isol=${ISOL_CPUS}, nonisol=${NONISOL_CPUS} (0x${NONISOL_MASK})" + + pidlist=( $(reaffineable_pids) ) + for pid in ${pidlist[@]}; do + count=$((${count} + 1)) + taskset --all-tasks --pid --cpu-list \ + ${NONISOL_CPUS} ${pid} > /dev/null 2>&1 + done + + LOG "Affined ${count} processes to all cores." +} + +function affine_tasks_to_platform_cores { + local pidlist + local count=0 + + LOG "Affine all tasks, PLATFORM_CPUS=${PLATFORM_CPUS}" + + pidlist=( $(reaffineable_pids) ) + for pid in ${pidlist[@]}; do + pid_mask=$(taskset -p $pid 2> /dev/null | awk '{print $6}') + if [ "${pid_mask}" == "${NONISOL_MASK}" ]; then + count=$((${count} + 1)) + taskset --all-tasks --pid --cpu-list \ + ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1 + fi + done + + # Reaffine vSwitch tasks that span multiple cpus to platform cpus (replace the whole pid array, not just element 0) + pidlist=( $(ps -eL -o pid=,comm= | awk '/eal-intr-thread/ {print $1}') ) + for pid in ${pidlist[@]}; do + count=$((${count} + 1)) + grep Cpus_allowed_list /proc/${pid}/task/*/status 2>/dev/null | \ + sed 's#/# #g' | awk '/,|-/ {print $4}' | \ + xargs --no-run-if-empty -i{} \ + taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1 + done + + LOG "Affined ${count} processes to platform cores." +} + +function start { + # Ensure this only runs on AIO + if ! { [[ "$nodetype" = "controller" ]] && [[ $subfunction = *worker* ]]; } + then + LOG "Not AIO, nothing to do."
+ return + fi + + # Abort if another instantiation is already running + if [ -e ${PIDFILE} ]; then + PID=$(cat ${PIDFILE}) + if [[ -n "${PID}" && -e /proc/${PID} ]]; then + ERROR "Aborting, ${PID} already running: ${PIDFILE}." + exit 1 + else + OUT=$(rm -v -f ${PIDFILE}) + LOG "${OUT}" + fi + fi + + LOG "Starting." + + # Create pidfile to indicate the script is running + echo $$ > ${PIDFILE} + + # Affine all tasks to float on all cores + affine_tasks_to_all_cores + + # Wait for kubelet to be running + t0=${SECONDS} + until is_k8s_platform_ready; do + dt=$(( ${SECONDS} - ${t0} )) + if [ ${dt} -ge ${PRINT_INTERVAL_SECONDS} ]; then + t0=${SECONDS} + LOG "Recovery wait, elapsed ${SECONDS} seconds." \ + "Reason: ${NOT_READY_REASON}" + fi + sleep ${INIT_INTERVAL_SECONDS} + done + + # Update K8S cpuset so that pods float on all cpus + # NOTE: dynamic cpuset changes incompatible with static policy + if ! is_static_cpu_manager_policy; then + update_cgroup_cpuset_k8s_infra_all + fi + + # Wait until K8s pods have recovered and nova-compute is running + t0=${SECONDS} + until is_k8s_platform_steady_state_ready; do + dt=$(( ${SECONDS} - ${t0} )) + if [ ${dt} -ge ${PRINT_INTERVAL_SECONDS} ]; then + t0=${SECONDS} + LOG "Recovery wait, elapsed ${SECONDS} seconds." \ + "Reason: ${NOT_READY_REASON}" + fi + sleep ${CHECK_INTERVAL_SECONDS} + done + + # Update K8S cpuset to platform cores + if ! is_static_cpu_manager_policy; then + update_cgroup_cpuset_k8s_infra_platform + fi + + # Affine all floating tasks back to platform cores + affine_tasks_to_platform_cores + + # Remove pidfile after successful completion + rm -f ${PIDFILE} + + LOG "Complete." +} + +function stop { + LOG "Stopping." + + # Forcibly stop any running instantiation + if [ -e ${PIDFILE} ]; then + PID=$(cat ${PIDFILE}) + if [[ -n "${PID}" && -e /proc/${PID} ]]; then + LOG "Stopping ${PID}: ${PIDFILE}."
+ kill -9 ${PID} + timeout 20 tail --pid=${PID} -f /dev/null + fi + OUT=$(rm -v -f ${PIDFILE}) + LOG "${OUT}" + fi +} + +function status { : } -reset() -{ +function reset { : } +if [ ${UID} -ne 0 ]; then + ERROR "Need sudo/root permission." + exit 1 +fi + case "$1" in start) start diff --git a/utilities/worker-utils/worker-utils/task_affinity_functions.sh b/utilities/worker-utils/worker-utils/task_affinity_functions.sh index 775d1445..4b184310 100755 --- a/utilities/worker-utils/worker-utils/task_affinity_functions.sh +++ b/utilities/worker-utils/worker-utils/task_affinity_functions.sh @@ -29,19 +29,22 @@ LOG_DEBUG=1 TAG="TASKAFFINITY:" TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete" -N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \ - awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}') +N_CPUS=$(getconf _NPROCESSORS_ONLN) FULLSET_CPUS="0-"$((N_CPUS-1)) FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS}) -PLATFORM_CPUS=$(get_platform_cpu_list) -PLATFORM_CPULIST=$(get_platform_cpu_list| \ +PLATFORM_CPUS=$(platform_expanded_cpu_list) +PLATFORM_CPULIST=$(platform_expanded_cpu_list| \ perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ sed 's/,/ /g') VSWITCH_CPULIST=$(get_vswitch_cpu_list| \ perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \ sed 's/,/ /g') +if [[ $vswitch_type =~ none ]]; then + VSWITCH_CPULIST="" +fi + IDLE_MARK=95.0 -KERNEL=`uname -a` +KERNEL=$(uname -a) ################################################################################ # Check if a given core is one of the platform cores @@ -69,98 +72,19 @@ function is_vswitch_core { return 0 } -################################################################################ -# An audit and corrective action following a swact -################################################################################ -function audit_and_reaffine { - local mask=$1 - local cmd_str="" - local tasklist - - cmd_str="ps-sched.sh|awk '(\$9==\"$mask\") {print \$2}'" - - tasklist=($(eval $cmd_str)) -
# log_debug "cmd str = $cmd_str" - log_debug "${TAG} There are ${#tasklist[@]} tasks to reaffine." - - for task in ${tasklist[@]}; do - taskset -acp ${PLATFORM_CPUS} $task &> /dev/null - rc=$? - [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc" - done - tasklist=($(eval $cmd_str)) - [[ ${#tasklist[@]} -eq 0 ]] && return 0 || return 1 -} - -################################################################################ -# The following function is used to verify that any sleeping management tasks -# that are on non-platform cores can be migrated to platform cores as soon as -# they are scheduled. It can be invoked either manually or from goenableCompute -# script as a scheduled job (with a few minute delay) if desired. -# The induced tasks migration should be done after all VMs have been restored -# following a host reboot in AIO, hence the delay. -################################################################################ -function move_inactive_threads_to_platform_cores { - local tasklist - local cmd_str="" - - # Compile a list of non-kernel & non-vswitch/VM related threads that are not - # on platform cores. - # e.g. if the platform cpulist value is "0 8", the resulting command to be - # evaluated should look like this: - # ps-sched.sh|grep -v vswitch|awk '($10!=0 && $10!=8 && $3!=2) {if(NR>1)print $2}' - cmd_str="ps-sched.sh|grep -v vswitch|awk '(" - for cpu_num in ${PLATFORM_CPULIST}; do - cmd_str=$cmd_str"\$10!="${cpu_num}" && " - done - cmd_str=$cmd_str"\$3!=2) {if(NR>1)print \$2}'" - echo "selection string = $cmd_str" - tasklist=($(eval $cmd_str)) - log_debug "${TAG} There are ${#tasklist[@]} number of tasks to be moved." - - # These sleep tasks are stuck on the wrong core(s). They need to be woken up - # so they can be migrated to the right ones. Attaching and detaching strace - # momentarily to the task does the trick. - for task in ${tasklist[@]}; do - strace -p $task 2>/dev/null & - pid=$!
- sleep 0.1 - kill -SIGINT $pid - done - tasklist=($(eval $cmd_str)) - [[ ${#tasklist[@]} -eq 0 ]] && return 0 || return 1 -} - -################################################################################ -# The following function is called by affine-platform.sh to affine tasks to -# all available cores during initial startup and subsequent host reboots. -################################################################################ -function affine_tasks_to_all_cores { +# Return list of reaffineable pids. This includes all processes, but excludes +# kernel threads, vSwitch, and anything in K8S or qemu/kvm. +function reaffineable_pids { + local pids_excl local pidlist - local rc=0 - if [[ "${KERNEL}" == *" RT "* ]]; then - return 0 - fi - - log_debug "${TAG} Affining all tasks to CPU (${FULLSET_CPUS})" - - pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }') - for pid in ${pidlist[@]}; do - ppid=$(ps -o ppid= -p $pid |tr -d '[:space:]') - if [ -z $ppid ] || [ $ppid -eq 2 ]; then - continue - fi - log_debug "Affining pid $pid, parent pid = $ppid" - taskset --all-tasks --pid --cpu-list ${FULLSET_CPUS} $pid &> /dev/null - rc=$? - [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc" - done - # Write the cpu list to a temp file which will be read and removed when - # the tasks are reaffined back to platform cores later on. - echo ${FULLSET_CPUS} > ${TASK_AFFINING_INCOMPLETE} - - return $rc + pids_excl=$(ps -eL -o pid=,comm= | \ + awk -vORS=',' '/eal-intr-thread|kthreadd/ {print $1}' | \ + sed 's/,$/\n/') + pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \ + -o pid=,cgroup= | \ + awk '!/k8s-infra|machine.slice/ {print $1; }') + echo "${pidlist[@]}" } ################################################################################ @@ -208,35 +132,25 @@ function affine_tasks_to_idle_cores { is_platform_core $cpu if [ $?
-eq 1 ]; then - # Platform core is added to the idle list by default + # Platform core is added to the idle list by default idle_cpulist=$idle_cpulist$cpu"," else - # Non platform core is added to the idle list if it is more than 95% idle - [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]] && idle_cpulist=$idle_cpulist$cpu"," + # Non platform core is added to the idle list if it is more + # than 95% idle + if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then + idle_cpulist=$idle_cpulist$cpu"," + fi fi cpu=$(($cpu+1)) done idle_cpulist=$(echo $idle_cpulist|sed 's/.$//') - platform_affinity_mask=$(cpulist_to_cpumap ${PLATFORM_CPUS} ${N_CPUS} \ - |awk '{print tolower($0)}') log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)" - - vswitch_pid=$(pgrep vswitch) - pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }') + pidlist=( $(reaffineable_pids) ) for pid in ${pidlist[@]}; do - ppid=$(ps -o ppid= -p $pid |tr -d '[:space:]') - if [ -z $ppid ] || [ $ppid -eq 2 ] || [ "$pid" = "$vswitch_pid" ]; then - continue - fi - pid_affinity_mask=$(taskset -p $pid | awk '{print $6}') - if [ "${pid_affinity_mask}" == "${platform_affinity_mask}" ]; then - # log_debug "Affining pid $pid to idle cores..." - taskset --all-tasks --pid --cpu-list $idle_cpulist $pid &> /dev/null - rc=$?
- [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc" - fi + taskset --all-tasks --pid --cpu-list \ + ${idle_cpulist} ${pid} > /dev/null 2>&1 done # Save the cpu list to the temp file which will be read and removed when @@ -246,10 +160,7 @@ function affine_tasks_to_idle_cores { } ################################################################################ -# The following function is called by either: -# a) nova-compute wrapper script during AIO system initial bringup or reboot -# or -# b) sm at the end of swact sequence +# The following function is called by sm at the end of swact sequence # to re-affine management tasks back to the platform cores. ################################################################################ function affine_tasks_to_platform_cores { @@ -259,42 +170,32 @@ function affine_tasks_to_platform_cores { local count=0 if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then - dbg_str="${TAG} Either tasks have never been affined to all/idle cores or" - dbg_str=$dbg_str" they have already been reaffined to platform cores." + dbg_str="${TAG} Either tasks have never been affined to all/idle" + dbg_str+=" cores or they have already been reaffined to" + dbg_str+=" platform cores." log_debug "$dbg_str" return 0 fi read cpulist < ${TASK_AFFINING_INCOMPLETE} - affinity_mask=$(cpulist_to_cpumap $cpulist ${N_CPUS}|awk '{print tolower($0)}') log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..." - pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }') + pidlist=( $(reaffineable_pids) ) for pid in ${pidlist[@]}; do - # log_debug "Processing pid $pid..." - pid_affinity_mask=$(taskset -p $pid | awk '{print $6}') - # Only management tasks need to be reaffined. Kernel, vswitch and VM related - # tasks were not affined previously so they should have different affinity - # mask(s).
- if [ "${pid_affinity_mask}" == "${affinity_mask}" ]; then - count=$(($count+1)) - # log_debug "Affining pid $pid to platform cores..." - taskset --all-tasks --pid --cpu-list ${PLATFORM_CPUS} $pid &> /dev/null - rc=$? - [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc" - fi + taskset --all-tasks --pid --cpu-list \ + ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1 done - # A workaround for lack of "end of swact" state - fullmask=$(echo ${FULLSET_MASK} | awk '{print tolower($0)}') - if [ "${affinity_mask}" != "${fullmask}" ]; then - log_debug "${TAG} Schedule an audit and cleanup" - (sleep 60; audit_and_reaffine "0x"$affinity_mask) & - fi + # Reaffine vSwitch tasks that span multiple cpus to platform cpus (replace the whole pid array, not just element 0) + pidlist=( $(ps -eL -o pid=,comm= | awk '/eal-intr-thread/ {print $1}') ) + for pid in ${pidlist[@]}; do + grep Cpus_allowed_list /proc/${pid}/task/*/status 2>/dev/null | \ + sed 's#/# #g' | awk '/,|-/ {print $4}' | \ + xargs --no-run-if-empty -i{} \ + taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1 + done rm -rf ${TASK_AFFINING_INCOMPLETE} - log_debug "${TAG} $count tasks were reaffined to platform cores." - return $rc }