Merge "AIO reaffine tasks and k8s-infra during startup"

2019-07-16 19:04:34 +00:00 · 2019-07-16 19:04:34 +00:00 · 4fe2b6bed5
commit 4fe2b6bed5
parent ea80dae321 dba4175523
7 changed files with 463 additions and 194 deletions
--- a/puppet-manifests/src/modules/platform/manifests/compute.pp
+++ b/puppet-manifests/src/modules/platform/manifests/compute.pp
@ -16,11 +16,13 @@ class platform::compute::config
      content => template('platform/worker_reserved.conf.erb')
  }

+  if $::platform::params::system_type != 'All-in-one' {
    file { '/etc/systemd/system.conf.d/platform-cpuaffinity.conf':
        ensure  => 'present',
        replace => true,
        content => template('platform/systemd-system-cpuaffinity.conf.erb')
    }
+  }
 }

 class platform::compute::config::runtime {
--- a/puppet-manifests/src/modules/platform/templates/kubelet-pmond-conf.erb
+++ b/puppet-manifests/src/modules/platform/templates/kubelet-pmond-conf.erb
@ -13,3 +13,4 @@ restarts = 3              ; restarts before error assertion
 startuptime = 5           ; seconds to wait after process start
 interval = 5              ; number of seconds to wait between restarts
 debounce = 20             ; number of seconds to wait before degrade clear
+subfunction = last-config ; run monitor only after last config is run
--- a/sysinv/sysinv/sysinv/sysinv/puppet/kubernetes.py
+++ b/sysinv/sysinv/sysinv/sysinv/puppet/kubernetes.py
@ -189,8 +189,7 @@ class KubernetesPuppet(base.BasePuppet):
        # TODO(jgauld): Commented out for now, using host_cpuset instead.
        # nonplatform_cpuset = host_cpuset - platform_cpuset

-        if constants.WORKER in utils.get_personalities(host) \
-                and constants.CONTROLLER not in utils.get_personalities(host):
+        if constants.WORKER in utils.get_personalities(host):
            if self.is_openstack_compute(host):
                k8s_cpuset = utils.format_range_set(platform_cpuset)
                k8s_nodeset = utils.format_range_set(platform_nodeset)
--- a/worker-utils/worker-utils/affine-platform.sh
+++ b/worker-utils/worker-utils/affine-platform.sh
@ -41,19 +41,6 @@ function affine_tasks {
    local PIDLIST
    local RET=0

-    # Affine non-kernel-thread tasks (excluded [kthreadd] and its children) to all available
-    # cores. They will be reaffined to platform cores later on as part of nova-compute
-    # launch.
-    ##log_debug "Affining all tasks to all available CPUs..."
-    # TODO: Should revisit this since this leaves a few lingering floating
-    # tasks and does not really work with cgroup cpusets.
-    # Comment out for now. Cleanup required.
-    ##affine_tasks_to_all_cores
-    ##RET=$?
-    ##if [ $RET -ne 0 ]; then
-    ##    log_error "Some tasks failed to be affined to all cores."
-    ##fi
-
    # Get number of logical cpus
    N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \
        awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}')
--- a/worker-utils/worker-utils/affine-tasks.service
+++ b/worker-utils/worker-utils/affine-tasks.service
@ -1,10 +1,10 @@
 [Unit]
 Description=StarlingX Affine Tasks
 After=syslog.service network.service dbus.service sw-patch.service affine-platform.sh.service
-Before=kubelet.service
+Before=workerconfig.service

 [Service]
-Type=oneshot
+Type=simple
 ExecStart=/etc/init.d/affine-tasks.sh start

 [Install]
--- a/worker-utils/worker-utils/affine-tasks.sh
+++ b/worker-utils/worker-utils/affine-tasks.sh
@ -1,62 +1,441 @@
 #!/bin/bash
-###############################################################################
+#
 # Copyright (c) 2019 Wind River Systems, Inc.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-###############################################################################
+
 #
+# chkconfig: 2345 80 80
+#
+
 ### BEGIN INIT INFO
 # Provides:          affine-tasks
 # Required-Start:
 # Required-Stop:
 # Default-Start:     2 3 4 5
 # Default-Stop:      0 1 6
-# Short-Description: affine tasks
-# Description:       This script will affine tasks to the platform cores of the
-#       host. This ensures that system processes are constrained to platform
-#       cores and will not run on cores with VMs/containers.
+# Short-Description: reaffine tasks on AIO
+# Description:       This script will dynamically reaffine tasks
+#   and k8s-infra cgroup cpuset on AIO nodes only. This accommodates
+#   CPU intensive phases of work. Tasks are initially allowed to float
+#   across all cores. Once system is at steady-state, this will ensure
+#   that K8S pods are constrained to platform cores and do not run on
+#   cores with VMs/containers.
 ### END INIT INFO
+#
+# Background:
+# There is significant parallel CPU intensive activity:
+# - during stx-application apply before critical openstack pods are running,
+#   e.g., to download docker images, and start all pods.
+# - during init and pod recovery after reboot or DOR.
+#
+# This enables use of all cpus during CPU intensive phase, otherwise the
+# startup processing time is considerably longer and we easily hit timeout.
+#
+# This script waits forever for sufficient platform readiness criteria
+# (e.g., system critical pods are recovered, nova-compute is running,
+# cinder-volume is running, openstack pods are running), and we have waited
+# a short stabilization period before reaffining to the platform cpus.
+#
+# NOTE: child cgroup cpuset and nodeset must be a subset of the parent
+# cgroup's attributes.  This requires traversing the tree hierarchy in
+# specific order when dynamically modifying these attributes.
+#
+################################################################################
+# Define minimal path
+PATH=/bin:/usr/bin:/usr/local/bin

+CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
+[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}

-. /usr/bin/tsconfig
-. /etc/init.d/task_affinity_functions.sh
+# Bring in platform definitions
+. /etc/platform/platform.conf

-log ()
-{
-    logger -p local1.info -t affine_tasks $@
-    echo affine_tasks: "$@"
+# Environment for kubectl
+export KUBECONFIG=/etc/kubernetes/admin.conf
+
+# Global parameters
+CGDIR_K8S=/sys/fs/cgroup/cpuset/k8s-infra
+INIT_INTERVAL_SECONDS=10
+CHECK_INTERVAL_SECONDS=30
+PRINT_INTERVAL_SECONDS=300
+STABILIZATION_SECONDS=150
+
+# Define pidfile
+LNAME=$(readlink -n -f $0)
+NAME=$(basename $LNAME)
+PIDFILE=/var/run/${NAME}.pid
+
+# Number of logical cpus reported online by getconf
+LOGICAL_CPUS=$(getconf _NPROCESSORS_ONLN)
+
+# Online memory nodes and cpus as kernel range lists, plus the online cpus
+# rendered as a lower-case cpumap
+ONLINE_NODES=$(/bin/cat /sys/devices/system/node/online)
+ONLINE_CPUS=$(/bin/cat /sys/devices/system/cpu/online)
+ONLINE_MASK=$(cpulist_to_cpumap ${ONLINE_CPUS} ${LOGICAL_CPUS} | \
+                awk '{print tolower($0)}')
+
+# Non-isolated cpus: identical to the online set when the kernel reports no
+# isolated cpus, otherwise derived from the isolated set via invert_cpumap
+ISOL_CPUS=$(/bin/cat /sys/devices/system/cpu/isolated)
+if [ -z "${ISOL_CPUS}" ]; then
+    ISOL_CPUMAP='0'
+    NONISOL_CPUS=${ONLINE_CPUS}
+    NONISOL_MASK=${ONLINE_MASK}
+else
+    ISOL_CPUMAP=$(cpulist_to_cpumap ${ISOL_CPUS} ${LOGICAL_CPUS})
+    NONISOL_CPUMAP=$(invert_cpumap ${ISOL_CPUMAP} ${LOGICAL_CPUS})
+    NONISOL_CPUS=$(cpumap_to_cpulist ${NONISOL_CPUMAP} ${LOGICAL_CPUS})
+    NONISOL_MASK=$(cpulist_to_cpumap ${NONISOL_CPUS} ${LOGICAL_CPUS} | \
+                    awk '{print tolower($0)}')
+fi
+
+# Platform memory nodeset (all online nodes) and platform cpuset
+PLATFORM_NODES=$(cat /sys/devices/system/node/online)
+PLATFORM_CPUS=$(platform_expanded_cpu_list)
+
+# Mutable state shared by the readiness checks: the reason of the last
+# failed check, and the SECONDS value at which the criteria first passed
+# (0 means "not currently passing")
+NOT_READY_REASON=""
+STABLE=0
+
+# Send an info-priority message to syslog facility "daemon", tagged with the
+# script name and the pid of this shell.
+# Arguments: the message words
+function LOG {
+    local tag="${NAME}($$): "
+
+    logger -p daemon.info -t "${tag}" "$@"
 }

-start ()
-{
-    log "Starting affine_tasks. Reaffining tasks to platform cores..."
-    if [ ! -f ${INITIAL_CONFIG_COMPLETE_FLAG} ]; then
-        log "Initial Configuration incomplete. Skipping affining tasks."
-        exit 0
+# Send an error-priority message to syslog facility "daemon", tagged with the
+# script name and the pid of this shell; logger -s also echoes it to stderr.
+# Arguments: the message words
+function ERROR {
+    local tag="${NAME}($$): "
+
+    logger -s -p daemon.error -t "${tag}" "$@"
+}
+
+# Update cgroup k8s-infra cpuset and nodeset to span all non-isolated cpus.
+# Globals: CGDIR_K8S, ONLINE_NODES, NONISOL_CPUS (read)
+function update_cgroup_cpuset_k8s_infra_all {
+    local d
+
+    # Set all cgroup cpuset and nodeset in tree hierarchy order (find lists
+    # a directory before its contents, so parents are widened first).
+    # This will always work, no matter the previous cpuset state.
+    # IFS= and -r stop read from trimming blanks or eating backslashes in
+    # directory names; the quotes keep such names as a single redirect
+    # target.
+    find "${CGDIR_K8S}" -type d | \
+    while IFS= read -r d; do
+        /bin/echo "${ONLINE_NODES}" > "${d}/cpuset.mems" 2>/dev/null
+        /bin/echo "${NONISOL_CPUS}" > "${d}/cpuset.cpus" 2>/dev/null
+    done
+    LOG "Update ${CGDIR_K8S}," \
+        "ONLINE_NODES=${ONLINE_NODES}, NONISOL_CPUS=${NONISOL_CPUS}"
+}
+
+# Update cgroup k8s-infra to span platform cpuset and nodeset.
+# Globals: CGDIR_K8S, PLATFORM_NODES, PLATFORM_CPUS (read)
+function update_cgroup_cpuset_k8s_infra_platform {
+    local d
+
+    # Clear any existing cpuset settings. This ensures that the
+    # subsequent shrink to platform cpuset will always work.
+    update_cgroup_cpuset_k8s_infra_all
+
+    # Set all cgroup cpuset and nodeset in depth-first order (-depth lists
+    # a directory's contents before the directory itself).
+    # NOTE: this only works if we are shrinking the cpuset.
+    # IFS= and -r stop read from trimming blanks or eating backslashes in
+    # directory names; the quotes keep such names as a single redirect
+    # target.
+    find "${CGDIR_K8S}" -depth -type d | \
+    while IFS= read -r d; do
+        /bin/echo "${PLATFORM_NODES}" > "${d}/cpuset.mems" 2>/dev/null
+        /bin/echo "${PLATFORM_CPUS}"  > "${d}/cpuset.cpus" 2>/dev/null
+    done
+    LOG "Update ${CGDIR_K8S}," \
+        "PLATFORM_NODES=${PLATFORM_NODES}, PLATFORM_CPUS=${PLATFORM_CPUS}"
+}
+
+# Check criteria for K8s platform ready on this node.
+# i.e., k8s-infra is configured, kubelet is running
+# Globals: CGDIR_K8S (read), NOT_READY_REASON (written)
+# Returns: 0 when ready, 1 otherwise with NOT_READY_REASON set
+function is_k8s_platform_ready {
+    local PASS=0
+    local FAIL=1
+    local pid
+    local up
+
+    # Global variable
+    NOT_READY_REASON=""
+
+    # Check that cgroup cpuset k8s-infra has been configured
+    if [ ! -e "${CGDIR_K8S}" ]; then
+        NOT_READY_REASON="k8s-infra not configured"
+        return ${FAIL}
    fi
-    # TODO: Should revisit this since this leaves a few lingering floating
-    # tasks and does not really work with cgroup cpusets.
-    # Comment out for now. Cleanup required.
-    ##affine_tasks_to_platform_cores
-    ##[[ $? -eq 0 ]] && log "Tasks re-affining done." || log "Tasks re-affining failed."
+
+    # Check that kubelet is running and stable
+    if ! systemctl is-active kubelet --quiet; then
+        NOT_READY_REASON="kubelet not running"
+        return ${FAIL}
+    fi
+    pid=$(systemctl show kubelet.service -p MainPID | \
+            awk -vFS='=' '{print $2}')
+    # A missing, non-numeric or zero MainPID is treated as "no kubelet
+    # process"; validating first keeps the numeric test from erroring out.
+    if [[ ! "${pid}" =~ ^[0-9]+$ ]] || [ "${pid}" -eq 0 ]; then
+        NOT_READY_REASON="kubelet not running"
+        return ${FAIL}
+    fi
+    # Seconds elapsed since that process started; require at least 30
+    up=$(ps -p "${pid}" -o etimes= 2>/dev/null | awk '{print $1}')
+    if [[ ! "${up}" =~ ^[0-9]+$ ]] || [ "${up}" -lt 30 ]; then
+        NOT_READY_REASON="kubelet not yet stable"
+        return ${FAIL}
+    fi
+
+    LOG "kubelet is ready"
+    return ${PASS}
 }

-stop ()
-{
-    log "Stopping affine_tasks..."
+# Determine whether this node has 'static' cpu manager policy.
+# NOTE: This check assumes that kubelet is already running locally.
+# Arguments: $1 - optional path of the kubelet cpu manager state file
+#                 (default: /var/lib/kubelet/cpu_manager_state)
+# Returns:   0 if the file content records policyName "static",
+#            1 otherwise (including when the file cannot be read)
+function is_static_cpu_manager_policy {
+    local state_file=${1:-/var/lib/kubelet/cpu_manager_state}
+    local pattern='"policyName":.?"static"'
+    local state
+
+    state=$(cat "${state_file}" 2>/dev/null)
+    # The regex lives in a variable so that =~ applies it as a pattern;
+    # the status of [[ ]] is the function's return value.
+    [[ ${state} =~ ${pattern} ]]
 }

-status()
-{
+# Count the pods of one namespace scheduled on one node whose listing line
+# shows neither Running nor Completed.
+# Arguments: $1 - namespace, $2 - node name
+# Outputs:   the count, on stdout
+# Returns:   non-zero when kubectl fails, so that "could not query" is not
+#            mistaken for "zero pods outstanding"
+function _count_unrecovered_pods {
+    local namespace=$1
+    local node=$2
+    local listing
+
+    listing=$(kubectl get pods --namespace "${namespace}" --no-headers \
+                --field-selector "spec.nodeName=${node}" 2>/dev/null) \
+        || return 1
+    # NF skips blank lines, so an empty listing counts as 0
+    printf '%s\n' "${listing}" | \
+        awk 'NF && !/Completed|Running/ { n += 1 } END { printf "%d\n", n }'
+}
+
+# Check criteria for K8s platform steady-state ready on this node.
+# i.e., kube-system pods have recovered, kube application apply
+# has completed, nova-compute is running, cinder-volume is running.
+# NOTE: This function depends on kubectl commands, so is only
+# usable on controllers.
+# Globals: NOT_READY_REASON (written), STABLE (read and written),
+#          STABILIZATION_SECONDS, HOSTNAME, SECONDS (read)
+# Returns: 0 once every criterion has held for STABILIZATION_SECONDS,
+#          1 otherwise with NOT_READY_REASON set. A kubectl failure is
+#          reported as not ready and restarts the stabilization period.
+function is_k8s_platform_steady_state_ready {
+    local PASS=0
+    local FAIL=1
+    local this_node=${HOSTNAME}
+    local npods
+    local node_info
+    local labels
+    local dt
+    local -a pods
+
+    # Global variable
+    NOT_READY_REASON=""
+
+    # Check that kube-system pods have recovered on this node
+    if ! npods=$(_count_unrecovered_pods kube-system "${this_node}"); then
+        NOT_READY_REASON="kubectl query for kube-system pods failed"
+        STABLE=0
+        return ${FAIL}
+    fi
+    if [ "${npods}" -gt 0 ]; then
+        NOT_READY_REASON="${npods} kube-system pods not recovered"
+        STABLE=0
+        return ${FAIL}
+    fi
+
+    # Wait for a few critical openstack pods to be running if this is
+    # an openstack-compute-node. This is not an exhaustive list.
+    # Make sure that all openstack pods on this node are running.
+    if ! node_info=$(kubectl get node "${this_node}" \
+                        --no-headers --show-labels 2>/dev/null); then
+        NOT_READY_REASON="kubectl query for node labels failed"
+        STABLE=0
+        return ${FAIL}
+    fi
+    # The labels are the last column of the node listing
+    labels=$(printf '%s\n' "${node_info}" | awk '{print $NF}')
+    if [[ ${labels} =~ openstack-compute-node=enabled ]]; then
+        # nova-compute is one of the last charts to recover after reboot
+        pods=( $(kubectl get pods --namespace openstack --no-headers \
+                --selector application=nova,component=compute \
+                --field-selector \
+                "spec.nodeName=${this_node},status.phase=Running" \
+                2>/dev/null) )
+        if [ ${#pods[@]} -eq 0 ]; then
+            NOT_READY_REASON="nova-compute pod not running"
+            STABLE=0
+            return ${FAIL}
+        fi
+
+        # cinder-volume is one of the last charts to recover after reboot
+        pods=( $(kubectl get pods --namespace openstack --no-headers \
+                --selector application=cinder,component=volume \
+                --field-selector \
+                "spec.nodeName=${this_node},status.phase=Running" \
+                2>/dev/null) )
+        if [ ${#pods[@]} -eq 0 ]; then
+            NOT_READY_REASON="cinder-volume pod not running"
+            STABLE=0
+            return ${FAIL}
+        fi
+
+        # Check that all openstack pods on this node have recovered
+        if ! npods=$(_count_unrecovered_pods openstack "${this_node}"); then
+            NOT_READY_REASON="kubectl query for openstack pods failed"
+            STABLE=0
+            return ${FAIL}
+        fi
+        if [ "${npods}" -gt 0 ]; then
+            NOT_READY_REASON="${npods} openstack pods not recovered"
+            STABLE=0
+            return ${FAIL}
+        fi
+    fi
+
+    # Evaluate elapsed time since check criteria pass
+    if [ "${STABLE}" -eq 0 ]; then
+        STABLE=${SECONDS}
+    fi
+    dt=$(( SECONDS - STABLE ))
+    if [ "${dt}" -lt "${STABILIZATION_SECONDS}" ]; then
+        NOT_READY_REASON="stabilization wait"
+        return ${FAIL}
+    fi
+
+    LOG "K8S is ready"
+    return ${PASS}
+}
+
+# Print the pids whose cpu affinity may be changed: every process except
+# those owning a thread named kthreadd or eal-intr-thread (and the children
+# of those processes), and except any process whose cgroup column mentions
+# k8s-infra or machine.slice.
+# Outputs: the selected pids on stdout, one per line
+function reaffineable_pids {
+    local skip_pids
+    local candidates
+
+    # Comma-joined pid list in the form expected by ps --ppid / -p
+    skip_pids=$(ps -eL -o pid=,comm= | \
+                awk '/eal-intr-thread|kthreadd/ {
+                         printf "%s%s", sep, $1
+                         sep = ","
+                     }')
+    candidates=$(ps --ppid ${skip_pids} -p ${skip_pids} --deselect \
+                    -o pid=,cgroup= | \
+                awk '!/k8s-infra|machine.slice/ {print $1; }')
+    echo "${candidates}"
+}
+
+# Let every reaffineable process (all of its threads) run on any
+# non-isolated cpu, and log how many processes were visited.
+# Globals: NONISOL_CPUS, NONISOL_MASK, ONLINE_CPUS, ONLINE_MASK,
+#          ISOL_CPUS (read)
+function affine_tasks_to_all_cores {
+    local pidlist
+    local count=0
+
+    LOG "Affine all tasks, CPUS: ${NONISOL_CPUS};" \
+        "online=${ONLINE_CPUS} (0x${ONLINE_MASK})," \
+        "isol=${ISOL_CPUS}, nonisol=${NONISOL_CPUS} (0x${NONISOL_MASK})"
+
+    pidlist=( $(reaffineable_pids) )
+    for pid in "${pidlist[@]}"; do
+        # Best effort: a pid may have exited since it was listed
+        taskset --all-tasks --pid --cpu-list \
+            ${NONISOL_CPUS} ${pid} > /dev/null 2>&1
+    done
+    # Every listed pid is counted, whether or not taskset succeeded
+    count=${#pidlist[@]}
+
+    LOG "Affined ${count} processes to all cores."
+}
+
+# Move floating tasks back onto the platform cpus.
+# Reaffineable processes are moved only when their current affinity mask
+# still equals NONISOL_MASK, so tasks with any other affinity are left
+# alone. Threads of eal-intr-thread processes are moved when their allowed
+# cpu list spans more than one cpu.
+# Globals: PLATFORM_CPUS, NONISOL_MASK (read)
+function affine_tasks_to_platform_cores {
+    local -a pidlist
+    local pid
+    local pid_mask
+    local count=0
+
+    LOG "Affine all tasks, PLATFORM_CPUS=${PLATFORM_CPUS}"
+
+    pidlist=( $(reaffineable_pids) )
+    for pid in "${pidlist[@]}"; do
+        # Field 6 of "pid N's current affinity mask: M" is the mask
+        pid_mask=$(taskset -p "${pid}" 2> /dev/null | awk '{print $6}')
+        if [ "${pid_mask}" == "${NONISOL_MASK}" ]; then
+            count=$(( count + 1 ))
+            taskset --all-tasks --pid --cpu-list \
+                "${PLATFORM_CPUS}" "${pid}" > /dev/null 2>&1
+        fi
+    done
+
+    # Reaffine vSwitch tasks that span multiple cpus to platform cpus
+    pidlist=( $(ps -eL -o pid=,comm= | awk '/eal-intr-thread/ {print $1}') )
+    for pid in "${pidlist[@]}"; do
+        count=$(( count + 1 ))
+        # -H keeps the file name in the output even when the glob matches a
+        # single status file, so the tid is always the 4th field after the
+        # slashes are turned into blanks. A ',' or '-' in the line marks an
+        # allowed list covering more than one cpu.
+        grep -H Cpus_allowed_list /proc/${pid}/task/*/status 2>/dev/null | \
+            sed 's#/# #g' | awk '/,|-/ {print $4}' | \
+            xargs --no-run-if-empty -I{} \
+            taskset --pid --cpu-list "${PLATFORM_CPUS}" {} > /dev/null 2>&1
+    done
+
+    LOG "Affined ${count} processes to platform cores."
+}
+
+# Poll a readiness check until it passes, logging the blocking reason at
+# most once per PRINT_INTERVAL_SECONDS.
+# Arguments: $1 - name of the readiness function
+#            $2 - seconds to sleep between polls
+# Globals:   t0, dt (written), NOT_READY_REASON (read)
+function _wait_until_ready {
+    local check=$1
+    local interval=$2
+
+    t0=${SECONDS}
+    while ! ${check}; do
+        dt=$(( ${SECONDS} - ${t0} ))
+        if [ ${dt} -ge ${PRINT_INTERVAL_SECONDS} ]; then
+            t0=${SECONDS}
+            LOG "Recovery wait, elapsed ${SECONDS} seconds." \
+                "Reason: ${NOT_READY_REASON}"
+        fi
+        sleep ${interval}
+    done
+}
+
+# Handle "start": on AIO hosts, float tasks and the k8s-infra cpuset over
+# all cores, wait for the platform to reach steady state, then move them
+# back to the platform cores. Exits 1 if another instance is running.
+function start {
+    # Ensure this only runs on AIO
+    if [[ "$nodetype" != "controller" ]] || [[ $subfunction != *worker* ]]
+    then
+        LOG "Not AIO, nothing to do."
+        return
+    fi
+
+    # Abort if another instantiation is already running; a pidfile whose
+    # process is gone is stale and gets removed
+    if [ -e ${PIDFILE} ]; then
+        PID=$(cat ${PIDFILE})
+        if ! [ -n "${PID}" -a -e /proc/${PID} ]; then
+            OUT=$(rm -v -f ${PIDFILE})
+            LOG "${OUT}"
+        else
+            ERROR "Aborting, ${PID} already running: ${PIDFILE}."
+            exit 1
+        fi
+    fi
+
+    LOG "Starting."
+
+    # Create pidfile to indicate the script is running
+    echo $$ > ${PIDFILE}
+
+    # Affine all tasks to float on all cores
+    affine_tasks_to_all_cores
+
+    # Wait for kubelet to be running
+    _wait_until_ready is_k8s_platform_ready ${INIT_INTERVAL_SECONDS}
+
+    # Update K8S cpuset so that pods float on all cpus
+    # NOTE: dynamic cpuset changes incompatible with static policy
+    if ! is_static_cpu_manager_policy; then
+        update_cgroup_cpuset_k8s_infra_all
+    fi
+
+    # Wait until K8s pods have recovered and nova-compute is running
+    _wait_until_ready is_k8s_platform_steady_state_ready \
+        ${CHECK_INTERVAL_SECONDS}
+
+    # Update K8S cpuset to platform cores
+    if ! is_static_cpu_manager_policy; then
+        update_cgroup_cpuset_k8s_infra_platform
+    fi
+
+    # Affine all floating tasks back to platform cores
+    affine_tasks_to_platform_cores
+
+    # Remove pidfile after successful completion
+    rm -f ${PIDFILE}
+
+    LOG "Complete."
+}
+
+# Handle "stop": kill the instance recorded in PIDFILE, if it is still
+# alive, and remove the pidfile.
+function stop {
+    LOG "Stopping."
+
+    # No pidfile means there is nothing to stop
+    [ -e ${PIDFILE} ] || return 0
+
+    # Forcibly stop any running instantiation
+    PID=$(cat ${PIDFILE})
+    if [ -n "${PID}" -a -e /proc/${PID} ]; then
+        LOG "Stopping ${PID}: ${PIDFILE}."
+        kill -s KILL ${PID}
+        # tail --pid returns once the process is gone; cap the wait at 20s
+        timeout 20 tail --pid=${PID} -f /dev/null
+    fi
+    OUT=$(rm -v -f ${PIDFILE})
+    LOG "${OUT}"
+}
+
+# No-op: the body is only ':' and always succeeds.
+function status {
    :
 }

-reset()
-{
+# No-op: the body is only ':' and always succeeds.
+function reset {
    :
 }

+# Refuse to go any further unless running as root (uid 0)
+if (( UID != 0 )); then
+    ERROR "Need sudo/root permission."
+    exit 1
+fi
+
 case "$1" in
    start)
        start
--- a/worker-utils/worker-utils/task_affinity_functions.sh
+++ b/worker-utils/worker-utils/task_affinity_functions.sh
@ -29,19 +29,22 @@ LOG_DEBUG=1
 TAG="TASKAFFINITY:"

 TASK_AFFINING_INCOMPLETE="/etc/platform/.task_affining_incomplete"
-N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \
-            awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}')
+N_CPUS=$(getconf _NPROCESSORS_ONLN)
 FULLSET_CPUS="0-"$((N_CPUS-1))
 FULLSET_MASK=$(cpulist_to_cpumap ${FULLSET_CPUS} ${N_CPUS})
-PLATFORM_CPUS=$(get_platform_cpu_list)
-PLATFORM_CPULIST=$(get_platform_cpu_list| \
+PLATFORM_CPUS=$(platform_expanded_cpu_list)
+PLATFORM_CPULIST=$(platform_expanded_cpu_list| \
                    perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
                    sed 's/,/ /g')
 VSWITCH_CPULIST=$(get_vswitch_cpu_list| \
                    perl -pe 's/(\d+)-(\d+)/join(",",$1..$2)/eg'| \
                    sed 's/,/ /g')
+if [[ $vswitch_type =~ none ]]; then
+    VSWITCH_CPULIST=""
+fi
+
 IDLE_MARK=95.0
-KERNEL=`uname -a`
+KERNEL=$(uname -a)

 ################################################################################
 # Check if a given core is one of the platform cores
@ -69,98 +72,19 @@ function is_vswitch_core {
    return 0
 }

-################################################################################
-# An audit and corrective action following a swact
-################################################################################
-function audit_and_reaffine {
-    local mask=$1
-    local cmd_str=""
-    local tasklist
-
-    cmd_str="ps-sched.sh|awk '(\$9==\"$mask\") {print \$2}'"
-
-    tasklist=($(eval $cmd_str))
-    # log_debug "cmd str = $cmd_str"
-    log_debug "${TAG} There are ${#tasklist[@]} tasks to reaffine."
-
-    for task in ${tasklist[@]}; do
-        taskset -acp ${PLATFORM_CPUS} $task &> /dev/null
-        rc=$?
-        [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
-    done
-    tasklist=($(eval $cmd_str))
-    [[ ${#tasklist[@]} -eq 0 ]] && return 0 || return 1
-}
-
-################################################################################
-# The following function is used to verify that any sleeping management tasks
-# that are on non-platform cores can be migrated to platform cores as soon as
-# they are scheduled. It can be invoked either manually or from goenableCompute
-# script as a scheduled job (with a few minute delay) if desired.
-# The induced tasks migration should be done after all VMs have been restored
-# following a host reboot in AIO, hence the delay.
-################################################################################
-function move_inactive_threads_to_platform_cores {
-    local tasklist
-    local cmd_str=""
-
-    # Compile a list of non-kernel & non-vswitch/VM related threads that are not
-    # on platform cores.
-    # e.g. if the platform cpulist value is "0 8", the resulting command to be
-    # evaluated should look like this:
-    # ps-sched.sh|grep -v vswitch|awk '($10!=0 && $10!=8 && $3!=2) {if(NR>1)print $2}'
-    cmd_str="ps-sched.sh|grep -v vswitch|awk '("
-    for cpu_num in ${PLATFORM_CPULIST}; do
-        cmd_str=$cmd_str"\$10!="${cpu_num}" && "
-    done
-    cmd_str=$cmd_str"\$3!=2) {if(NR>1)print \$2}'"
-    echo "selection string = $cmd_str"
-    tasklist=($(eval $cmd_str))
-    log_debug "${TAG} There are ${#tasklist[@]} number of tasks to be moved."
-
-    # These sleep tasks are stuck on the wrong core(s). They need to be woken up
-    # so they can be migrated to the right ones. Attaching and detaching strace
-    # momentarily to the task does the trick.
-    for task in ${tasklist[@]}; do
-        strace -p $task 2>/dev/null &
-        pid=$!
-        sleep 0.1
-        kill -SIGINT $pid
-    done
-    tasklist=($(eval $cmd_str))
-    [[ ${#tasklist[@]} -eq 0 ]] && return 0 || return 1
-}
-
-################################################################################
-# The following function is called by affine-platform.sh to affine tasks to
-# all available cores during initial startup and subsequent host reboots.
-################################################################################
-function affine_tasks_to_all_cores {
+# Return list of reaffineable pids. This includes all processes, but excludes
+# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
+function reaffineable_pids {
+    local pids_excl
    local pidlist
-    local rc=0

-    if [[ "${KERNEL}" == *" RT "* ]]; then
-        return 0
-    fi
-
-    log_debug "${TAG} Affining all tasks to CPU (${FULLSET_CPUS})"
-
-    pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }')
-    for pid in ${pidlist[@]}; do
-        ppid=$(ps -o ppid= -p $pid |tr -d '[:space:]')
-        if [ -z $ppid ] || [ $ppid -eq 2 ]; then
-            continue
-        fi
-        log_debug "Affining pid $pid, parent pid = $ppid"
-        taskset --all-tasks --pid --cpu-list ${FULLSET_CPUS} $pid &> /dev/null
-        rc=$?
-        [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
-    done
-    # Write the cpu list to a temp file which will be read and removed when
-    # the tasks are reaffined back to platform cores later on.
-    echo ${FULLSET_CPUS} > ${TASK_AFFINING_INCOMPLETE}
-
-    return $rc
+    pids_excl=$(ps -eL -o pid=,comm= | \
+                awk -vORS=',' '/eal-intr-thread|kthreadd/ {print $1}' | \
+                sed 's/,$/\n/')
+    pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
+                -o pid=,cgroup= | \
+                awk '!/k8s-infra|machine.slice/ {print $1; }')
+    echo "${pidlist[@]}"
 }

 ################################################################################
@ -211,32 +135,22 @@ function affine_tasks_to_idle_cores {
            # Platform core is added to the idle list by default
            idle_cpulist=$idle_cpulist$cpu","
        else
-      # Non platform core is added to the idle list if it is more than 95% idle
-            [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]] && idle_cpulist=$idle_cpulist$cpu","
+            # Non platform core is added to the idle list if it is more
+            # than 95% idle
+            if [[ $(echo "$idle_value > ${IDLE_MARK}"|bc) -eq 1 ]]; then
+                idle_cpulist=$idle_cpulist$cpu","
+            fi
        fi
        cpu=$(($cpu+1))
    done

    idle_cpulist=$(echo $idle_cpulist|sed 's/.$//')
-    platform_affinity_mask=$(cpulist_to_cpumap ${PLATFORM_CPUS} ${N_CPUS} \
-                            |awk '{print tolower($0)}')

    log_debug "${TAG} Affining all tasks to idle CPU ($idle_cpulist)"
-
-    vswitch_pid=$(pgrep vswitch)
-    pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }')
+    pidlist=( $(reaffineable_pids) )
    for pid in ${pidlist[@]}; do
-        ppid=$(ps -o ppid= -p $pid |tr -d '[:space:]')
-        if [ -z $ppid ] || [ $ppid -eq 2 ] || [ "$pid" = "$vswitch_pid" ]; then
-            continue
-        fi
-        pid_affinity_mask=$(taskset -p $pid | awk '{print $6}')
-        if [ "${pid_affinity_mask}" == "${platform_affinity_mask}" ]; then
-            # log_debug "Affining pid $pid to idle cores..."
-            taskset --all-tasks --pid --cpu-list $idle_cpulist $pid &> /dev/null
-            rc=$?
-            [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
-        fi
+        taskset --all-tasks --pid --cpu-list \
+            ${idle_cpulist} ${pid} > /dev/null 2>&1
    done

    # Save the cpu list to the temp file which will be read and removed when
@ -246,10 +160,7 @@ function affine_tasks_to_idle_cores {
 }

 ################################################################################
-# The following function is called by either:
-# a) nova-compute wrapper script during AIO system initial bringup or reboot
-# or
-# b) sm at the end of swact sequence
+# The following function is called by sm at the end of swact sequence
 # to re-affine management tasks back to the platform cores.
 ################################################################################
 function affine_tasks_to_platform_cores {
@ -259,42 +170,32 @@ function affine_tasks_to_platform_cores {
    local count=0

    if [ ! -f ${TASK_AFFINING_INCOMPLETE} ]; then
-        dbg_str="${TAG} Either tasks have never been affined to all/idle cores or"
-        dbg_str=$dbg_str" they have already been reaffined to platform cores."
+        dbg_str="${TAG} Either tasks have never been affined to all/idle"
+        dbg_str="${dbg_str} cores or they have already been reaffined to"
+        dbg_str="${dbg_str} platform cores."
        log_debug "$dbg_str"
        return 0
    fi

    read cpulist < ${TASK_AFFINING_INCOMPLETE}
-    affinity_mask=$(cpulist_to_cpumap $cpulist ${N_CPUS}|awk '{print tolower($0)}')

    log_debug "${TAG} Reaffining tasks to platform cores (${PLATFORM_CPUS})..."
-    pidlist=$(ps --ppid 2 -p 2 --deselect -o pid= | awk '{ print $1; }')
+    pidlist=( $(reaffineable_pids) )
    for pid in ${pidlist[@]}; do
-        # log_debug "Processing pid $pid..."
-        pid_affinity_mask=$(taskset -p $pid | awk '{print $6}')
-        # Only management tasks need to be reaffined. Kernel, vswitch and VM related
-        # tasks were not affined previously so they should have different affinity
-        # mask(s).
-        if [ "${pid_affinity_mask}" == "${affinity_mask}" ]; then
-            count=$(($count+1))
-            # log_debug "Affining pid $pid to platform cores..."
-            taskset --all-tasks --pid --cpu-list ${PLATFORM_CPUS} $pid &> /dev/null
-            rc=$?
-            [[ $rc -ne 0 ]] && log_error "Failed to set CPU affinity for pid $pid, rc=$rc"
-        fi
+        taskset --all-tasks --pid --cpu-list \
+            ${PLATFORM_CPUS} ${pid} > /dev/null 2>&1
    done

-    # A workaround for lack of "end of swact" state
-    fullmask=$(echo ${FULLSET_MASK} | awk '{print tolower($0)}')
-    if [ "${affinity_mask}" != "${fullmask}" ]; then
-        log_debug "${TAG} Schedule an audit and cleanup"
-        (sleep 60; audit_and_reaffine "0x"$affinity_mask) &
-    fi
+    # Reaffine vSwitch tasks that span multiple cpus to platform cpus
+    pidlist=$(ps -eL -o pid=,comm= | awk '/eal-intr-thread/ {print $1}')
+    for pid in ${pidlist[@]}; do
+        grep Cpus_allowed_list /proc/${pid}/task/*/status 2>/dev/null | \
+            sed 's#/# #g' | awk '/,|-/ {print $4}' | \
+            xargs --no-run-if-empty -i{} \
+            taskset --pid --cpu-list ${PLATFORM_CPUS} {} > /dev/null 2>&1
+    done

    rm -rf ${TASK_AFFINING_INCOMPLETE}
-    log_debug "${TAG} $count tasks were reaffined to platform cores."
-
    return $rc
 }