Correct K8S and docker affinity on AIO

This updates K8S cpuset affinity on AIO when kubernetes cpu
manager policy is configured to 'none' (i.e., the default setting)
and openstack-compute-node is not configured.

This update makes K8s pods float on all cores instead of being
reaffined to platform cores. This is done since kubernetes cannot
isolate kube-system platform versus application pods.

This update affines docker uwsgi tasks to platform cores.

Change-Id: Iee40f747025c9777f80a94fe96b7c90e91d017e6
Closes-bug: 1851569
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
This commit is contained in:
Jim Gauld 2019-11-06 18:33:14 -05:00 committed by Al Bailey
parent 1f024e87ac
commit 01c7f51607

View File

@ -57,6 +57,7 @@ export KUBECONFIG=/etc/kubernetes/admin.conf
# Global parameters
CGDIR_K8S=/sys/fs/cgroup/cpuset/k8s-infra
CGDIR_DOCKER=/sys/fs/cgroup/cpuset/docker
INIT_INTERVAL_SECONDS=10
CHECK_INTERVAL_SECONDS=30
PRINT_INTERVAL_SECONDS=300
@ -97,6 +98,10 @@ PLATFORM_CPUS=$(platform_expanded_cpu_list)
NOT_READY_REASON=""
STABLE=0
# Set LOG_DEBUG to non-empty string to enable debug logs
LOG_DEBUG=""
# Log info message to /var/log/daemon.log
function LOG {
logger -p daemon.info -t "${NAME}($$): " "$@"
@ -107,33 +112,63 @@ function ERROR {
logger -s -p daemon.error -t "${NAME}($$): " "$@"
}
# Update cgroup k8s-infra cpuset and nodeset to span all non-isolated cpus.
function update_cgroup_cpuset_k8s_infra_all {
# Log debug message to /var/log/daemon.log if debug enabled via LOG_DEBUG
function DEBUG {
if [ ! -z "${LOG_DEBUG}" ]; then
logger -p daemon.debug -t "${NAME}($$): " "$@"
fi
}
# Update cgroup cpuset and nodeset to span all non-isolated cpus.
function update_cgroup_cpuset_all {
local CGDIR=$1
if [ ! -d "${CGDIR}" ]; then
ERROR "update_cgroup_cpuset_all: ${CGDIR} does not exist"
return
fi
# Set all cgroup cpuset and nodeset in tree hierarchy order.
# This will always work, no matter the previous cpuset state.
find ${CGDIR_K8S} -type d | \
find ${CGDIR} -type d | \
while read d; do
/bin/echo ${ONLINE_NODES} > ${d}/cpuset.mems 2>/dev/null
/bin/echo ${NONISOL_CPUS} > ${d}/cpuset.cpus 2>/dev/null
/bin/echo ${ONLINE_CPUS} > ${d}/cpuset.cpus 2>/dev/null
done
LOG "Update ${CGDIR_K8S}," \
# Set all cgroup cpuset in depth-first order.
# NOTE: this only works if we are shrinking the cpuset.
find ${CGDIR} -depth -type d | \
while read d; do
/bin/echo ${NONISOL_CPUS} > ${d}/cpuset.cpus 2>/dev/null
C=$(cat ${d}/cpuset.cpus 2>/dev/null)
DEBUG "update all: ${d}, cpuset.cpus=${C}"
done
LOG "Update ${CGDIR}," \
"ONLINE_NODES=${ONLINE_NODES}, NONISOL_CPUS=${NONISOL_CPUS}"
}
# Update cgroup k8s-infra to span platform cpuset and nodeset.
function update_cgroup_cpuset_k8s_infra_platform {
# Update cgroup cpuset to span platform cpuset and nodeset.
function update_cgroup_cpuset_platform {
local CGDIR=$1
if [ ! -d "${CGDIR}" ]; then
ERROR "update_cgroup_cpuset_platform: ${CGDIR} does not exist"
return
fi
# Clear any existing cpuset settings. This ensures that the
# subsequent shrink to platform cpuset will always work.
update_cgroup_cpuset_k8s_infra_all
update_cgroup_cpuset_all ${CGDIR}
# Set all cgroup cpuset and nodeset in depth-first order.
# NOTE: this only works if we are shrinking the cpuset.
find ${CGDIR_K8S} -depth -type d | \
find ${CGDIR} -depth -type d | \
while read d; do
/bin/echo ${PLATFORM_NODES} > ${d}/cpuset.mems 2>/dev/null
/bin/echo ${PLATFORM_CPUS} > ${d}/cpuset.cpus 2>/dev/null
C=$(cat ${d}/cpuset.cpus 2>/dev/null)
DEBUG "update platform: ${d}, cpuset.cpus=${C}"
done
LOG "Update ${CGDIR_K8S}," \
LOG "Update ${CGDIR}," \
"PLATFORM_NODES=${PLATFORM_NODES}, PLATFORM_CPUS=${PLATFORM_CPUS}"
}
@ -175,6 +210,26 @@ function is_k8s_platform_ready {
return ${PASS}
}
# Check criteria for docker platform ready on this node.
# i.e., docker is configured
function is_docker_platform_ready {
local PASS=0
local FAIL=1
# Global variable
NOT_READY_REASON=""
# Check that cgroup cpuset docker has been configured
if [ ! -e ${CGDIR_DOCKER} ]; then
NOT_READY_REASON="docker not configured"
return ${FAIL}
fi
LOG "docker is ready"
return ${PASS}
}
# Determine whether this node has 'static' cpu manager policy.
# NOTE: This check assumes that kubelet is already running locally.
function is_static_cpu_manager_policy {
@ -278,6 +333,22 @@ END { printf "%d\n", n; }
return ${PASS}
}
# Check whether this node is configured as openstack-compute-node.
function is_openstack_compute {
local PASS=0
local FAIL=1
# NOTE: hostname changes during first configuration
local this_node=$(cat /proc/sys/kernel/hostname)
labels=$(kubectl get node ${this_node} \
--no-headers --show-labels 2>/dev/null | awk '{print $NF}')
if [[ $labels =~ openstack-compute-node=enabled ]]; then
return ${PASS}
else
return ${FAIL}
fi
}
# Get number of DRBD resources started.
# Returns 0 if DRBD not ready.
function number_drbd_resources_started {
@ -333,7 +404,7 @@ function affine_drbd_tasks {
}
# Return list of reaffineable pids. This includes all processes, but excludes
# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm.
function reaffineable_pids {
local pids_excl
local pidlist
@ -343,7 +414,7 @@ function reaffineable_pids {
sed 's/,$/\n/')
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
-o pid=,cgroup= | \
awk '!/k8s-infra|machine.slice/ {print $1; }')
awk '!/k8s-infra|docker|machine.slice/ {print $1; }')
echo "${pidlist[@]}"
}
@ -440,7 +511,7 @@ function start {
# Update K8S cpuset so that pods float on all cpus
# NOTE: dynamic cpuset changes incompatible with static policy
if ! is_static_cpu_manager_policy; then
update_cgroup_cpuset_k8s_infra_all
update_cgroup_cpuset_all ${CGDIR_K8S}
fi
# Wait for all DRBD resources to have started. Affine DRBD tasks
@ -460,6 +531,12 @@ function start {
done
affine_drbd_tasks ${NONISOL_CPUS}
# Update docker cpuset so it floats on non-isolated cpus.
# The docker cgroup is not always created, so don't wait for it.
if is_docker_platform_ready -eq 0 ; then
update_cgroup_cpuset_all ${CGDIR_DOCKER}
fi
# Wait until core K8s pods have recovered and nova-compute is running
t0=${SECONDS}
until is_k8s_platform_steady_state_ready; do
@ -472,9 +549,19 @@ function start {
sleep ${CHECK_INTERVAL_SECONDS}
done
# Update docker cpuset to platform cores
# The docker cgroup is not always created, so don't wait for it.
if is_docker_platform_ready -eq 0 ; then
update_cgroup_cpuset_platform ${CGDIR_DOCKER}
else
LOG "Warning: ${CGDIR_DOCKER} not ready."
fi
# Update K8S cpuset to platform cores
if ! is_static_cpu_manager_policy; then
update_cgroup_cpuset_k8s_infra_platform
if is_openstack_compute; then
update_cgroup_cpuset_platform ${CGDIR_K8S}
fi
fi
# Affine all floating tasks back to platform cores