Correct K8S and docker affinity on AIO
This updates K8S cpuset affinity on AIO when kubernetes cpu manager policy is configured to 'none' (i.e., the default setting) and openstack-compute-node is not configured. This update makes K8s pods float on all cores instead of being reaffined to platform cores. This is done since kubernetes cannot isolate kube-system platform versus application pods. This update affines docker uwsgi tasks to platform cores.

Change-Id: Iee40f747025c9777f80a94fe96b7c90e91d017e6
Closes-bug: 1851569
Signed-off-by: Jim Gauld <james.gauld@windriver.com>
This commit is contained in:
parent
1f024e87ac
commit
01c7f51607
@ -57,6 +57,7 @@ export KUBECONFIG=/etc/kubernetes/admin.conf
|
|||||||
|
|
||||||
# Global parameters
|
# Global parameters
|
||||||
CGDIR_K8S=/sys/fs/cgroup/cpuset/k8s-infra
|
CGDIR_K8S=/sys/fs/cgroup/cpuset/k8s-infra
|
||||||
|
CGDIR_DOCKER=/sys/fs/cgroup/cpuset/docker
|
||||||
INIT_INTERVAL_SECONDS=10
|
INIT_INTERVAL_SECONDS=10
|
||||||
CHECK_INTERVAL_SECONDS=30
|
CHECK_INTERVAL_SECONDS=30
|
||||||
PRINT_INTERVAL_SECONDS=300
|
PRINT_INTERVAL_SECONDS=300
|
||||||
@ -97,6 +98,10 @@ PLATFORM_CPUS=$(platform_expanded_cpu_list)
|
|||||||
NOT_READY_REASON=""
|
NOT_READY_REASON=""
|
||||||
STABLE=0
|
STABLE=0
|
||||||
|
|
||||||
|
# Set LOG_DEBUG to non-empty string to enable debug logs
|
||||||
|
LOG_DEBUG=""
|
||||||
|
|
||||||
|
|
||||||
# Log info message to /var/log/daemon.log
|
# Log info message to /var/log/daemon.log
|
||||||
function LOG {
|
function LOG {
|
||||||
logger -p daemon.info -t "${NAME}($$): " "$@"
|
logger -p daemon.info -t "${NAME}($$): " "$@"
|
||||||
@ -107,33 +112,63 @@ function ERROR {
|
|||||||
logger -s -p daemon.error -t "${NAME}($$): " "$@"
|
logger -s -p daemon.error -t "${NAME}($$): " "$@"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Update cgroup k8s-infra cpuset and nodeset to span all non-isolated cpus.
|
# Log debug message to /var/log/daemon.log if debug enabled via LOG_DEBUG
|
||||||
function update_cgroup_cpuset_k8s_infra_all {
|
function DEBUG {
    # Emit a daemon.debug syslog entry tagged "${NAME}($$): ", but only
    # when the global LOG_DEBUG is a non-empty string; otherwise do nothing.
    # Arguments: "$@" - message words, passed through to logger unchanged.
    if [[ -n "${LOG_DEBUG}" ]]; then
        logger -p daemon.debug -t "${NAME}($$): " "$@"
    fi
}
|
||||||
|
|
||||||
|
# Update cgroup cpuset and nodeset to span all non-isolated cpus.
|
||||||
|
function update_cgroup_cpuset_all {
|
||||||
|
local CGDIR=$1
|
||||||
|
if [ ! -d "${CGDIR}" ]; then
|
||||||
|
ERROR "update_cgroup_cpuset_all: ${CGDIR} does not exist"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
# Set all cgroup cpuset and nodeset in tree hierarchy order.
|
# Set all cgroup cpuset and nodeset in tree hierarchy order.
|
||||||
# This will always work, no matter the previous cpuset state.
|
# This will always work, no matter the previous cpuset state.
|
||||||
find ${CGDIR_K8S} -type d | \
|
find ${CGDIR} -type d | \
|
||||||
while read d; do
|
while read d; do
|
||||||
/bin/echo ${ONLINE_NODES} > ${d}/cpuset.mems 2>/dev/null
|
/bin/echo ${ONLINE_NODES} > ${d}/cpuset.mems 2>/dev/null
|
||||||
/bin/echo ${NONISOL_CPUS} > ${d}/cpuset.cpus 2>/dev/null
|
/bin/echo ${ONLINE_CPUS} > ${d}/cpuset.cpus 2>/dev/null
|
||||||
done
|
done
|
||||||
LOG "Update ${CGDIR_K8S}," \
|
|
||||||
|
# Set all cgroup cpuset in depth-first order.
|
||||||
|
# NOTE: this only works if we are shrinking the cpuset.
|
||||||
|
find ${CGDIR} -depth -type d | \
|
||||||
|
while read d; do
|
||||||
|
/bin/echo ${NONISOL_CPUS} > ${d}/cpuset.cpus 2>/dev/null
|
||||||
|
C=$(cat ${d}/cpuset.cpus 2>/dev/null)
|
||||||
|
DEBUG "update all: ${d}, cpuset.cpus=${C}"
|
||||||
|
done
|
||||||
|
LOG "Update ${CGDIR}," \
|
||||||
"ONLINE_NODES=${ONLINE_NODES}, NONISOL_CPUS=${NONISOL_CPUS}"
|
"ONLINE_NODES=${ONLINE_NODES}, NONISOL_CPUS=${NONISOL_CPUS}"
|
||||||
}
|
}
|
||||||
|
|
||||||
# Update cgroup k8s-infra to span platform cpuset and nodeset.
|
# Update cgroup cpuset to span platform cpuset and nodeset.
|
||||||
function update_cgroup_cpuset_k8s_infra_platform {
|
function update_cgroup_cpuset_platform {
|
||||||
|
local CGDIR=$1
|
||||||
|
if [ ! -d "${CGDIR}" ]; then
|
||||||
|
ERROR "update_cgroup_cpuset_platform: ${CGDIR} does not exist"
|
||||||
|
return
|
||||||
|
fi
|
||||||
|
|
||||||
# Clear any existing cpuset settings. This ensures that the
|
# Clear any existing cpuset settings. This ensures that the
|
||||||
# subsequent shrink to platform cpuset will always work.
|
# subsequent shrink to platform cpuset will always work.
|
||||||
update_cgroup_cpuset_k8s_infra_all
|
update_cgroup_cpuset_all ${CGDIR}
|
||||||
|
|
||||||
# Set all cgroup cpuset and nodeset in depth-first order.
|
# Set all cgroup cpuset and nodeset in depth-first order.
|
||||||
# NOTE: this only works if we are shrinking the cpuset.
|
# NOTE: this only works if we are shrinking the cpuset.
|
||||||
find ${CGDIR_K8S} -depth -type d | \
|
find ${CGDIR} -depth -type d | \
|
||||||
while read d; do
|
while read d; do
|
||||||
/bin/echo ${PLATFORM_NODES} > ${d}/cpuset.mems 2>/dev/null
|
/bin/echo ${PLATFORM_NODES} > ${d}/cpuset.mems 2>/dev/null
|
||||||
/bin/echo ${PLATFORM_CPUS} > ${d}/cpuset.cpus 2>/dev/null
|
/bin/echo ${PLATFORM_CPUS} > ${d}/cpuset.cpus 2>/dev/null
|
||||||
|
C=$(cat ${d}/cpuset.cpus 2>/dev/null)
|
||||||
|
DEBUG "update platform: ${d}, cpuset.cpus=${C}"
|
||||||
done
|
done
|
||||||
LOG "Update ${CGDIR_K8S}," \
|
LOG "Update ${CGDIR}," \
|
||||||
"PLATFORM_NODES=${PLATFORM_NODES}, PLATFORM_CPUS=${PLATFORM_CPUS}"
|
"PLATFORM_NODES=${PLATFORM_NODES}, PLATFORM_CPUS=${PLATFORM_CPUS}"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -175,6 +210,26 @@ function is_k8s_platform_ready {
|
|||||||
return ${PASS}
|
return ${PASS}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Check criteria for docker platform ready on this node,
# i.e., the docker cgroup cpuset path exists.
# Globals:   CGDIR_DOCKER (read), NOT_READY_REASON (written)
# Outputs:   logs "docker is ready" via LOG when the check passes
# Returns:   0 if ${CGDIR_DOCKER} exists, 1 otherwise
function is_docker_platform_ready {
    local PASS=0
    local FAIL=1

    # Global variable
    NOT_READY_REASON=""

    # Check that cgroup cpuset docker has been configured.
    # The path must be quoted: with an empty or unset CGDIR_DOCKER an
    # unquoted '[ ! -e ]' evaluates to false and would report docker ready.
    if [[ ! -e "${CGDIR_DOCKER}" ]]; then
        NOT_READY_REASON="docker not configured"
        return ${FAIL}
    fi

    LOG "docker is ready"
    return ${PASS}
}
|
||||||
|
|
||||||
# Determine whether this node has 'static' cpu manager policy.
|
# Determine whether this node has 'static' cpu manager policy.
|
||||||
# NOTE: This check assumes that kubelet is already running locally.
|
# NOTE: This check assumes that kubelet is already running locally.
|
||||||
function is_static_cpu_manager_policy {
|
function is_static_cpu_manager_policy {
|
||||||
@ -278,6 +333,22 @@ END { printf "%d\n", n; }
|
|||||||
return ${PASS}
|
return ${PASS}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Check whether this node is configured as openstack-compute-node.
# Looks up this node's labels with kubectl, using the kernel hostname as
# the node name, and tests for the label openstack-compute-node=enabled.
# Returns: 0 if the label is present, 1 otherwise. kubectl errors are
#          suppressed, so a failed lookup yields empty labels and returns 1.
function is_openstack_compute {
    local PASS=0
    local FAIL=1
    # Declared separately from assignment so 'local' does not mask the
    # command status, and so 'labels' no longer leaks into global scope.
    local this_node
    local labels

    # NOTE: hostname changes during first configuration
    this_node=$(cat /proc/sys/kernel/hostname)

    # The labels are the last whitespace-separated column of the output.
    labels=$(kubectl get node "${this_node}" \
        --no-headers --show-labels 2>/dev/null | awk '{print $NF}')
    if [[ ${labels} =~ openstack-compute-node=enabled ]]; then
        return ${PASS}
    fi
    return ${FAIL}
}
|
||||||
|
|
||||||
# Get number of DRBD resources started.
|
# Get number of DRBD resources started.
|
||||||
# Returns 0 if DRBD not ready.
|
# Returns 0 if DRBD not ready.
|
||||||
function number_drbd_resources_started {
|
function number_drbd_resources_started {
|
||||||
@ -333,7 +404,7 @@ function affine_drbd_tasks {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Return list of reaffineable pids. This includes all processes, but excludes
|
# Return list of reaffineable pids. This includes all processes, but excludes
|
||||||
# kernel threads, vSwitch, and anything in K8S or qemu/kvm.
|
# kernel threads, vSwitch, and anything in K8S, docker or qemu/kvm.
|
||||||
function reaffineable_pids {
|
function reaffineable_pids {
|
||||||
local pids_excl
|
local pids_excl
|
||||||
local pidlist
|
local pidlist
|
||||||
@ -343,7 +414,7 @@ function reaffineable_pids {
|
|||||||
sed 's/,$/\n/')
|
sed 's/,$/\n/')
|
||||||
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
|
pidlist=$(ps --ppid ${pids_excl} -p ${pids_excl} --deselect \
|
||||||
-o pid=,cgroup= | \
|
-o pid=,cgroup= | \
|
||||||
awk '!/k8s-infra|machine.slice/ {print $1; }')
|
awk '!/k8s-infra|docker|machine.slice/ {print $1; }')
|
||||||
echo "${pidlist[@]}"
|
echo "${pidlist[@]}"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -440,7 +511,7 @@ function start {
|
|||||||
# Update K8S cpuset so that pods float on all cpus
|
# Update K8S cpuset so that pods float on all cpus
|
||||||
# NOTE: dynamic cpuset changes incompatible with static policy
|
# NOTE: dynamic cpuset changes incompatible with static policy
|
||||||
if ! is_static_cpu_manager_policy; then
|
if ! is_static_cpu_manager_policy; then
|
||||||
update_cgroup_cpuset_k8s_infra_all
|
update_cgroup_cpuset_all ${CGDIR_K8S}
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Wait for all DRBD resources to have started. Affine DRBD tasks
|
# Wait for all DRBD resources to have started. Affine DRBD tasks
|
||||||
@ -460,6 +531,12 @@ function start {
|
|||||||
done
|
done
|
||||||
affine_drbd_tasks ${NONISOL_CPUS}
|
affine_drbd_tasks ${NONISOL_CPUS}
|
||||||
|
|
||||||
|
# Update docker cpuset so it floats on non-isolated cpus.
|
||||||
|
# The docker cgroup is not always created, so don't wait for it.
|
||||||
|
# Branch on the function's exit status directly; a trailing '-eq 0'
# would only be passed to the function as ignored arguments.
if is_docker_platform_ready; then
    update_cgroup_cpuset_all "${CGDIR_DOCKER}"
fi
|
||||||
|
|
||||||
# Wait until core K8s pods have recovered and nova-compute is running
|
# Wait until core K8s pods have recovered and nova-compute is running
|
||||||
t0=${SECONDS}
|
t0=${SECONDS}
|
||||||
until is_k8s_platform_steady_state_ready; do
|
until is_k8s_platform_steady_state_ready; do
|
||||||
@ -472,9 +549,19 @@ function start {
|
|||||||
sleep ${CHECK_INTERVAL_SECONDS}
|
sleep ${CHECK_INTERVAL_SECONDS}
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Update docker cpuset to platform cores
|
||||||
|
# The docker cgroup is not always created, so don't wait for it.
|
||||||
|
# Branch on the function's exit status directly; a trailing '-eq 0'
# would only be passed to the function as ignored arguments.
if is_docker_platform_ready; then
    update_cgroup_cpuset_platform "${CGDIR_DOCKER}"
else
    LOG "Warning: ${CGDIR_DOCKER} not ready."
fi
|
||||||
|
|
||||||
# Update K8S cpuset to platform cores
|
# Update K8S cpuset to platform cores
|
||||||
if ! is_static_cpu_manager_policy; then
|
if ! is_static_cpu_manager_policy; then
|
||||||
update_cgroup_cpuset_k8s_infra_platform
|
if is_openstack_compute; then
|
||||||
|
update_cgroup_cpuset_platform ${CGDIR_K8S}
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Affine all floating tasks back to platform cores
|
# Affine all floating tasks back to platform cores
|
||||||
|
Loading…
Reference in New Issue
Block a user