
In rare situations, we experience an RCU preempt stall kernel event. The system RCU stall is caused by kube-apiserver holding a lock on an rt_mutex that other tasks in the system are waiting to acquire. During the hold, the kube-apiserver was also being preempted for significant periods of time. This results in service interruption to live traffic on affected nodes due to application crash, and requires a power-cycle to recover the server. This updates the kubeadm resource constants for kube-apiserver so that cpu limit is 4. This dramatically reduces the likelihood of an RCU stall event. Changing this parameter does not address the root cause for the stall. This change does not affect the overall behaviour of apiserver, though its latency is improved. Test Plan: PASS: build-pkgs, build-image PASS: Install fresh ISO with K8S 1.29.2, 1.30.6, 1.31.5, 1.32.2. Verify kube-apiserver has cpu limit 4, e.g., on controller, verify we get the resource setting: sudo grep -A4 -e resources: /etc/kubernetes/manifests/kube-apiserver.yaml kubectl describe pod -n kube-system kube-apiserver-controller-0 | \ grep -A1 -e Limits: -e Requests: PASS: Manual run of following make tests make clean && make make test WHAT=./pkg/kubelet/cm GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm/cpumanager GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm/cpumanager/state GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm/cpumanager/topology GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm/topologymanager GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm/devicemanager GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm/memorymanager GOFLAGS="-v" make test WHAT=./pkg/kubelet/kuberuntime GOFLAGS="-v" make test WHAT=./cmd/kubeadm/app/constants GOFLAGS="-v" make test WHAT=./cmd/kubeadm/app/phases/controlplane GOFLAGS="-v" make test WHAT=./pkg/kubelet/cm GOFLAGS="-v" make test WHAT=./cmd/kubeadm/app/phases/addons/dns/ GOFLAGS="-v" make test-cmd WHAT=kubeadm GOFLAGS="-v" Closes-bug: 2110257 Change-Id: Ied070e7bd3172ab95e58a76885b83db1317d626b Signed-off-by: 
Jim Gauld <James.Gauld@windriver.com>
171 lines
8.7 KiB
Diff
171 lines
8.7 KiB
Diff
From 668dc57e7c06da9b29dd677648fdb198901332a1 Mon Sep 17 00:00:00 2001
|
|
From: Boovan Rajendran <boovan.rajendran@windriver.com>
|
|
Date: Mon, 26 Feb 2024 04:40:48 -0500
|
|
Subject: [PATCH] kubeadm: create platform pods with zero CPU resources
|
|
|
|
This specifies zero CPU resources when creating the manifests
|
|
for the static platform pods, as a workaround for the lack of
|
|
separate resource tracking for platform resources.
|
|
|
|
This specifies zero CPU and Memory resources for the coredns
|
|
deployment. manifests.go is the main source file for this,
|
|
not sure if the coredns.yaml are used but they are updated to
|
|
be consistent.
|
|
|
|
This specifies CPU limit of 4 for kube-apiserver pod so that it is
|
|
treated as a burstable QoS. This gives a boost of cgroup CPUShares
|
|
since the burstable cgroup parent has significantly more CPUShares
|
|
than best-effort on typical systems. This improves kube-apiserver
|
|
API responsiveness.
|
|
|
|
This increases kube-apiserver Readiness probe periodSeconds to 10
|
|
based on WRS/SS joint recommendation for minimum probe settings.
|
|
This reduces likelihood of kube-apiserver probe failure and
|
|
subsequent pod-restart under severe load. This also reduces CPU
|
|
demand.
|
|
|
|
Signed-off-by: Daniel Safta <daniel.safta@windriver.com>
|
|
Signed-off-by: Saba Touheed Mujawar <sabatouheed.mujawar@windriver.com>
|
|
Signed-off-by: Boovan Rajendran <boovan.rajendran@windriver.com>
|
|
Signed-off-by: Jim Gauld <James.Gauld@windriver.com>
|
|
---
|
|
cluster/addons/dns/coredns/coredns.yaml.base | 4 ++--
|
|
cluster/addons/dns/coredns/coredns.yaml.in | 4 ++--
|
|
cluster/addons/dns/coredns/coredns.yaml.sed | 4 ++--
|
|
cmd/kubeadm/app/phases/addons/dns/manifests.go | 4 ++--
|
|
.../app/phases/controlplane/manifests.go | 8 +++++---
|
|
cmd/kubeadm/app/util/staticpod/utils.go | 17 ++++++++++++++++-
|
|
6 files changed, 29 insertions(+), 12 deletions(-)
|
|
|
|
diff --git a/cluster/addons/dns/coredns/coredns.yaml.base b/cluster/addons/dns/coredns/coredns.yaml.base
|
|
index dd4570adb65..3a0fd7adb72 100644
|
|
--- a/cluster/addons/dns/coredns/coredns.yaml.base
|
|
+++ b/cluster/addons/dns/coredns/coredns.yaml.base
|
|
@@ -139,8 +139,8 @@ spec:
|
|
limits:
|
|
memory: __DNS__MEMORY__LIMIT__
|
|
requests:
|
|
- cpu: 100m
|
|
- memory: 70Mi
|
|
+ cpu: 0
|
|
+ memory: 0
|
|
args: [ "-conf", "/etc/coredns/Corefile" ]
|
|
volumeMounts:
|
|
- name: config-volume
|
|
diff --git a/cluster/addons/dns/coredns/coredns.yaml.in b/cluster/addons/dns/coredns/coredns.yaml.in
|
|
index 6939faec3f9..74b59584bc7 100644
|
|
--- a/cluster/addons/dns/coredns/coredns.yaml.in
|
|
+++ b/cluster/addons/dns/coredns/coredns.yaml.in
|
|
@@ -139,8 +139,8 @@ spec:
|
|
limits:
|
|
memory: 'dns_memory_limit'
|
|
requests:
|
|
- cpu: 100m
|
|
- memory: 70Mi
|
|
+ cpu: 0
|
|
+ memory: 0
|
|
args: [ "-conf", "/etc/coredns/Corefile" ]
|
|
volumeMounts:
|
|
- name: config-volume
|
|
diff --git a/cluster/addons/dns/coredns/coredns.yaml.sed b/cluster/addons/dns/coredns/coredns.yaml.sed
|
|
index a90f2b7674a..61afbecd9da 100644
|
|
--- a/cluster/addons/dns/coredns/coredns.yaml.sed
|
|
+++ b/cluster/addons/dns/coredns/coredns.yaml.sed
|
|
@@ -139,8 +139,8 @@ spec:
|
|
limits:
|
|
memory: $DNS_MEMORY_LIMIT
|
|
requests:
|
|
- cpu: 100m
|
|
- memory: 70Mi
|
|
+ cpu: 0
|
|
+ memory: 0
|
|
args: [ "-conf", "/etc/coredns/Corefile" ]
|
|
volumeMounts:
|
|
- name: config-volume
|
|
diff --git a/cmd/kubeadm/app/phases/addons/dns/manifests.go b/cmd/kubeadm/app/phases/addons/dns/manifests.go
|
|
index 905a2e050e6..2a2212d5d37 100644
|
|
--- a/cmd/kubeadm/app/phases/addons/dns/manifests.go
|
|
+++ b/cmd/kubeadm/app/phases/addons/dns/manifests.go
|
|
@@ -104,8 +104,8 @@ spec:
|
|
limits:
|
|
memory: 170Mi
|
|
requests:
|
|
- cpu: 100m
|
|
- memory: 70Mi
|
|
+ cpu: 0
|
|
+ memory: 0
|
|
args: [ "-conf", "/etc/coredns/Corefile" ]
|
|
volumeMounts:
|
|
- name: config-volume
|
|
diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go
|
|
index 998ca2e3456..7988f1fe918 100644
|
|
--- a/cmd/kubeadm/app/phases/controlplane/manifests.go
|
|
+++ b/cmd/kubeadm/app/phases/controlplane/manifests.go
|
|
@@ -66,7 +66,9 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
|
|
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", endpoint.BindPort, v1.URISchemeHTTPS),
|
|
ReadinessProbe: staticpodutil.ReadinessProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/readyz", endpoint.BindPort, v1.URISchemeHTTPS),
|
|
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetAPIServerProbeAddress(endpoint), "/livez", endpoint.BindPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
|
|
- Resources: staticpodutil.ComponentResources("250m"),
|
|
+ // WRS: Increase kube-apiserver cgroup CPUShares to improve API responsiveness;
|
|
+ // achieved by setting CPU Limits to make it burstable QoS.
|
|
+ Resources: staticpodutil.ComponentLimitResources("0", "4"),
|
|
Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.APIServer.ExtraEnvs),
|
|
}, mounts.GetVolumes(kubeadmconstants.KubeAPIServer),
|
|
map[string]string{kubeadmconstants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey: endpoint.String()}),
|
|
@@ -78,7 +80,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
|
|
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeControllerManager)),
|
|
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS),
|
|
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetControllerManagerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeControllerManagerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
|
|
- Resources: staticpodutil.ComponentResources("200m"),
|
|
+ Resources: staticpodutil.ComponentResources("0"),
|
|
Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.ControllerManager.ExtraEnvs),
|
|
}, mounts.GetVolumes(kubeadmconstants.KubeControllerManager), nil),
|
|
kubeadmconstants.KubeScheduler: staticpodutil.ComponentPod(v1.Container{
|
|
@@ -89,7 +91,7 @@ func GetStaticPodSpecs(cfg *kubeadmapi.ClusterConfiguration, endpoint *kubeadmap
|
|
VolumeMounts: staticpodutil.VolumeMountMapToSlice(mounts.GetVolumeMounts(kubeadmconstants.KubeScheduler)),
|
|
LivenessProbe: staticpodutil.LivenessProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS),
|
|
StartupProbe: staticpodutil.StartupProbe(staticpodutil.GetSchedulerProbeAddress(cfg), "/healthz", kubeadmconstants.KubeSchedulerPort, v1.URISchemeHTTPS, cfg.APIServer.TimeoutForControlPlane),
|
|
- Resources: staticpodutil.ComponentResources("100m"),
|
|
+ Resources: staticpodutil.ComponentResources("0"),
|
|
Env: kubeadmutil.MergeKubeadmEnvVars(proxyEnvs, cfg.Scheduler.ExtraEnvs),
|
|
}, mounts.GetVolumes(kubeadmconstants.KubeScheduler), nil),
|
|
}
|
|
diff --git a/cmd/kubeadm/app/util/staticpod/utils.go b/cmd/kubeadm/app/util/staticpod/utils.go
|
|
index ea2b13f4b16..6f9afebf348 100644
|
|
--- a/cmd/kubeadm/app/util/staticpod/utils.go
|
|
+++ b/cmd/kubeadm/app/util/staticpod/utils.go
|
|
@@ -99,6 +99,18 @@ func ComponentResources(cpu string) v1.ResourceRequirements {
|
|
}
|
|
}
|
|
|
|
+// ComponentLimitResources returns the v1.ResourceRequirements object needed for allocating a specified amount of the CPU with Limits
|
|
+func ComponentLimitResources(cpu string, lcpu string) v1.ResourceRequirements {
|
|
+ return v1.ResourceRequirements{
|
|
+ Requests: v1.ResourceList{
|
|
+ v1.ResourceCPU: resource.MustParse(cpu),
|
|
+ },
|
|
+ Limits: v1.ResourceList{
|
|
+ v1.ResourceCPU: resource.MustParse(lcpu),
|
|
+ },
|
|
+ }
|
|
+}
|
|
+
|
|
// NewVolume creates a v1.Volume with a hostPath mount to the specified location
|
|
func NewVolume(name, path string, pathType *v1.HostPathType) v1.Volume {
|
|
return v1.Volume{
|
|
@@ -255,7 +267,10 @@ func LivenessProbe(host, path string, port int32, scheme v1.URIScheme) *v1.Probe
|
|
func ReadinessProbe(host, path string, port int32, scheme v1.URIScheme) *v1.Probe {
|
|
// sets initialDelaySeconds as '0' because we don't want to delay user infrastructure checks
|
|
// looking for "ready" status on kubeadm static Pods
|
|
- return createHTTPProbe(host, path, port, scheme, 0, 15, 3, 1)
|
|
+ // WRS/SS joint recommendation: All pods probes should have following minimum probe
|
|
+ // settings unless required by the service (initialDelaySeconds 0, periodSeconds 10,
|
|
+ // timeoutSeconds 5, successThreshold 1, failureThreshold 3)
|
|
+ return createHTTPProbe(host, path, port, scheme, 0, 15, 3, 10)
|
|
}
|
|
|
|
// StartupProbe creates a Probe object with a HTTPGet handler
|
|
--
|
|
2.25.1
|
|
|